import numpy as np
import pyopencl as cl
import pyopencl.array as cla

ctx = cl.create_some_context(answers=["nvi", 0])
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

print(prg.binaries[0].decode())

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35042711
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_70, texmode_independent
.address_size 64

	// .globl	sum

.entry sum(
	.param .u64 .ptr .global .align 4 sum_param_0,
	.param .u64 .ptr .global .align 4 sum_param_1,
	.param .u64 .ptr .global .align 4 sum_param_2
)
{
	.reg .f32 	%f<4>;
	.reg .b32 	%r<7>;
	.reg .b64 	%rd<8>;


	ld.param.u64 	%rd1, [sum_param_0];
	ld.param.u64 	%rd2, [sum_param_1];
	ld.param.u64 	%rd3, [sum_param_2];
	mov.b32 	%r1, %envreg3;
	mov.u32 	%r2, %ntid.x;
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %tid.x;
	add.s32 	%r5, %r4, %r1;
	mad.lo.s32 	%r6, %r2, %r3, %r5;
	mul.wide.s32 	%rd4, %r6, 4;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.f32 	%f1, [%rd5];
	add.s64 	%rd6, %rd2, %rd4;
	ld.global.f32 	%f2, [%rd6];
	add.f32 	%f3, %f1, %f2;
	add.s64 	%rd7, %rd3, %rd4;
	st.global.f32 	[%rd7], %f3;
	ret;

}

!mkdir -p tmp

binary = prg.binaries[0]

with open("tmp/binary.ptx", "wb") as outf:
    outf.write(hacked_binary)
    
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'sum' for 'sm_70'
ptxas info    : Function properties for sum
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 376 bytes cmem[0]

!cuobjdump --dump-sass tmp/binary.o

	code for sm_70
		Function : sum
	.headerflags	@"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
                                                                          /* 0x000fc40000000f00 */
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;          /* 0x000000fffffff389 */
                                                                          /* 0x000fe200000e00ff */
        /*0020*/                   S2R R0, SR_TID.X ;                     /* 0x0000000000007919 */
                                                                          /* 0x000e220000002100 */
        /*0030*/                   MOV R7, 0x4 ;                          /* 0x0000000400077802 */
                                                                          /* 0x000fc60000000f00 */
        /*0040*/                   S2R R3, SR_CTAID.X ;                   /* 0x0000000000037919 */
                                                                          /* 0x000e620000002500 */
        /*0050*/                   IADD3 R0, R0, c[0x0][0x94], RZ ;       /* 0x0000250000007a10 */
                                                                          /* 0x001fca0007ffe0ff */
        /*0060*/                   IMAD R0, R3, c[0x0][0x0], R0 ;         /* 0x0000000003007a24 */
                                                                          /* 0x002fc800078e0200 */
        /*0070*/                   IMAD.WIDE R2, R0, R7, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                          /* 0x000fc800078e0207 */
        /*0080*/                   IMAD.WIDE R4, R0, R7, c[0x0][0x168] ;  /* 0x00005a0000047625 */
                                                                          /* 0x000fc800078e0207 */
        /*0090*/                   LDG.E.SYS R2, [R2] ;                   /* 0x0000000002027381 */
                                                                          /* 0x000ea800001ee900 */
        /*00a0*/                   LDG.E.SYS R5, [R4] ;                   /* 0x0000000004057381 */
                                                                          /* 0x000ea200001ee900 */
        /*00b0*/                   IMAD.WIDE R6, R0, R7, c[0x0][0x170] ;  /* 0x00005c0000067625 */
                                                                          /* 0x000fc800078e0207 */
        /*00c0*/                   FADD R9, R2, R5 ;                      /* 0x0000000502097221 */
                                                                          /* 0x004fd00000000000 */
        /*00d0*/                   STG.E.SYS [R6], R9 ;                   /* 0x0000000906007386 */
                                                                          /* 0x000fe2000010e900 */
        /*00e0*/                   EXIT ;                                 /* 0x000000000000794d */
                                                                          /* 0x000fea0003800000 */
        /*00f0*/                   BRA 0xf0;                              /* 0xfffffff000007947 */
                                                                          /* 0x000fc0000383ffff */
		..........

prg = cl.Program(ctx, """
__kernel void sum(
    __global float *a_g, int n)
{
    int gid = get_global_id(0);
    
    // try dividing by n
    
    int row = gid / 117;
    int col = gid % 117;
    
    a_g[row * 128 + col] *= 2;
    // a_g[gid] *= 2;
    
}
""").build()

binary = prg.binaries[0]

with open("tmp/binary.ptx", "wb") as outf:
    outf.write(hacked_binary)
    
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)
!cuobjdump --dump-sass tmp/binary.o | cut -c -80

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'sum' for 'sm_70'
ptxas info    : Function properties for sum
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 376 bytes cmem[0]

	code for sm_70
		Function : sum
	.headerflags	@"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 E
        /*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x0
                                                                          /* 0x0
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;          /* 0x0
                                                                          /* 0x0
        /*0020*/                   S2R R0, SR_TID.X ;                     /* 0x0
                                                                          /* 0x0
        /*0030*/                   MOV R7, 0x4 ;                          /* 0x0
                                                                          /* 0x0
        /*0040*/                   S2R R3, SR_CTAID.X ;                   /* 0x0
                                                                          /* 0x0
        /*0050*/                   IADD3 R0, R0, c[0x0][0x94], RZ ;       /* 0x0
                                                                          /* 0x0
        /*0060*/                   IMAD R0, R3, c[0x0][0x0], R0 ;         /* 0x0
                                                                          /* 0x0
        /*0070*/                   IMAD.WIDE R2, R0, R7, c[0x0][0x160] ;  /* 0x0
                                                                          /* 0x0
        /*0080*/                   IMAD.WIDE R4, R0, R7, c[0x0][0x168] ;  /* 0x0
                                                                          /* 0x0
        /*0090*/                   LDG.E.SYS R2, [R2] ;                   /* 0x0
                                                                          /* 0x0
        /*00a0*/                   LDG.E.SYS R5, [R4] ;                   /* 0x0
                                                                          /* 0x0
        /*00b0*/                   IMAD.WIDE R6, R0, R7, c[0x0][0x170] ;  /* 0x0
                                                                          /* 0x0
        /*00c0*/                   FADD R9, R2, R5 ;                      /* 0x0
                                                                          /* 0x0
        /*00d0*/                   STG.E.SYS [R6], R9 ;                   /* 0x0
                                                                          /* 0x0
        /*00e0*/                   EXIT ;                                 /* 0x0
                                                                          /* 0x0
        /*00f0*/                   BRA 0xf0;                              /* 0xf
                                                                          /* 0x0
		..........

prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g, int n)
{
    int gsize = get_global_size(0);
    
    for (int i = get_global_id(0); i < n; i += gsize)
      res_g[i] = a_g[i] + b_g[i];
      
    res_g[get_global_id(0)] = 15;
}
""").build()

binary = prg.binaries[0]
#print(binary.decode())
with open("tmp/binary.ptx", "wb") as outf:
    outf.write(binary)
    
!(cd tmp; ptxas --gpu-name sm_70 --verbose binary.ptx -o binary.o)
!cuobjdump --dump-sass tmp/binary.o | cut -c -80

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'sum' for 'sm_70'
ptxas info    : Function properties for sum
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 28 registers, 380 bytes cmem[0]

	code for sm_70
		Function : sum
	.headerflags	@"EF_CUDA_TEXMODE_INDEPENDENT EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 E
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;      
                                                                                
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;                
                                                                                
        /*0020*/                   S2R R0, SR_TID.X ;                           
                                                                                
        /*0030*/                   BMOV.32.CLEAR RZ, B0 ;                       
                                                                                
        /*0040*/                   BSSY B0, 0x670 ;                             
                                                                                
        /*0050*/                   S2R R3, SR_CTAID.X ;                         
                                                                                
        /*0060*/                   IADD3 R0, R0, c[0x0][0x94], RZ ;             
                                                                                
        /*0070*/                   IMAD R0, R3, c[0x0][0x0], R0 ;               
                                                                                
        /*0080*/                   ISETP.GE.AND P0, PT, R0, c[0x0][0x178], PT ; 
                                                                                
        /*0090*/                   SHF.R.S32.HI R3, RZ, 0x1f, R0 ;              
                                                                                
        /*00a0*/               @P0 BRA 0x660 ;                                  
                                                                                
        /*00b0*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x0] ;       
                                                                                
        /*00c0*/                   MOV R6, RZ ;                                 
                                                                                
        /*00d0*/                   IMAD.MOV.U32 R5, RZ, RZ, R0 ;                
                                                                                
        /*00e0*/                   BMOV.32.CLEAR RZ, B1 ;                       
                                                                                
        /*00f0*/                   IMAD R2, R2, c[0x0][0xa0], RZ ;              
                                                                                
        /*0100*/                   BSSY B1, 0x440 ;                             
                                                                                
        /*0110*/                   LOP3.LUT R4, RZ, R5, RZ, 0x33, !PT ;         
                                                                                
        /*0120*/                   IADD3 R9, RZ, -R2, RZ ;                      
                                                                                
        /*0130*/                   I2F.U32.RP R8, R2 ;                          
                                                                                
        /*0140*/                   ISETP.NE.U32.AND P2, PT, R2, RZ, PT ;        
                                                                                
        /*0150*/                   MUFU.RCP R8, R8 ;                            
                                                                                
        /*0160*/                   IADD3 R10, R8, 0xffffffe, RZ ;               
                                                                                
        /*0170*/                   F2I.FTZ.U32.TRUNC.NTZ R7, R10 ;              
                                                                                
        /*0180*/                   IMAD R9, R9, R7, RZ ;                        
                                                                                
        /*0190*/                   IMAD.WIDE.U32 R6, R7, R9, R6 ;               
                                                                                
        /*01a0*/                   IADD3 R9, R4, c[0x0][0x178], RZ ;            
                                                                                
        /*01b0*/                   IMAD.WIDE.U32 R6, R7, R9, RZ ;               
                                                                                
        /*01c0*/                   IMAD.MOV R4, RZ, RZ, -R7 ;                   
                                                                                
        /*01d0*/                   IMAD R9, R2, R4, R9 ;                        
                                                                                
        /*01e0*/                   ISETP.GE.U32.AND P0, PT, R9, R2, PT ;        
                                                                                
        /*01f0*/               @P0 IADD3 R9, -R2, R9, RZ ;                      
                                                                                
        /*0200*/               @P0 IADD3 R7, R7, 0x1, RZ ;                      
                                                                                
        /*0210*/                   ISETP.GE.U32.AND P1, PT, R9, R2, PT ;        
                                                                                
        /*0220*/               @P1 IADD3 R7, R7, 0x1, RZ ;                      
                                                                                
        /*0230*/              @!P2 LOP3.LUT R7, RZ, R2, RZ, 0x33, !PT ;         
                                                                                
        /*0240*/                   IADD3 R4, R7.reuse, 0x1, RZ ;                
                                                                                
        /*0250*/                   ISETP.GE.U32.AND P0, PT, R7, 0x3, PT ;       
                                                                                
        /*0260*/                   LOP3.LUT P1, R4, R4, 0x3, RZ, 0xc0, !PT ;    
                                                                                
        /*0270*/              @!P1 BRA 0x430 ;                                  
                                                                                
        /*0280*/                   IMAD.SHL.U32 R8, R0.reuse, 0x4, RZ ;         
                                                                                
        /*0290*/                   SHF.L.U64.HI R9, R0, 0x2, R3 ;               
                                                                                
        /*02a0*/                   IADD3 R18, P1, R8.reuse, c[0x0][0x170], RZ ; 
                                                                                
        /*02b0*/                   IADD3 R6, P2, R8.reuse, c[0x0][0x168], RZ ;  
                                                                                
        /*02c0*/                   IADD3 R8, P3, R8, c[0x0][0x160], RZ ;        
                                                                                
        /*02d0*/                   IADD3.X R21, R9.reuse, c[0x0][0x174], RZ, P1,
                                                                                
        /*02e0*/                   IADD3.X R7, R9.reuse, c[0x0][0x16c], RZ, P2, 
                                                                                
        /*02f0*/                   IADD3.X R9, R9, c[0x0][0x164], RZ, P3, !PT ; 
                                                                                
        /*0300*/                   LDG.E.SYS R10, [R6] ;                        
                                                                                
        /*0310*/                   LDG.E.SYS R11, [R8] ;                        
                                                                                
        /*0320*/                   MOV R16, R18 ;                               
                                                                                
        /*0330*/                   IMAD.MOV.U32 R17, RZ, RZ, R21 ;              
                                                                                
        /*0340*/                   IADD3 R4, R4, -0x1, RZ ;                     
                                                                                
        /*0350*/                   IMAD.WIDE R12, R2.reuse, 0x4, R6 ;           
                                                                                
        /*0360*/                   IADD3 R5, R2, R5, RZ ;                       
                                                                                
        /*0370*/                   ISETP.NE.AND P1, PT, R4, RZ, PT ;            
                                                                                
        /*0380*/                   IMAD.WIDE R14, R2, 0x4, R8 ;                 
                                                                                
        /*0390*/                   MOV R6, R12 ;                                
                                                                                
        /*03a0*/                   IMAD.MOV.U32 R7, RZ, RZ, R13 ;               
                                                                                
        /*03b0*/                   MOV R8, R14 ;                                
                                                                                
        /*03c0*/                   IMAD.MOV.U32 R9, RZ, RZ, R15 ;               
                                                                                
        /*03d0*/                   FADD R19, R10, R11 ;                         
                                                                                
        /*03e0*/                   IMAD.WIDE R10, R2, 0x4, R16 ;                
                                                                                
        /*03f0*/                   IMAD.MOV.U32 R21, RZ, RZ, R11 ;              
                                                                                
        /*0400*/                   MOV R18, R10 ;                               
                                                                                
        /*0410*/                   STG.E.SYS [R16], R19 ;                       
                                                                                
        /*0420*/               @P1 BRA 0x300 ;                                  
                                                                                
        /*0430*/                   BSYNC B1 ;                                   
                                                                                
        /*0440*/              @!P0 BRA 0x660 ;                                  
                                                                                
        /*0450*/                   IMAD.MOV.U32 R10, RZ, RZ, 0x4 ;              
                                                                                
        /*0460*/                   IMAD.WIDE R8, R5, R10, c[0x0][0x168] ;       
                                                                                
        /*0470*/                   IMAD.WIDE R6, R5, R10, c[0x0][0x160] ;       
                                                                                
        /*0480*/                   LDG.E.SYS R4, [R8] ;                         
                                                                                
        /*0490*/                   LDG.E.SYS R13, [R6] ;                        
                                                                                
        /*04a0*/                   IMAD.WIDE R10, R5, R10, c[0x0][0x170] ;      
                                                                                
        /*04b0*/                   IMAD.WIDE R14, R2, 0x4, R8 ;                 
                                                                                
        /*04c0*/                   FADD R21, R4, R13 ;                          
                                                                                
        /*04d0*/                   IMAD.WIDE R12, R2, 0x4, R6 ;                 
                                                                                
        /*04e0*/                   STG.E.SYS [R10], R21 ;                       
                                                                                
        /*04f0*/                   LDG.E.SYS R4, [R14] ;                        
                                                                                
        /*0500*/                   LDG.E.SYS R19, [R12] ;                       
                                                                                
        /*0510*/                   IMAD.WIDE R16, R2, 0x4, R10 ;                
                                                                                
        /*0520*/                   IMAD.WIDE R8, R2, 0x4, R14 ;                 
                                                                                
        /*0530*/                   IMAD.WIDE R6, R2, 0x4, R12 ;                 
                                                                                
        /*0540*/                   FADD R23, R4, R19 ;                          
                                                                                
        /*0550*/                   STG.E.SYS [R16], R23 ;                       
                                                                                
        /*0560*/                   LDG.E.SYS R4, [R8] ;                         
                                                                                
        /*0570*/                   LDG.E.SYS R25, [R6] ;                        
                                                                                
        /*0580*/                   IMAD.WIDE R18, R2, 0x4, R16 ;                
                                                                                
        /*0590*/                   IMAD.WIDE R14, R2, 0x4, R8 ;                 
                                                                                
        /*05a0*/                   IMAD.WIDE R10, R2, 0x4, R6 ;                 
                                                                                
        /*05b0*/                   FADD R25, R4, R25 ;                          
                                                                                
        /*05c0*/                   STG.E.SYS [R18], R25 ;                       
                                                                                
        /*05d0*/                   LDG.E.SYS R14, [R14] ;                       
                                                                                
        /*05e0*/                   LDG.E.SYS R11, [R10] ;                       
                                                                                
        /*05f0*/                   IMAD.WIDE R12, R2.reuse, 0x4, R18 ;          
                                                                                
        /*0600*/                   IADD3 R5, R2, R5, R2 ;                       
                                                                                
        /*0610*/                   IADD3 R5, R2, R5, R2 ;                       
                                                                                
        /*0620*/                   ISETP.GE.AND P0, PT, R5, c[0x0][0x178], PT ; 
                                                                                
        /*0630*/                   FADD R17, R14, R11 ;                         
                                                                                
        /*0640*/                   STG.E.SYS [R12], R17 ;                       
                                                                                
        /*0650*/              @!P0 BRA 0x450 ;                                  
                                                                                
        /*0660*/                   BSYNC B0 ;                                   
                                                                                
        /*0670*/                   LEA R2, P0, R0.reuse, c[0x0][0x170], 0x2 ;   
                                                                                
        /*0680*/                   MOV R5, 0x41700000 ;                         
                                                                                
        /*0690*/                   LEA.HI.X R3, R0, c[0x0][0x174], R3, 0x2, P0 ;
                                                                                
        /*06a0*/                   STG.E.SYS [R2], R5 ;                         
                                                                                
        /*06b0*/                   EXIT ;                                       
                                                                                
        /*06c0*/                   BRA 0x6c0;                                   
                                                                                
        /*06d0*/                   NOP;                                         
                                                                                
        /*06e0*/                   NOP;                                         
                                                                                
        /*06f0*/                   NOP;                                         
                                                                                
		..........

%%writefile tmp/vector-add.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
 
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;
 
    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}
 
int main( int argc, char* argv[] )
{
    // Size of vectors
    int n = 100000;
 
    double *h_a;
    double *h_b;
    double *h_c;
 
    double *d_a;
    double *d_b;
    double *d_c;
 
    size_t bytes = n*sizeof(double);
 
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);
 
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);
 
    int i;
    for( i = 0; i < n; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }
 
    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
 
    int blockSize, gridSize;
 
    blockSize = 1024;
 
    gridSize = (int)ceil((float)n/blockSize);
 
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
 
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
 
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);
 
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
 
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}

Writing tmp/vector-add.cu

!(cd tmp; nvcc -c vector-add.cu)
!cuobjdump --dump-sass tmp/vector-add.o

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_52
		Function : _Z6vecAddPdS_S_i
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
                                                                                 /* 0x001cfc00e22007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;                        /* 0x4c98078000870001 */
        /*0010*/                   S2R R0, SR_CTAID.X ;                          /* 0xf0c8000002570000 */
        /*0018*/                   S2R R2, SR_TID.X ;                            /* 0xf0c8000002170002 */
                                                                                 /* 0x001fd842fec20ff1 */
        /*0028*/                   XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ;  /* 0x4f107f8000270003 */
        /*0030*/                   XMAD R2, R0.reuse, c[0x0] [0x8], R2 ;         /* 0x4e00010000270002 */
        /*0038*/                   XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ;          /* 0x5b30011800370000 */
                                                                                 /* 0x001ff400fd4007ed */
        /*0048*/                   ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT ;  /* 0x4b6d038005670007 */
        /*0050*/                   NOP ;                                         /* 0x50b0000000070f00 */
        /*0058*/               @P0 EXIT ;                                        /* 0xe30000000000000f */
                                                                                 /* 0x081fd800fea207f1 */
        /*0068*/                   SHL R8, R0.reuse, 0x3 ;                       /* 0x3848000000370008 */
        /*0070*/                   SHR R0, R0, 0x1d ;                            /* 0x3829000001d70000 */
        /*0078*/                   IADD R4.CC, R8.reuse, c[0x0][0x140] ;         /* 0x4c10800005070804 */
                                                                                 /* 0x001fd800fe0207f2 */
        /*0088*/                   IADD.X R5, R0.reuse, c[0x0][0x144] ;          /* 0x4c10080005170005 */
        /*0090*/                   IADD R6.CC, R8, c[0x0][0x148] ;               /* 0x4c10800005270806 */
        /*0098*/                   LDG.E.64 R4, [R4] ;                           /* 0xeed5200000070404 */
                                                                                 /* 0x001fd800f62007e2 */
        /*00a8*/                   IADD.X R7, R0, c[0x0][0x14c] ;                /* 0x4c10080005370007 */
        /*00b0*/                   LDG.E.64 R2, [R6] ;                           /* 0xeed5200000070602 */
        /*00b8*/                   IADD R8.CC, R8, c[0x0][0x150] ;               /* 0x4c10800005470808 */
                                                                                 /* 0x003fc420e28007f7 */
        /*00c8*/                   IADD.X R9, R0, c[0x0][0x154] ;                /* 0x4c10080005570009 */
        /*00d0*/                   DADD R2, R2, R4 ;                             /* 0x5c70000000470202 */
        /*00d8*/                   STG.E.64 [R8], R2 ;                           /* 0xeedd200000070802 */
                                                                                 /* 0x001ffc00ffe007e8 */
        /*00e8*/                   NOP ;                                         /* 0x50b0000000070f00 */
        /*00f0*/                   EXIT ;                                        /* 0xe30000000007000f */
        /*00f8*/                   BRA 0xf8 ;                                    /* 0xe2400fffff87000f */
		..........


Fatbin ptx code:
================
arch = sm_52
code version = [8,2]
host = linux
compile_size = 64bit
compressed

!echo _Z6vecAddPdS_S_i | c++filt

vecAdd(double*, double*, double*, int)

prg = cl.Program(ctx, """
__kernel void getlaneid(__global int *d_ptr, int length)
{
    int elemID = get_global_id(0);
    if (elemID < length)
    {
        unsigned int laneid;
        asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
        d_ptr[elemID] = laneid;
    }
}
""").build()

print(prg.binaries[0].decode())

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35042711
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//

.version 8.2
.target sm_70, texmode_independent
.address_size 64

	// .globl	getlaneid

.entry getlaneid(
	.param .u64 .ptr .global .align 4 getlaneid_param_0,
	.param .u32 getlaneid_param_1
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<9>;
	.reg .b64 	%rd<4>;


	ld.param.u64 	%rd1, [getlaneid_param_0];
	ld.param.u32 	%r2, [getlaneid_param_1];
	mov.b32 	%r3, %envreg3;
	mov.u32 	%r4, %ctaid.x;
	mov.u32 	%r5, %ntid.x;
	mov.u32 	%r6, %tid.x;
	add.s32 	%r7, %r6, %r3;
	mad.lo.s32 	%r1, %r5, %r4, %r7;
	setp.ge.s32 	%p1, %r1, %r2;
	@%p1 bra 	$L__BB0_2;

	// begin inline asm
	mov.u32 %r8, %laneid;
	// end inline asm
	mul.wide.s32 	%rd2, %r1, 4;
	add.s64 	%rd3, %rd1, %rd2;
	st.global.u32 	[%rd3], %r8;

$L__BB0_2:
	ret;

}

a = cla.empty(queue, 5000, np.uint32)
prg.getlaneid(queue, lanes.shape, None, a.data, np.uint32(a.size))

<pyopencl._cl.Event at 0x7fa528c8b360>

a[:500]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,
        2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,
        4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,
        6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,
        8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
       10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,
       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
       14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  1,  2,  3,  4,
        5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,
        7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
       26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
       28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
       13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
       30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,
        2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25], dtype=uint32)

PTX and SASS (Nvidia)¶

C Kernel to Source Code¶

Is Division Expensive?¶

An Example with Control Flow¶

From CUDA¶

Inline PTX¶