652 lines
100 KiB
LLVM
652 lines
100 KiB
LLVM
|
; ModuleID = 'kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc'
|
||
|
source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
|
||
|
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||
|
target triple = "x86_64-unknown-linux-gnu"
|
||
|
|
||
|
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
|
||
|
%struct.dim3 = type { i32, i32, i32 }
|
||
|
%struct.CUstream_st = type opaque
|
||
|
|
||
|
$_ZN4dim3C2Ejjj = comdat any
|
||
|
|
||
|
@.str = private unnamed_addr constant [75 x i8] c"# of blocks = %d, # of threads/block = %d (ensure that device can handle)\0A\00", align 1
|
||
|
@.str.1 = private unnamed_addr constant [21 x i8] c"cudaMalloc recordsD\00", align 1
|
||
|
@.str.2 = private unnamed_addr constant [23 x i8] c"cudaMalloc currKnodeD\00", align 1
|
||
|
@.str.3 = private unnamed_addr constant [20 x i8] c"cudaMalloc offsetD\00", align 1
|
||
|
@.str.4 = private unnamed_addr constant [23 x i8] c"cudaMalloc lastKnodeD\00", align 1
|
||
|
@.str.5 = private unnamed_addr constant [22 x i8] c"cudaMalloc offset_2D\00", align 1
|
||
|
@.str.6 = private unnamed_addr constant [18 x i8] c"cudaMalloc startD\00", align 1
|
||
|
@.str.7 = private unnamed_addr constant [16 x i8] c"cudaMalloc endD\00", align 1
|
||
|
@.str.8 = private unnamed_addr constant [21 x i8] c"cudaMalloc ansDStart\00", align 1
|
||
|
@.str.9 = private unnamed_addr constant [22 x i8] c"cudaMalloc ansDLength\00", align 1
|
||
|
@.str.10 = private unnamed_addr constant [27 x i8] c"cudaMalloc cudaMemcpy memD\00", align 1
|
||
|
@.str.11 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy currKnodeD\00", align 1
|
||
|
@.str.12 = private unnamed_addr constant [30 x i8] c"cudaMalloc cudaMemcpy offsetD\00", align 1
|
||
|
@.str.13 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy lastKnodeD\00", align 1
|
||
|
@.str.14 = private unnamed_addr constant [32 x i8] c"cudaMalloc cudaMemcpy offset_2D\00", align 1
|
||
|
@.str.15 = private unnamed_addr constant [18 x i8] c"cudaMemcpy startD\00", align 1
|
||
|
@.str.16 = private unnamed_addr constant [16 x i8] c"cudaMemcpy endD\00", align 1
|
||
|
@.str.17 = private unnamed_addr constant [21 x i8] c"cudaMemcpy ansDStart\00", align 1
|
||
|
@.str.18 = private unnamed_addr constant [22 x i8] c"cudaMemcpy ansDLength\00", align 1
|
||
|
@.str.19 = private unnamed_addr constant [11 x i8] c"findRangeK\00", align 1
|
||
|
@.str.20 = private unnamed_addr constant [52 x i8] c"Time spent in different stages of GPU_CUDA KERNEL:\0A\00", align 1
|
||
|
@.str.21 = private unnamed_addr constant [54 x i8] c"%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\0A\00", align 1
|
||
|
@.str.22 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: ALO\0A\00", align 1
|
||
|
@.str.23 = private unnamed_addr constant [41 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY IN\0A\00", align 1
|
||
|
@.str.24 = private unnamed_addr constant [36 x i8] c"%15.12f s, %15.12f % : GPU: KERNEL\0A\00", align 1
|
||
|
@.str.25 = private unnamed_addr constant [42 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY OUT\0A\00", align 1
|
||
|
@.str.26 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: FRE\0A\00", align 1
|
||
|
@.str.27 = private unnamed_addr constant [13 x i8] c"Total time:\0A\00", align 1
|
||
|
@.str.28 = private unnamed_addr constant [9 x i8] c"%.12f s\0A\00", align 1
|
||
|
@0 = private constant [26033 x i8] c"P\EDU\BA\01\00\10\00\A0e\00\00\00\00\00\00\02\00\01\01@\00\00\00\C8V\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00 V\00\00\00\00\00\00\E0S\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00.nv.constant0.findRangeK\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00findRangeK\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00threadIdx\00blockIdx\00.nv.constant0.findRangeK\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00=\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00x\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\83\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\8D\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\96\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\00O\00\00\00\00\00\00\04/\08\00\06\00\00\00\1C\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00h\00\00\00\04\11\08\00\06\00\00\00h\00\00\00\010\00\00\01*\00\00\04\0A\08\00\05\00\00\00@\01X\00\03\19X\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\B8\0A\00\00\04\1C\04\00\D0N\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03<d\00\01\00\87\00\80\07\98L\01\01\87\F9\FF\FF\0F\1C\00\00w\03\00\00\C8\F0\EF\1F\E0\FD\03\BC\7F\00\07\01\07\00\80\03l[\0F\00\80\00\00\00@\E2\C0\00\10\00\00\00\A0\E3\EF\1F\E0!\03\BC\7F\00\00\01\F7\0F\00\00\10\\\00\0A\07\00\00\00\E0\\\02\00\07\00\80\07\98\\\EF\1F\E0\FD\03\BC\7F\00\03\00\F7\0F\80\07\98\\\00\00'\00\80\07\98\\\03\007\00\80\07\98\\\EF\1F\E0\FD\03\BC\7F\00\02\00\07\00\80\07\98\\\03\007\00\80\07\98\\\0C\00\17\00\80\07\98L\EF\1F\E0\FD\03\BC\7F\00\00\00\17\04\80
\07\98L\0C\02\C7\00\00\80\10\\\00\03\07\00\00\08\10\\\EF\1F\E0!\03\BC\7F\00\02\F0\07\19\00\00\00\01\02\02\07\00\00\00\95\EF\06\00'\00\80\07\98\\\EF\1F\E0\FD\03\BC\7F\00\07\007\00\80\07\98\\\06\00g\00\80\07\98\\\07\00w\00\80\07\98\\\EF\1F\E0!\03\BC\7F\00\02\F0\87\18\00\00\00\01\02\02\07\00\00\
|
||
|
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([26033 x i8], [26033 x i8]* @0, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
|
||
|
@__cuda_gpubin_handle = internal global i8** null, align 8
|
||
|
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]
|
||
|
|
||
|
; Function Attrs: noinline optnone uwtable
|
||
|
define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
|
||
|
entry:
|
||
|
%height.addr = alloca i64, align 8
|
||
|
%knodesD.addr = alloca %struct.knode*, align 8
|
||
|
%knodes_elem.addr = alloca i64, align 8
|
||
|
%currKnodeD.addr = alloca i64*, align 8
|
||
|
%offsetD.addr = alloca i64*, align 8
|
||
|
%lastKnodeD.addr = alloca i64*, align 8
|
||
|
%offset_2D.addr = alloca i64*, align 8
|
||
|
%startD.addr = alloca i32*, align 8
|
||
|
%endD.addr = alloca i32*, align 8
|
||
|
%RecstartD.addr = alloca i32*, align 8
|
||
|
%ReclenD.addr = alloca i32*, align 8
|
||
|
%grid_dim = alloca %struct.dim3, align 8
|
||
|
%block_dim = alloca %struct.dim3, align 8
|
||
|
%shmem_size = alloca i64, align 8
|
||
|
%stream = alloca i8*, align 8
|
||
|
%grid_dim.coerce = alloca { i64, i32 }, align 8
|
||
|
%block_dim.coerce = alloca { i64, i32 }, align 8
|
||
|
store i64 %height, i64* %height.addr, align 8
|
||
|
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
|
||
|
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||
|
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
|
||
|
store i64* %offsetD, i64** %offsetD.addr, align 8
|
||
|
store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
|
||
|
store i64* %offset_2D, i64** %offset_2D.addr, align 8
|
||
|
store i32* %startD, i32** %startD.addr, align 8
|
||
|
store i32* %endD, i32** %endD.addr, align 8
|
||
|
store i32* %RecstartD, i32** %RecstartD.addr, align 8
|
||
|
store i32* %ReclenD, i32** %ReclenD.addr, align 8
|
||
|
%kernel_args = alloca i8*, i64 11, align 16
|
||
|
%0 = bitcast i64* %height.addr to i8*
|
||
|
%1 = getelementptr i8*, i8** %kernel_args, i32 0
|
||
|
store i8* %0, i8** %1
|
||
|
%2 = bitcast %struct.knode** %knodesD.addr to i8*
|
||
|
%3 = getelementptr i8*, i8** %kernel_args, i32 1
|
||
|
store i8* %2, i8** %3
|
||
|
%4 = bitcast i64* %knodes_elem.addr to i8*
|
||
|
%5 = getelementptr i8*, i8** %kernel_args, i32 2
|
||
|
store i8* %4, i8** %5
|
||
|
%6 = bitcast i64** %currKnodeD.addr to i8*
|
||
|
%7 = getelementptr i8*, i8** %kernel_args, i32 3
|
||
|
store i8* %6, i8** %7
|
||
|
%8 = bitcast i64** %offsetD.addr to i8*
|
||
|
%9 = getelementptr i8*, i8** %kernel_args, i32 4
|
||
|
store i8* %8, i8** %9
|
||
|
%10 = bitcast i64** %lastKnodeD.addr to i8*
|
||
|
%11 = getelementptr i8*, i8** %kernel_args, i32 5
|
||
|
store i8* %10, i8** %11
|
||
|
%12 = bitcast i64** %offset_2D.addr to i8*
|
||
|
%13 = getelementptr i8*, i8** %kernel_args, i32 6
|
||
|
store i8* %12, i8** %13
|
||
|
%14 = bitcast i32** %startD.addr to i8*
|
||
|
%15 = getelementptr i8*, i8** %kernel_args, i32 7
|
||
|
store i8* %14, i8** %15
|
||
|
%16 = bitcast i32** %endD.addr to i8*
|
||
|
%17 = getelementptr i8*, i8** %kernel_args, i32 8
|
||
|
store i8* %16, i8** %17
|
||
|
%18 = bitcast i32** %RecstartD.addr to i8*
|
||
|
%19 = getelementptr i8*, i8** %kernel_args, i32 9
|
||
|
store i8* %18, i8** %19
|
||
|
%20 = bitcast i32** %ReclenD.addr to i8*
|
||
|
%21 = getelementptr i8*, i8** %kernel_args, i32 10
|
||
|
store i8* %20, i8** %21
|
||
|
%22 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
|
||
|
%23 = load i64, i64* %shmem_size, align 8
|
||
|
%24 = load i8*, i8** %stream, align 8
|
||
|
%25 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
|
||
|
%26 = bitcast %struct.dim3* %grid_dim to i8*
|
||
|
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false)
|
||
|
%27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
|
||
|
%28 = load i64, i64* %27, align 8
|
||
|
%29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
|
||
|
%30 = load i32, i32* %29, align 8
|
||
|
%31 = bitcast { i64, i32 }* %block_dim.coerce to i8*
|
||
|
%32 = bitcast %struct.dim3* %block_dim to i8*
|
||
|
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false)
|
||
|
%33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
|
||
|
%34 = load i64, i64* %33, align 8
|
||
|
%35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
|
||
|
%36 = load i32, i32* %35, align 8
|
||
|
%37 = bitcast i8* %24 to %struct.CUstream_st*
|
||
|
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK to i8*), i64 %28, i32 %30, i64 %34, i32 %36, i8** %kernel_args, i64 %23, %struct.CUstream_st* %37)
|
||
|
br label %setup.end
|
||
|
|
||
|
setup.end: ; preds = %entry
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)
|
||
|
|
||
|
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)
|
||
|
|
||
|
; Function Attrs: argmemonly nounwind willreturn
|
||
|
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
|
||
|
|
||
|
; Function Attrs: noinline optnone uwtable
|
||
|
define dso_local void @kernel_gpu_cuda_wrapper_2(%struct.knode* %knodes, i64 %knodes_elem, i64 %knodes_mem, i32 %order, i64 %maxheight, i32 %count, i64* %currKnode, i64* %offset, i64* %lastKnode, i64* %offset_2, i32* %start, i32* %end, i32* %recstart, i32* %reclength) #0 {
|
||
|
entry:
|
||
|
%knodes.addr = alloca %struct.knode*, align 8
|
||
|
%knodes_elem.addr = alloca i64, align 8
|
||
|
%knodes_mem.addr = alloca i64, align 8
|
||
|
%order.addr = alloca i32, align 4
|
||
|
%maxheight.addr = alloca i64, align 8
|
||
|
%count.addr = alloca i32, align 4
|
||
|
%currKnode.addr = alloca i64*, align 8
|
||
|
%offset.addr = alloca i64*, align 8
|
||
|
%lastKnode.addr = alloca i64*, align 8
|
||
|
%offset_2.addr = alloca i64*, align 8
|
||
|
%start.addr = alloca i32*, align 8
|
||
|
%end.addr = alloca i32*, align 8
|
||
|
%recstart.addr = alloca i32*, align 8
|
||
|
%reclength.addr = alloca i32*, align 8
|
||
|
%time0 = alloca i64, align 8
|
||
|
%time1 = alloca i64, align 8
|
||
|
%time2 = alloca i64, align 8
|
||
|
%time3 = alloca i64, align 8
|
||
|
%time4 = alloca i64, align 8
|
||
|
%time5 = alloca i64, align 8
|
||
|
%time6 = alloca i64, align 8
|
||
|
%numBlocks = alloca i32, align 4
|
||
|
%threadsPerBlock = alloca i32, align 4
|
||
|
%knodesD = alloca %struct.knode*, align 8
|
||
|
%currKnodeD = alloca i64*, align 8
|
||
|
%offsetD = alloca i64*, align 8
|
||
|
%lastKnodeD = alloca i64*, align 8
|
||
|
%offset_2D = alloca i64*, align 8
|
||
|
%startD = alloca i32*, align 8
|
||
|
%endD = alloca i32*, align 8
|
||
|
%ansDStart = alloca i32*, align 8
|
||
|
%ansDLength = alloca i32*, align 8
|
||
|
%agg.tmp = alloca %struct.dim3, align 4
|
||
|
%agg.tmp54 = alloca %struct.dim3, align 4
|
||
|
%agg.tmp.coerce = alloca { i64, i32 }, align 4
|
||
|
%agg.tmp54.coerce = alloca { i64, i32 }, align 4
|
||
|
store %struct.knode* %knodes, %struct.knode** %knodes.addr, align 8
|
||
|
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||
|
store i64 %knodes_mem, i64* %knodes_mem.addr, align 8
|
||
|
store i32 %order, i32* %order.addr, align 4
|
||
|
store i64 %maxheight, i64* %maxheight.addr, align 8
|
||
|
store i32 %count, i32* %count.addr, align 4
|
||
|
store i64* %currKnode, i64** %currKnode.addr, align 8
|
||
|
store i64* %offset, i64** %offset.addr, align 8
|
||
|
store i64* %lastKnode, i64** %lastKnode.addr, align 8
|
||
|
store i64* %offset_2, i64** %offset_2.addr, align 8
|
||
|
store i32* %start, i32** %start.addr, align 8
|
||
|
store i32* %end, i32** %end.addr, align 8
|
||
|
store i32* %recstart, i32** %recstart.addr, align 8
|
||
|
store i32* %reclength, i32** %reclength.addr, align 8
|
||
|
%call = call i64 @get_time()
|
||
|
store i64 %call, i64* %time0, align 8
|
||
|
%call1 = call i32 @cudaThreadSynchronize()
|
||
|
%0 = load i32, i32* %count.addr, align 4
|
||
|
store i32 %0, i32* %numBlocks, align 4
|
||
|
%1 = load i32, i32* %order.addr, align 4
|
||
|
%cmp = icmp slt i32 %1, 1024
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%2 = load i32, i32* %order.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %2, %cond.true ], [ 1024, %cond.false ]
|
||
|
store i32 %cond, i32* %threadsPerBlock, align 4
|
||
|
%3 = load i32, i32* %numBlocks, align 4
|
||
|
%4 = load i32, i32* %threadsPerBlock, align 4
|
||
|
%call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i64 0, i64 0), i32 %3, i32 %4)
|
||
|
%call3 = call i64 @get_time()
|
||
|
store i64 %call3, i64* %time1, align 8
|
||
|
%5 = bitcast %struct.knode** %knodesD to i8**
|
||
|
%6 = load i64, i64* %knodes_mem.addr, align 8
|
||
|
%call4 = call i32 @cudaMalloc(i8** %5, i64 %6)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i64 0, i64 0))
|
||
|
%7 = bitcast i64** %currKnodeD to i8**
|
||
|
%8 = load i32, i32* %count.addr, align 4
|
||
|
%conv = sext i32 %8 to i64
|
||
|
%mul = mul i64 %conv, 8
|
||
|
%call5 = call i32 @cudaMalloc(i8** %7, i64 %mul)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.2, i64 0, i64 0))
|
||
|
%9 = bitcast i64** %offsetD to i8**
|
||
|
%10 = load i32, i32* %count.addr, align 4
|
||
|
%conv6 = sext i32 %10 to i64
|
||
|
%mul7 = mul i64 %conv6, 8
|
||
|
%call8 = call i32 @cudaMalloc(i8** %9, i64 %mul7)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.3, i64 0, i64 0))
|
||
|
%11 = bitcast i64** %lastKnodeD to i8**
|
||
|
%12 = load i32, i32* %count.addr, align 4
|
||
|
%conv9 = sext i32 %12 to i64
|
||
|
%mul10 = mul i64 %conv9, 8
|
||
|
%call11 = call i32 @cudaMalloc(i8** %11, i64 %mul10)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.4, i64 0, i64 0))
|
||
|
%13 = bitcast i64** %offset_2D to i8**
|
||
|
%14 = load i32, i32* %count.addr, align 4
|
||
|
%conv12 = sext i32 %14 to i64
|
||
|
%mul13 = mul i64 %conv12, 8
|
||
|
%call14 = call i32 @cudaMalloc(i8** %13, i64 %mul13)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.5, i64 0, i64 0))
|
||
|
%15 = bitcast i32** %startD to i8**
|
||
|
%16 = load i32, i32* %count.addr, align 4
|
||
|
%conv15 = sext i32 %16 to i64
|
||
|
%mul16 = mul i64 %conv15, 4
|
||
|
%call17 = call i32 @cudaMalloc(i8** %15, i64 %mul16)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.6, i64 0, i64 0))
|
||
|
%17 = bitcast i32** %endD to i8**
|
||
|
%18 = load i32, i32* %count.addr, align 4
|
||
|
%conv18 = sext i32 %18 to i64
|
||
|
%mul19 = mul i64 %conv18, 4
|
||
|
%call20 = call i32 @cudaMalloc(i8** %17, i64 %mul19)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0))
|
||
|
%19 = bitcast i32** %ansDStart to i8**
|
||
|
%20 = load i32, i32* %count.addr, align 4
|
||
|
%conv21 = sext i32 %20 to i64
|
||
|
%mul22 = mul i64 %conv21, 4
|
||
|
%call23 = call i32 @cudaMalloc(i8** %19, i64 %mul22)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.8, i64 0, i64 0))
|
||
|
%21 = bitcast i32** %ansDLength to i8**
|
||
|
%22 = load i32, i32* %count.addr, align 4
|
||
|
%conv24 = sext i32 %22 to i64
|
||
|
%mul25 = mul i64 %conv24, 4
|
||
|
%call26 = call i32 @cudaMalloc(i8** %21, i64 %mul25)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.9, i64 0, i64 0))
|
||
|
%call27 = call i64 @get_time()
|
||
|
store i64 %call27, i64* %time2, align 8
|
||
|
%23 = load %struct.knode*, %struct.knode** %knodesD, align 8
|
||
|
%24 = bitcast %struct.knode* %23 to i8*
|
||
|
%25 = load %struct.knode*, %struct.knode** %knodes.addr, align 8
|
||
|
%26 = bitcast %struct.knode* %25 to i8*
|
||
|
%27 = load i64, i64* %knodes_mem.addr, align 8
|
||
|
%call28 = call i32 @cudaMemcpy(i8* %24, i8* %26, i64 %27, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.10, i64 0, i64 0))
|
||
|
%28 = load i64*, i64** %currKnodeD, align 8
|
||
|
%29 = bitcast i64* %28 to i8*
|
||
|
%30 = load i64*, i64** %currKnode.addr, align 8
|
||
|
%31 = bitcast i64* %30 to i8*
|
||
|
%32 = load i32, i32* %count.addr, align 4
|
||
|
%conv29 = sext i32 %32 to i64
|
||
|
%mul30 = mul i64 %conv29, 8
|
||
|
%call31 = call i32 @cudaMemcpy(i8* %29, i8* %31, i64 %mul30, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.11, i64 0, i64 0))
|
||
|
%33 = load i64*, i64** %offsetD, align 8
|
||
|
%34 = bitcast i64* %33 to i8*
|
||
|
%35 = load i64*, i64** %offset.addr, align 8
|
||
|
%36 = bitcast i64* %35 to i8*
|
||
|
%37 = load i32, i32* %count.addr, align 4
|
||
|
%conv32 = sext i32 %37 to i64
|
||
|
%mul33 = mul i64 %conv32, 8
|
||
|
%call34 = call i32 @cudaMemcpy(i8* %34, i8* %36, i64 %mul33, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.12, i64 0, i64 0))
|
||
|
%38 = load i64*, i64** %lastKnodeD, align 8
|
||
|
%39 = bitcast i64* %38 to i8*
|
||
|
%40 = load i64*, i64** %lastKnode.addr, align 8
|
||
|
%41 = bitcast i64* %40 to i8*
|
||
|
%42 = load i32, i32* %count.addr, align 4
|
||
|
%conv35 = sext i32 %42 to i64
|
||
|
%mul36 = mul i64 %conv35, 8
|
||
|
%call37 = call i32 @cudaMemcpy(i8* %39, i8* %41, i64 %mul36, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.13, i64 0, i64 0))
|
||
|
%43 = load i64*, i64** %offset_2D, align 8
|
||
|
%44 = bitcast i64* %43 to i8*
|
||
|
%45 = load i64*, i64** %offset_2.addr, align 8
|
||
|
%46 = bitcast i64* %45 to i8*
|
||
|
%47 = load i32, i32* %count.addr, align 4
|
||
|
%conv38 = sext i32 %47 to i64
|
||
|
%mul39 = mul i64 %conv38, 8
|
||
|
%call40 = call i32 @cudaMemcpy(i8* %44, i8* %46, i64 %mul39, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.14, i64 0, i64 0))
|
||
|
%48 = load i32*, i32** %startD, align 8
|
||
|
%49 = bitcast i32* %48 to i8*
|
||
|
%50 = load i32*, i32** %start.addr, align 8
|
||
|
%51 = bitcast i32* %50 to i8*
|
||
|
%52 = load i32, i32* %count.addr, align 4
|
||
|
%conv41 = sext i32 %52 to i64
|
||
|
%mul42 = mul i64 %conv41, 4
|
||
|
%call43 = call i32 @cudaMemcpy(i8* %49, i8* %51, i64 %mul42, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.15, i64 0, i64 0))
|
||
|
%53 = load i32*, i32** %endD, align 8
|
||
|
%54 = bitcast i32* %53 to i8*
|
||
|
%55 = load i32*, i32** %end.addr, align 8
|
||
|
%56 = bitcast i32* %55 to i8*
|
||
|
%57 = load i32, i32* %count.addr, align 4
|
||
|
%conv44 = sext i32 %57 to i64
|
||
|
%mul45 = mul i64 %conv44, 4
|
||
|
%call46 = call i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul45, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.16, i64 0, i64 0))
|
||
|
%58 = load i32*, i32** %ansDStart, align 8
|
||
|
%59 = bitcast i32* %58 to i8*
|
||
|
%60 = load i32*, i32** %recstart.addr, align 8
|
||
|
%61 = bitcast i32* %60 to i8*
|
||
|
%62 = load i32, i32* %count.addr, align 4
|
||
|
%conv47 = sext i32 %62 to i64
|
||
|
%mul48 = mul i64 %conv47, 4
|
||
|
%call49 = call i32 @cudaMemcpy(i8* %59, i8* %61, i64 %mul48, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0))
|
||
|
%63 = load i32*, i32** %ansDLength, align 8
|
||
|
%64 = bitcast i32* %63 to i8*
|
||
|
%65 = load i32*, i32** %reclength.addr, align 8
|
||
|
%66 = bitcast i32* %65 to i8*
|
||
|
%67 = load i32, i32* %count.addr, align 4
|
||
|
%conv50 = sext i32 %67 to i64
|
||
|
%mul51 = mul i64 %conv50, 4
|
||
|
%call52 = call i32 @cudaMemcpy(i8* %64, i8* %66, i64 %mul51, i32 1)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0))
|
||
|
%call53 = call i64 @get_time()
|
||
|
store i64 %call53, i64* %time3, align 8
|
||
|
%68 = load i32, i32* %numBlocks, align 4
|
||
|
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %68, i32 1, i32 1)
|
||
|
%69 = load i32, i32* %threadsPerBlock, align 4
|
||
|
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp54, i32 %69, i32 1, i32 1)
|
||
|
%70 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
|
||
|
%71 = bitcast %struct.dim3* %agg.tmp to i8*
|
||
|
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %70, i8* align 4 %71, i64 12, i1 false)
|
||
|
%72 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
|
||
|
%73 = load i64, i64* %72, align 4
|
||
|
%74 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
|
||
|
%75 = load i32, i32* %74, align 4
|
||
|
%76 = bitcast { i64, i32 }* %agg.tmp54.coerce to i8*
|
||
|
%77 = bitcast %struct.dim3* %agg.tmp54 to i8*
|
||
|
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %76, i8* align 4 %77, i64 12, i1 false)
|
||
|
%78 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 0
|
||
|
%79 = load i64, i64* %78, align 4
|
||
|
%80 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 1
|
||
|
%81 = load i32, i32* %80, align 4
|
||
|
%call55 = call i32 @__cudaPushCallConfiguration(i64 %73, i32 %75, i64 %79, i32 %81, i64 0, i8* null)
|
||
|
%tobool = icmp ne i32 %call55, 0
|
||
|
br i1 %tobool, label %kcall.end, label %kcall.configok
|
||
|
|
||
|
kcall.configok: ; preds = %cond.end
|
||
|
%82 = load i64, i64* %maxheight.addr, align 8
|
||
|
%83 = load %struct.knode*, %struct.knode** %knodesD, align 8
|
||
|
%84 = load i64, i64* %knodes_elem.addr, align 8
|
||
|
%85 = load i64*, i64** %currKnodeD, align 8
|
||
|
%86 = load i64*, i64** %offsetD, align 8
|
||
|
%87 = load i64*, i64** %lastKnodeD, align 8
|
||
|
%88 = load i64*, i64** %offset_2D, align 8
|
||
|
%89 = load i32*, i32** %startD, align 8
|
||
|
%90 = load i32*, i32** %endD, align 8
|
||
|
%91 = load i32*, i32** %ansDStart, align 8
|
||
|
%92 = load i32*, i32** %ansDLength, align 8
|
||
|
call void @findRangeK(i64 %82, %struct.knode* %83, i64 %84, i64* %85, i64* %86, i64* %87, i64* %88, i32* %89, i32* %90, i32* %91, i32* %92)
|
||
|
br label %kcall.end
|
||
|
|
||
|
kcall.end: ; preds = %kcall.configok, %cond.end
|
||
|
%call56 = call i32 @cudaThreadSynchronize()
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0))
|
||
|
%call57 = call i64 @get_time()
|
||
|
store i64 %call57, i64* %time4, align 8
|
||
|
%93 = load i32*, i32** %recstart.addr, align 8
|
||
|
%94 = bitcast i32* %93 to i8*
|
||
|
%95 = load i32*, i32** %ansDStart, align 8
|
||
|
%96 = bitcast i32* %95 to i8*
|
||
|
%97 = load i32, i32* %count.addr, align 4
|
||
|
%conv58 = sext i32 %97 to i64
|
||
|
%mul59 = mul i64 %conv58, 4
|
||
|
%call60 = call i32 @cudaMemcpy(i8* %94, i8* %96, i64 %mul59, i32 2)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0))
|
||
|
%98 = load i32*, i32** %reclength.addr, align 8
|
||
|
%99 = bitcast i32* %98 to i8*
|
||
|
%100 = load i32*, i32** %ansDLength, align 8
|
||
|
%101 = bitcast i32* %100 to i8*
|
||
|
%102 = load i32, i32* %count.addr, align 4
|
||
|
%conv61 = sext i32 %102 to i64
|
||
|
%mul62 = mul i64 %conv61, 4
|
||
|
%call63 = call i32 @cudaMemcpy(i8* %99, i8* %101, i64 %mul62, i32 2)
|
||
|
call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0))
|
||
|
%call64 = call i64 @get_time()
|
||
|
store i64 %call64, i64* %time5, align 8
|
||
|
%103 = load %struct.knode*, %struct.knode** %knodesD, align 8
|
||
|
%104 = bitcast %struct.knode* %103 to i8*
|
||
|
%call65 = call i32 @cudaFree(i8* %104)
|
||
|
%105 = load i64*, i64** %currKnodeD, align 8
|
||
|
%106 = bitcast i64* %105 to i8*
|
||
|
%call66 = call i32 @cudaFree(i8* %106)
|
||
|
%107 = load i64*, i64** %offsetD, align 8
|
||
|
%108 = bitcast i64* %107 to i8*
|
||
|
%call67 = call i32 @cudaFree(i8* %108)
|
||
|
%109 = load i64*, i64** %lastKnodeD, align 8
|
||
|
%110 = bitcast i64* %109 to i8*
|
||
|
%call68 = call i32 @cudaFree(i8* %110)
|
||
|
%111 = load i64*, i64** %offset_2D, align 8
|
||
|
%112 = bitcast i64* %111 to i8*
|
||
|
%call69 = call i32 @cudaFree(i8* %112)
|
||
|
%113 = load i32*, i32** %startD, align 8
|
||
|
%114 = bitcast i32* %113 to i8*
|
||
|
%call70 = call i32 @cudaFree(i8* %114)
|
||
|
%115 = load i32*, i32** %endD, align 8
|
||
|
%116 = bitcast i32* %115 to i8*
|
||
|
%call71 = call i32 @cudaFree(i8* %116)
|
||
|
%117 = load i32*, i32** %ansDStart, align 8
|
||
|
%118 = bitcast i32* %117 to i8*
|
||
|
%call72 = call i32 @cudaFree(i8* %118)
|
||
|
%119 = load i32*, i32** %ansDLength, align 8
|
||
|
%120 = bitcast i32* %119 to i8*
|
||
|
%call73 = call i32 @cudaFree(i8* %120)
|
||
|
%call74 = call i64 @get_time()
|
||
|
store i64 %call74, i64* %time6, align 8
|
||
|
%call75 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.20, i64 0, i64 0))
|
||
|
%121 = load i64, i64* %time1, align 8
|
||
|
%122 = load i64, i64* %time0, align 8
|
||
|
%sub = sub nsw i64 %121, %122
|
||
|
%conv76 = sitofp i64 %sub to float
|
||
|
%div = fdiv float %conv76, 1.000000e+06
|
||
|
%conv77 = fpext float %div to double
|
||
|
%123 = load i64, i64* %time1, align 8
|
||
|
%124 = load i64, i64* %time0, align 8
|
||
|
%sub78 = sub nsw i64 %123, %124
|
||
|
%conv79 = sitofp i64 %sub78 to float
|
||
|
%125 = load i64, i64* %time6, align 8
|
||
|
%126 = load i64, i64* %time0, align 8
|
||
|
%sub80 = sub nsw i64 %125, %126
|
||
|
%conv81 = sitofp i64 %sub80 to float
|
||
|
%div82 = fdiv float %conv79, %conv81
|
||
|
%mul83 = fmul contract float %div82, 1.000000e+02
|
||
|
%conv84 = fpext float %mul83 to double
|
||
|
%call85 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.21, i64 0, i64 0), double %conv77, double %conv84)
|
||
|
%127 = load i64, i64* %time2, align 8
|
||
|
%128 = load i64, i64* %time1, align 8
|
||
|
%sub86 = sub nsw i64 %127, %128
|
||
|
%conv87 = sitofp i64 %sub86 to float
|
||
|
%div88 = fdiv float %conv87, 1.000000e+06
|
||
|
%conv89 = fpext float %div88 to double
|
||
|
%129 = load i64, i64* %time2, align 8
|
||
|
%130 = load i64, i64* %time1, align 8
|
||
|
%sub90 = sub nsw i64 %129, %130
|
||
|
%conv91 = sitofp i64 %sub90 to float
|
||
|
%131 = load i64, i64* %time6, align 8
|
||
|
%132 = load i64, i64* %time0, align 8
|
||
|
%sub92 = sub nsw i64 %131, %132
|
||
|
%conv93 = sitofp i64 %sub92 to float
|
||
|
%div94 = fdiv float %conv91, %conv93
|
||
|
%mul95 = fmul contract float %div94, 1.000000e+02
|
||
|
%conv96 = fpext float %mul95 to double
|
||
|
%call97 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.22, i64 0, i64 0), double %conv89, double %conv96)
|
||
|
%133 = load i64, i64* %time3, align 8
|
||
|
%134 = load i64, i64* %time2, align 8
|
||
|
%sub98 = sub nsw i64 %133, %134
|
||
|
%conv99 = sitofp i64 %sub98 to float
|
||
|
%div100 = fdiv float %conv99, 1.000000e+06
|
||
|
%conv101 = fpext float %div100 to double
|
||
|
%135 = load i64, i64* %time3, align 8
|
||
|
%136 = load i64, i64* %time2, align 8
|
||
|
%sub102 = sub nsw i64 %135, %136
|
||
|
%conv103 = sitofp i64 %sub102 to float
|
||
|
%137 = load i64, i64* %time6, align 8
|
||
|
%138 = load i64, i64* %time0, align 8
|
||
|
%sub104 = sub nsw i64 %137, %138
|
||
|
%conv105 = sitofp i64 %sub104 to float
|
||
|
%div106 = fdiv float %conv103, %conv105
|
||
|
%mul107 = fmul contract float %div106, 1.000000e+02
|
||
|
%conv108 = fpext float %mul107 to double
|
||
|
%call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.23, i64 0, i64 0), double %conv101, double %conv108)
|
||
|
%139 = load i64, i64* %time4, align 8
|
||
|
%140 = load i64, i64* %time3, align 8
|
||
|
%sub110 = sub nsw i64 %139, %140
|
||
|
%conv111 = sitofp i64 %sub110 to float
|
||
|
%div112 = fdiv float %conv111, 1.000000e+06
|
||
|
%conv113 = fpext float %div112 to double
|
||
|
%141 = load i64, i64* %time4, align 8
|
||
|
%142 = load i64, i64* %time3, align 8
|
||
|
%sub114 = sub nsw i64 %141, %142
|
||
|
%conv115 = sitofp i64 %sub114 to float
|
||
|
%143 = load i64, i64* %time6, align 8
|
||
|
%144 = load i64, i64* %time0, align 8
|
||
|
%sub116 = sub nsw i64 %143, %144
|
||
|
%conv117 = sitofp i64 %sub116 to float
|
||
|
%div118 = fdiv float %conv115, %conv117
|
||
|
%mul119 = fmul contract float %div118, 1.000000e+02
|
||
|
%conv120 = fpext float %mul119 to double
|
||
|
%call121 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.24, i64 0, i64 0), double %conv113, double %conv120)
|
||
|
%145 = load i64, i64* %time5, align 8
|
||
|
%146 = load i64, i64* %time4, align 8
|
||
|
%sub122 = sub nsw i64 %145, %146
|
||
|
%conv123 = sitofp i64 %sub122 to float
|
||
|
%div124 = fdiv float %conv123, 1.000000e+06
|
||
|
%conv125 = fpext float %div124 to double
|
||
|
%147 = load i64, i64* %time5, align 8
|
||
|
%148 = load i64, i64* %time4, align 8
|
||
|
%sub126 = sub nsw i64 %147, %148
|
||
|
%conv127 = sitofp i64 %sub126 to float
|
||
|
%149 = load i64, i64* %time6, align 8
|
||
|
%150 = load i64, i64* %time0, align 8
|
||
|
%sub128 = sub nsw i64 %149, %150
|
||
|
%conv129 = sitofp i64 %sub128 to float
|
||
|
%div130 = fdiv float %conv127, %conv129
|
||
|
%mul131 = fmul contract float %div130, 1.000000e+02
|
||
|
%conv132 = fpext float %mul131 to double
|
||
|
%call133 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.25, i64 0, i64 0), double %conv125, double %conv132)
|
||
|
%151 = load i64, i64* %time6, align 8
|
||
|
%152 = load i64, i64* %time5, align 8
|
||
|
%sub134 = sub nsw i64 %151, %152
|
||
|
%conv135 = sitofp i64 %sub134 to float
|
||
|
%div136 = fdiv float %conv135, 1.000000e+06
|
||
|
%conv137 = fpext float %div136 to double
|
||
|
%153 = load i64, i64* %time6, align 8
|
||
|
%154 = load i64, i64* %time5, align 8
|
||
|
%sub138 = sub nsw i64 %153, %154
|
||
|
%conv139 = sitofp i64 %sub138 to float
|
||
|
%155 = load i64, i64* %time6, align 8
|
||
|
%156 = load i64, i64* %time0, align 8
|
||
|
%sub140 = sub nsw i64 %155, %156
|
||
|
%conv141 = sitofp i64 %sub140 to float
|
||
|
%div142 = fdiv float %conv139, %conv141
|
||
|
%mul143 = fmul contract float %div142, 1.000000e+02
|
||
|
%conv144 = fpext float %mul143 to double
|
||
|
%call145 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.26, i64 0, i64 0), double %conv137, double %conv144)
|
||
|
%call146 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.27, i64 0, i64 0))
|
||
|
%157 = load i64, i64* %time6, align 8
|
||
|
%158 = load i64, i64* %time0, align 8
|
||
|
%sub147 = sub nsw i64 %157, %158
|
||
|
%conv148 = sitofp i64 %sub147 to float
|
||
|
%div149 = fdiv float %conv148, 1.000000e+06
|
||
|
%conv150 = fpext float %div149 to double
|
||
|
%call151 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.28, i64 0, i64 0), double %conv150)
|
||
|
ret void
|
||
|
}

; ---------------------------------------------------------------------------
; External declarations used by the host-side wrapper function above.
; (Definitions live elsewhere in the project / CUDA runtime.)
; ---------------------------------------------------------------------------

; Host timing helper, defined elsewhere in the project; returns a timestamp
; consumed by the microsecond-based printf epilogue above (divisions by 1e6).
declare dso_local i64 @get_time() #2

; NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in
; favor of cudaDeviceSynchronize (identical behavior); worth modernizing in
; the originating .cu source.
declare dso_local i32 @cudaThreadSynchronize() #2

declare dso_local i32 @printf(i8*, ...) #2

declare dso_local i32 @cudaMalloc(i8**, i64) #2

; Project-local error-reporting helper; presumably receives the label strings
; declared at the top of the module (e.g. "cudaMalloc recordsD") — call sites
; are in the wrapper function above this chunk.
declare dso_local void @checkCUDAError(i8*) #2

declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2

; Emitted by clang for each <<<grid, block, smem, stream>>> launch expression:
; pushes the launch configuration before calling the kernel stub.
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2
; Function Attrs: noinline nounwind optnone uwtable
; dim3::dim3(unsigned, unsigned, unsigned) — host-side constructor for the
; kernel-launch configuration struct, emitted linkonce_odr/comdat so multiple
; TUs can share one copy. Unoptimized (-O0 style): all arguments are spilled
; to stack slots, then copied field-by-field into *this.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #3 comdat align 2 {
entry:
  ; Spill slots for `this` and the three constructor arguments.
  %this.addr = alloca %struct.dim3*, align 8
  %vx.addr = alloca i32, align 4
  %vy.addr = alloca i32, align 4
  %vz.addr = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
  store i32 %vx, i32* %vx.addr, align 4
  store i32 %vy, i32* %vy.addr, align 4
  store i32 %vz, i32* %vz.addr, align 4
  %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8
  ; this->x = vx  (field 0 of %struct.dim3)
  %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0
  %0 = load i32, i32* %vx.addr, align 4
  store i32 %0, i32* %x, align 4
  ; this->y = vy  (field 1)
  %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1
  %1 = load i32, i32* %vy.addr, align 4
  store i32 %1, i32* %y, align 4
  ; this->z = vz  (field 2)
  %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2
  %2 = load i32, i32* %vz.addr, align 4
  store i32 %2, i32* %z, align 4
  ret void
}

declare dso_local i32 @cudaFree(i8*) #2

; Registers this module's __global__ entry points with the CUDA runtime so the
; host stub can launch them by handle. %0 is the fatbinary handle returned by
; __cudaRegisterFatBinary. This module registers a single kernel, @findRangeK
; (host-visible name taken from @.str.19); the -1 thread-limit and null
; size/dim arguments are the standard clang-emitted defaults.
define internal void @__cuda_register_globals(i8** %0) {
entry:
  %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK to i8*), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

; CUDA runtime registration entry points (provided by libcudart; invoked from
; the module ctor / __cuda_register_globals below).
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; Module constructor (run at program startup): registers the embedded device
; fatbinary with the CUDA runtime, saves the returned handle, registers this
; module's kernels, finalizes registration, and queues the matching dtor via
; atexit so the fatbinary is unregistered at normal process exit.
define internal void @__cuda_module_ctor(i8* %0) {
entry:
  %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %1, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %1)
  call void @__cudaRegisterFatBinaryEnd(i8** %1)
  ; atexit's return code is intentionally ignored (clang-generated stub).
  %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

declare dso_local void @__cudaUnregisterFatBinary(i8**)

; Module destructor (registered with atexit by __cuda_module_ctor): reloads
; the fatbinary handle saved at startup and unregisters it from the runtime.
; The i8* argument is unused — it exists only to match the atexit-compatible
; signature clang emits.
define internal void @__cuda_module_dtor(i8* %0) {
entry:
  %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %1)
  ret void
}

declare dso_local i32 @atexit(void (i8*)*)

; Attribute sets referenced by the definitions above.
; #0/#3: unoptimized (noinline optnone) host function bodies; #3 adds nounwind.
; #1: argmemonly nounwind willreturn (attached to intrinsic-style callees).
; #2: plain external declarations (target/FP-environment attributes only).
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

; Module metadata: CUDA SDK 10.1, 4-byte wchar_t, producing-compiler ident.
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}