; ModuleID = 'kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc' source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 } %struct.dim3 = type { i32, i32, i32 } %struct.CUstream_st = type opaque $_ZN4dim3C2Ejjj = comdat any @.str = private unnamed_addr constant [75 x i8] c"# of blocks = %d, # of threads/block = %d (ensure that device can handle)\0A\00", align 1 @.str.1 = private unnamed_addr constant [21 x i8] c"cudaMalloc recordsD\00", align 1 @.str.2 = private unnamed_addr constant [23 x i8] c"cudaMalloc currKnodeD\00", align 1 @.str.3 = private unnamed_addr constant [20 x i8] c"cudaMalloc offsetD\00", align 1 @.str.4 = private unnamed_addr constant [23 x i8] c"cudaMalloc lastKnodeD\00", align 1 @.str.5 = private unnamed_addr constant [22 x i8] c"cudaMalloc offset_2D\00", align 1 @.str.6 = private unnamed_addr constant [18 x i8] c"cudaMalloc startD\00", align 1 @.str.7 = private unnamed_addr constant [16 x i8] c"cudaMalloc endD\00", align 1 @.str.8 = private unnamed_addr constant [21 x i8] c"cudaMalloc ansDStart\00", align 1 @.str.9 = private unnamed_addr constant [22 x i8] c"cudaMalloc ansDLength\00", align 1 @.str.10 = private unnamed_addr constant [27 x i8] c"cudaMalloc cudaMemcpy memD\00", align 1 @.str.11 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy currKnodeD\00", align 1 @.str.12 = private unnamed_addr constant [30 x i8] c"cudaMalloc cudaMemcpy offsetD\00", align 1 @.str.13 = private unnamed_addr constant [33 x i8] c"cudaMalloc cudaMemcpy lastKnodeD\00", align 1 @.str.14 = private unnamed_addr constant [32 x i8] c"cudaMalloc cudaMemcpy offset_2D\00", align 1 @.str.15 = private unnamed_addr constant [18 x i8] c"cudaMemcpy startD\00", align 1 @.str.16 = private unnamed_addr constant [16 x i8] c"cudaMemcpy endD\00", align 1 @.str.17 = 
private unnamed_addr constant [21 x i8] c"cudaMemcpy ansDStart\00", align 1 @.str.18 = private unnamed_addr constant [22 x i8] c"cudaMemcpy ansDLength\00", align 1 @.str.19 = private unnamed_addr constant [11 x i8] c"findRangeK\00", align 1 @.str.20 = private unnamed_addr constant [52 x i8] c"Time spent in different stages of GPU_CUDA KERNEL:\0A\00", align 1 @.str.21 = private unnamed_addr constant [54 x i8] c"%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\0A\00", align 1 @.str.22 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: ALO\0A\00", align 1 @.str.23 = private unnamed_addr constant [41 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY IN\0A\00", align 1 @.str.24 = private unnamed_addr constant [36 x i8] c"%15.12f s, %15.12f % : GPU: KERNEL\0A\00", align 1 @.str.25 = private unnamed_addr constant [42 x i8] c"%15.12f s, %15.12f % : GPU MEM: COPY OUT\0A\00", align 1 @.str.26 = private unnamed_addr constant [37 x i8] c"%15.12f s, %15.12f % : GPU MEM: FRE\0A\00", align 1 @.str.27 = private unnamed_addr constant [13 x i8] c"Total time:\0A\00", align 1 @.str.28 = private unnamed_addr constant [9 x i8] c"%.12f s\0A\00", align 1 @0 = private constant [26033 x i8] c"P\EDU\BA\01\00\10\00\A0e\00\00\00\00\00\00\02\00\01\01@\00\00\00\C8V\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00 
V\00\00\00\00\00\00\E0S\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00.nv.constant0.findRangeK\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00findRangeK\00.text.findRangeK\00.nv.info.findRangeK\00.nv.shared.findRangeK\00.nv.global\00threadIdx\00blockIdx\00.nv.constant0.findRangeK\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00=\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00x\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\83\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\8D\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\96\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00\00O\00\00\00\00\00\00\04/\08\00\06\00\00\00\1C\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00h\00\00\00\04\11\08\00\06\00\00\00hprd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 
999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB3\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\0Avisible .entry findRangeK\93\04\00\8D\00\06\18\00\0Eg\04\0F \00\02\1F1 \00\0C\1F2 \00\0C\1F3 \00\0C\1F4 \00\0C\1F5 \00\0C\1F6 \00\0C\1F7 \00\0C\1F8 \00\0C\1F9 \01\0D\0FH\0B\14_6[104y\04\15\BCpred 
%p<11>]\0B\1E8^\0B\1F1\8F\04\0D\1F6`\0B\1A\00>\03\0F\E7\00\00\0Fh\03\01\1F0+\00\01\1F9*\00\00\1F9)\00\01\1F8)\00\00\1F8)\00\01\1F7)\00\00\1F7)\00\01\1F6)\00\00\1F6)\00\01\1F5)\00\00\1E5)\00\0F]\05\03\1F4)\00\01\1F3\86\05\04\0E1\03\0F\06\05\04\0Es\01\0F\DA\04\04\0E\C3\03\13]\E0\01#to\B7\14\05/\00\122\B8\04\04\B8\11\0B\1E\00\123\1E\00\1F2?\00\06\124!\00\1F0?\00\03\125\1E\00\1F4?\00\06\116!\00\1F9>\00\03\127>\00\1F6>\00\06\118!\00\1F8>\00\03\129>\00\1F8>\00\05!20!\00\1F7>\00\02\2221\B9\05\1F0>\00\06\028\01\1F6>\00\03\027\01\1F27\01\06\1227\01\1F5>\00\03\026\01\1F26\01\06\1226\01\1F4>\00\03\026\01\1F26\01\06$28\94\06\0F>\00\01\026\01/28\DE\06\19\1A9\17\00)16\C9\06\0C\E0\06*27\18\00\03\E1\06:d25\18\00\134u\00\1B2H\00\144w\00\0B\8E\00\135w\00\1B1\8F\00\136x\00\1B1x\00\137x\00\1B1x\00\148\ED\00\1B3\18\10k%tid.x\D7\11\05\D8\11\09t\0CN%cta-\00\1F9\CB\07\03\1B0&\00\146\BB\07\F2\01bra.uni LBB6_1;\0A\08\00\10:\F3\03\11s=\00Ed30,5\00\01\0B\04\04W\04$1,\B9\01\B0;\0Asetp.ge.s\1C\004p1,9\00\01(\00\A3;\0A@%p1 brag\00\1B6x\00\132x\00'2:`\00\149x\00\198w\00591,\EC\01\17;\A7\00592,\02\01T;\0Ashl\9A\06493, \00\833;\0Aadd.s\19\00$4,Q\00\01'\00\09f\00\00\87\05\01$\00\95];\0Amul.lo7\00$6,\22\00J2068S\00$7,\BB\00\01*\00\09\A1\00%8,\D0\01\0A\A1\00$9, 
\00\192N\00H100,V\00\129\A2\00\02,\02\123\A1\00\89100+1032$\01E101,\B1\02\09l\005102\0E\01\0Cm\00%3,<\00\02*\00\08o\00\144o\00\153\EE\01\12t\DA\003p4,\90\00\00'\00\01\EC\01\164\EC\01\1B7\EB\01\133\EB\01)3:\B0\00\1F4\EC\01\02?105\ED\01\03?106\EE\01\04E107,\22\00\1B3\E2\00%8,V\00\02+\00\0Am\00\149\E4\00\1D8\F5\01\101e\08\04%\00\0D\F7\01\101\AE\08\04\C8\00\02.\00\08<\01\185\F9\01$ad\C6\00\02\99\06\00\1E\00\02\19\07\00N\00\05\E1\00\024\07*16\DD\00\04,\07\1E1\BF\01\04&\07\04\8B\00(13\83\00\137\DB\00/14.\02\00/15.\02\05&16M\01\0Cp\00%7,=\00\02+\00\08p\00\148p\00\147/\02#le/\02#5,\91\00\00'\00\01/\02\1F5/\02\07\134/\02\194/\02/18/\02\03/19/\02\04/20/\02\05\03\85\07\01\22\00\0B/\02\03\7F\07\141\E6\07)21m\00\142\82\03.22/\02\03}\07\01%\00\0E/\02\03~\07#11\91\02*24\AE\00\1F6*\04\04\131y\07.12\90\01\03s\07\05\\\00*27T\00\04\DD\02J28+46\01\05\E1\05-16\CA\05%6,>\00\02+\00\01\B1\01\166\B1\01\0C\CB\05\135\B1\01\195\B1\01/31\B1\01\03/32\B1\01\04/33\B1\01\05534,\22\00\0B\B1\01535,V\00\02+\00\0Am\00\136\03\01.35\B1\01537,%\00\0E\B1\01538,\C8\00\02.\00\0A\AE\00\1F9\B1\01\05540,\22\00\0B\B1\01941,\\\00)40T\00#42\AE\00,41\B1\01543,\D7\08\0BT\00%4,\22\00\04\02\01\05x\08\01 
\00\03x\08+42\B8\01\136\B8\01*6:\18\00\137\18\00\197\D0\01/45\D0\01\03546,Y\09\0A\CE\00\1F7\D0\01\05\144\FF\00\1D7\D0\01549,V\00\02+\00\09m\00$50\22\01\1E9\D0\01551,%\00\0E\D0\01552,\C8\00\02.\00\09\AE\00/53\D0\01\05554,\22\00\0B\D0\01955,\\\00(54\11\05\04\7F\03/55\81\05\00556,H\0A\0Bp\00\167\1E\01\0Cp\00%8,=\00\02+\00\07p\00$20p\00\1D8\B0\07#7,\91\00\00'\00\01\D0\03\177\9C\09\0C\19\02\138\01\02\198\01\02/59\01\02\03/60\01\02\04/61\01\02\05562,\22\00\0B\01\02563,V\00\02+\00\0Am\00\04\95\08.63\01\02565,%\00\0E\01\02566,\C8\00\02.\00\08=\01\1F1\B1\07\03\02\11\06\00\1E\00\0F\B1\07\00\116W\01+22\DD\00\03\A5\01\1E6\C0\01969,\8B\00(68\83\00\04]\06/690\02\00/700\02\05&71M\01\0Bp\00572,=\00\02+\00\08p\00\04K\01-72\B1\07#8,\91\00\00'\00\010\02\1F80\02\08\1390\02\1990\02/730\02\03/740\02\04/750\02\05576,\22\00\0B0\02577,V\00\02+\00\0Am\00\04\96\08.770\02579,%\00\0E0\02580,\C8\00\02.\00\09\AE\00/811\04\05\138v\01\1E8\91\01983,\\\00*82T\00\04\93\01,83\01\06/85\B2\07\07%9,>\00\02+\00\01\B2\01\179\B2\01\1C1\F6\0D\140\F7\0D\190\B4\01/86\B4\01\03/87\B4\01\04/88\B4\01\05\148\96\00\1D8\B4\01\03,\0D#18\BC\01)89m\00#91\06\01.90\B4\01\00\C6\0C\04%\00\0E\B4\01\03\F9\0D#181\02\1A9`\01/94\B4\01\05\00\DB\0D\04\22\00\0B\B4\01\03\FD\0D\05\\\00*95T\00\04h\0B,96\B4\01\05\FF\0D;48]T\00%9,\22\00\04\02\01\08\B5\07$99u\10\0C\1E\0B\151\B1\0F\1B1\B7\07$12\1A\00\D82:\0Abar.sync 0\F2\03\09\96\0C\01D\02\14n\F2\03\02\ED\0C\00\22\00\02 \06'10?\02\1C4\85\00\04\D3\0D\191\D4\0D8200\A6\08\06'\02/20\0B\06\05\132\9D\0E-20\0B\06\132\9E\0E\132\13\0F\142\9E\0E\04m\00\03-\03\112\A0\0E\08\19\00\09B\0E\09R\00%6,\22\00\04R\00\07w\01#20\EC\11:204R\00\187\C9\01\08\D8\00\1F8\D8\00\06%9,\22\00\0B\D8\00\03X\0E\132\B7\0E)20\FF\023211\D8\00\04\FA\16\05\19\00\182\85\03\09R\00%3,\22\00\04R\00\08\D8\00$13\DC\12\0D\09\04\04n\0D*145\02\0A&\00\04\E3\0B715:O\02\186&\12\074\07#7,\1E\00\1F1\83\12\02/27\84\12\05\186N\02/32\8C\04\02/33@\0C\03/34\8B\01\04\03#\0C\00 
\00\0A\89\01\00\F8\0B\03Q\00\01'\00\09f\00\03\D6\03-36\83\04\03\BD\0B\00\22\00\0D\81\04\00\F4\0B\03\BB\00\01*\00\08\A1\00/40}\04\04\03\16\0C\00 \00\0A{\04\00\10\0C\06U\00\184\BD\07\03\FC\02.42+\08/43\DB\0F\04%44\09\01\0Bh\00$5,9\00\01'\00\07h\00\03\14\13-451\04\222,\87\00\22%ro\06\1720\04\1C8\E1\01\04S\0C\191T\0C\1F4n\06\03/47\E2\01\03\1F4m\03\05\035\0C\1D4k\03\00\0A\0C\03Q\00\01'\00\08f\00\135g\03\1E5e\06\03\CF\0B\1F5c\06\01\00\06\0C\03\BB\00\01*\00\08\A1\00\1F5_\06\05\03(\0C\1D5z\01\03\C9\0B\03U\00\185\92\0A\03\BA\0E\1C5V\06\00\1F\0C\04i\16\0Ae\00\158\06\01\0Be\00$9,9\00\01'\00\07\95\03\00\1D\00\01\94\03\0C\D4\0F$18\B2\01\1E8M\06\03\DF\1A\0F\AD\15\03/61+\08\03\1F6\AD\15\05\03\F2\0B\1D6\AD\15\00\C7\0B\03Q\00\01'\00\09f\00\03\99\02\1E6\AD\15\03]\0B\1F6\AD\15\01\02\B2\0B\13d]\0C\196\AD\15\1F6\AD\15\05\03\B6\0B\1D6Y\01\03W\0B\04U\00\08\AC\15\03?\04\1F7\AA\15\00/71\CA\0B\04%72\09\01\0Bh\00$3,9\00\01'\00\07h\00\03\DF\0A-73\A0\03\223,\87\00\22%rr\13\163\A0\03\1D2\E0\05\04\C2\0B\191\C3\0B\1F7\A3\15\03/75\E2\01\03\1F7\A1\15\05\03\A4\0B\1D7\9F\15\00y\0B\03Q\00\01'\00\09f\00\03?\0F\1E7\9A\15\03>\0B\1F7\98\15\01\00u\0B\03\BB\00\01*\00\08\A1\00/82\E2\01\04\03\97\0B\1D8z\01\03S\0B\03U\00\188`\15\03\9F\00+84\A0\03/85\A0\03\04%86\06\01\0Be\00$7,9\00\01'\00\08D\10\03\D4\0F\118T\15$ubG\16\221,\82\00\00\22\00\09`\16#2,\1F\00\09\B9\14\05\A6\0B\1B8:\08\03\A5\0B%88{\00\07\1B\04\128\1B\04\0D\EF\0D\142=\0C\C020:\0Aret;\0A\0A}\0A\00\00\00\00\00\00", section ".nv_fatbin", align 8 @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([26033 x i8], [26033 x i8]* @0, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8 @__cuda_gpubin_handle = internal global i8** null, align 8 @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }] ; Function Attrs: noinline optnone uwtable define dso_local void 
@findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
; Host-side stub for findRangeK: it spills its eleven arguments to stack slots,
; builds an array of pointers to those slots, pops the launch configuration
; that was pushed by the caller, and forwards everything to cudaLaunchKernel
; using this function's own address as the kernel handle. It returns void and
; does not inspect the result of either runtime call.
entry:
  ; One stack slot per incoming argument (the slots are what kernel_args points at).
  %height.addr = alloca i64, align 8
  %knodesD.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %currKnodeD.addr = alloca i64*, align 8
  %offsetD.addr = alloca i64*, align 8
  %lastKnodeD.addr = alloca i64*, align 8
  %offset_2D.addr = alloca i64*, align 8
  %startD.addr = alloca i32*, align 8
  %endD.addr = alloca i32*, align 8
  %RecstartD.addr = alloca i32*, align 8
  %ReclenD.addr = alloca i32*, align 8
  ; Out-parameters filled in by __cudaPopCallConfiguration below.
  %grid_dim = alloca %struct.dim3, align 8
  %block_dim = alloca %struct.dim3, align 8
  %shmem_size = alloca i64, align 8
  %stream = alloca i8*, align 8
  ; Scratch copies used to reload each 12-byte dim3 as an { i64, i32 } pair,
  ; which is the shape cudaLaunchKernel's declaration takes them in.
  %grid_dim.coerce = alloca { i64, i32 }, align 8
  %block_dim.coerce = alloca { i64, i32 }, align 8
  ; Spill the arguments.
  store i64 %height, i64* %height.addr, align 8
  store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
  store i64* %offsetD, i64** %offsetD.addr, align 8
  store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
  store i64* %offset_2D, i64** %offset_2D.addr, align 8
  store i32* %startD, i32** %startD.addr, align 8
  store i32* %endD, i32** %endD.addr, align 8
  store i32* %RecstartD, i32** %RecstartD.addr, align 8
  store i32* %ReclenD, i32** %ReclenD.addr, align 8
  ; kernel_args[i] = address of the i-th argument slot, in declaration order (11 entries).
  %kernel_args = alloca i8*, i64 11, align 16
  %0 = bitcast i64* %height.addr to i8*
  %1 = getelementptr i8*, i8** %kernel_args, i32 0
  store i8* %0, i8** %1
  %2 = bitcast %struct.knode** %knodesD.addr to i8*
  %3 = getelementptr i8*, i8** %kernel_args, i32 1
  store i8* %2, i8** %3
  %4 = bitcast i64* %knodes_elem.addr to i8*
  %5 = getelementptr i8*, i8** %kernel_args, i32 2
  store i8* %4, i8** %5
  %6 = bitcast i64** %currKnodeD.addr to i8*
  %7 = getelementptr i8*, i8** %kernel_args, i32 3
  store i8* %6, i8** %7
  %8 = bitcast i64** %offsetD.addr to i8*
  %9 = getelementptr i8*, i8** %kernel_args, i32 4
  store i8* %8, i8** %9
  %10 = bitcast i64** %lastKnodeD.addr to i8*
  %11 = getelementptr i8*, i8** %kernel_args, i32 5
  store i8* %10, i8** %11
  %12 = bitcast i64** %offset_2D.addr to i8*
  %13 = getelementptr i8*, i8** %kernel_args, i32 6
  store i8* %12, i8** %13
  %14 = bitcast i32** %startD.addr to i8*
  %15 = getelementptr i8*, i8** %kernel_args, i32 7
  store i8* %14, i8** %15
  %16 = bitcast i32** %endD.addr to i8*
  %17 = getelementptr i8*, i8** %kernel_args, i32 8
  store i8* %16, i8** %17
  %18 = bitcast i32** %RecstartD.addr to i8*
  %19 = getelementptr i8*, i8** %kernel_args, i32 9
  store i8* %18, i8** %19
  %20 = bitcast i32** %ReclenD.addr to i8*
  %21 = getelementptr i8*, i8** %kernel_args, i32 10
  store i8* %20, i8** %21
  ; Retrieve grid/block dimensions, shared-memory size and stream; the i32
  ; result %22 is never read.
  %22 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
  %23 = load i64, i64* %shmem_size, align 8
  %24 = load i8*, i8** %stream, align 8
  ; Copy grid_dim (12 bytes) into its coerce slot and reload it as i64 + i32.
  %25 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
  %26 = bitcast %struct.dim3* %grid_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %25, i8* align 8 %26, i64 12, i1 false)
  %27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
  %28 = load i64, i64* %27, align 8
  %29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
  %30 = load i32, i32* %29, align 8
  ; Same for block_dim.
  %31 = bitcast { i64, i32 }* %block_dim.coerce to i8*
  %32 = bitcast %struct.dim3* %block_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false)
  %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
  %34 = load i64, i64* %33, align 8
  %35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
  %36 = load i32, i32* %35, align 8
  %37 = bitcast i8* %24 to %struct.CUstream_st*
  ; Launch: the first operand is this stub's own address cast to i8*
  ; (presumably the key the CUDA runtime uses to find the device kernel
  ; registered from the embedded fatbin — confirm against the module ctor).
  ; The operand list of this call continues on the following source line.
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*,
i32*, i32*)* @findRangeK to i8*), i64 %28, i32 %30, i64 %34, i32 %36, i8** %kernel_args, i64 %23, %struct.CUstream_st* %37)
  br label %setup.end

setup.end:                                        ; preds = %entry
  ret void
}

declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)

declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

; kernel_gpu_cuda_wrapper_2: host driver that runs findRangeK once and prints
; a per-stage timing report.
;
; Stages, each bracketed by a get_time() sample stored in %time0..%time6:
;   time0 -> time1  cudaThreadSynchronize, choose launch shape, print it
;   time1 -> time2  nine cudaMalloc calls
;   time2 -> time3  nine cudaMemcpy calls with kind 1 (into the allocations)
;   time3 -> time4  push launch configuration, call findRangeK, synchronize
;   time4 -> time5  two cudaMemcpy calls with kind 2 (back into recstart / reclength)
;   time5 -> time6  nine cudaFree calls
;
; Parameters as used below:
;   %knodes / %knodes_mem   source buffer and byte count for the knode copy
;   %knodes_elem, %maxheight  forwarded unchanged to findRangeK
;   %order                  threads per block, capped at 1024
;   %count                  number of blocks, and element count for every per-query array
;   %currKnode, %offset, %lastKnode, %offset_2   i64 arrays, copied in as count * 8 bytes
;   %start, %end            i32 arrays, copied in as count * 4 bytes
;   %recstart, %reclength   i32 arrays, copied in before the launch and overwritten
;                           with the results after it (count * 4 bytes each)
;
; The i32 results of cudaMalloc / cudaMemcpy / cudaFree / cudaThreadSynchronize
; are never read here; the only error handling visible is the checkCUDAError
; call after each step (its behaviour is defined outside this chunk).
; NOTE(review): cudaThreadSynchronize is, as far as I know, deprecated in the
; CUDA runtime in favour of cudaDeviceSynchronize — change it in the .cu
; source, not in this generated IR.
; Function Attrs: noinline optnone uwtable
define dso_local void @kernel_gpu_cuda_wrapper_2(%struct.knode* %knodes, i64 %knodes_elem, i64 %knodes_mem, i32 %order, i64 %maxheight, i32 %count, i64* %currKnode, i64* %offset, i64* %lastKnode, i64* %offset_2, i32* %start, i32* %end, i32* %recstart, i32* %reclength) #0 {
entry:
  ; Argument spill slots.
  %knodes.addr = alloca %struct.knode*, align 8
  %knodes_elem.addr = alloca i64, align 8
  %knodes_mem.addr = alloca i64, align 8
  %order.addr = alloca i32, align 4
  %maxheight.addr = alloca i64, align 8
  %count.addr = alloca i32, align 4
  %currKnode.addr = alloca i64*, align 8
  %offset.addr = alloca i64*, align 8
  %lastKnode.addr = alloca i64*, align 8
  %offset_2.addr = alloca i64*, align 8
  %start.addr = alloca i32*, align 8
  %end.addr = alloca i32*, align 8
  %recstart.addr = alloca i32*, align 8
  %reclength.addr = alloca i32*, align 8
  ; Timestamps delimiting the six stages.
  %time0 = alloca i64, align 8
  %time1 = alloca i64, align 8
  %time2 = alloca i64, align 8
  %time3 = alloca i64, align 8
  %time4 = alloca i64, align 8
  %time5 = alloca i64, align 8
  %time6 = alloca i64, align 8
  ; Launch shape.
  %numBlocks = alloca i32, align 4
  %threadsPerBlock = alloca i32, align 4
  ; Pointers written by cudaMalloc.
  %knodesD = alloca %struct.knode*, align 8
  %currKnodeD = alloca i64*, align 8
  %offsetD = alloca i64*, align 8
  %lastKnodeD = alloca i64*, align 8
  %offset_2D = alloca i64*, align 8
  %startD = alloca i32*, align 8
  %endD = alloca i32*, align 8
  %ansDStart = alloca i32*, align 8
  %ansDLength = alloca i32*, align 8
  ; dim3 temporaries for grid (%agg.tmp) and block (%agg.tmp54), plus the
  ; { i64, i32 } scratch slots used to pass each one as two scalars.
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp54 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp54.coerce = alloca { i64, i32 }, align 4
  store %struct.knode* %knodes, %struct.knode** %knodes.addr, align 8
  store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
  store i64 %knodes_mem, i64* %knodes_mem.addr, align 8
  store i32 %order, i32* %order.addr, align 4
  store i64 %maxheight, i64* %maxheight.addr, align 8
  store i32 %count, i32* %count.addr, align 4
  store i64* %currKnode, i64** %currKnode.addr, align 8
  store i64* %offset, i64** %offset.addr, align 8
  store i64* %lastKnode, i64** %lastKnode.addr, align 8
  store i64* %offset_2, i64** %offset_2.addr, align 8
  store i32* %start, i32** %start.addr, align 8
  store i32* %end, i32** %end.addr, align 8
  store i32* %recstart, i32** %recstart.addr, align 8
  store i32* %reclength, i32** %reclength.addr, align 8
  ; ---- stage 0 (time0 -> time1): initial synchronize and launch-shape selection ----
  %call = call i64 @get_time()
  store i64 %call, i64* %time0, align 8
  %call1 = call i32 @cudaThreadSynchronize()
  ; numBlocks = count
  %0 = load i32, i32* %count.addr, align 4
  store i32 %0, i32* %numBlocks, align 4
  ; threadsPerBlock = (order < 1024) ? order : 1024   (signed compare)
  %1 = load i32, i32* %order.addr, align 4
  %cmp = icmp slt i32 %1, 1024
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %2 = load i32, i32* %order.addr, align 4
  br label %cond.end

cond.false:                                       ; preds = %entry
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %2, %cond.true ], [ 1024, %cond.false ]
  store i32 %cond, i32* %threadsPerBlock, align 4
  %3 = load i32, i32* %numBlocks, align 4
  %4 = load i32, i32* %threadsPerBlock, align 4
  ; Prints "# of blocks = %d, # of threads/block = %d (ensure that device can handle)".
  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @.str, i64 0, i64 0), i32 %3, i32 %4)
  ; ---- stage 1 (time1 -> time2): allocations ----
  %call3 = call i64 @get_time()
  store i64 %call3, i64* %time1, align 8
  ; knodesD: knodes_mem bytes (the error label string is "cudaMalloc recordsD").
  %5 = bitcast %struct.knode** %knodesD to i8**
  %6 = load i64, i64* %knodes_mem.addr, align 8
  %call4 = call i32 @cudaMalloc(i8** %5, i64 %6)
  call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i64 0, i64 0))
  ; Four i64 arrays: sext(count) * 8 bytes each.
  %7 = bitcast i64** %currKnodeD to i8**
  %8 = load i32, i32* %count.addr, align 4
  %conv = sext i32 %8 to i64
  %mul = mul i64 %conv, 8
  %call5 = call i32 @cudaMalloc(i8** %7, i64 %mul)
  call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.2, i64 0, i64 0))
  %9 = bitcast i64** %offsetD to i8**
  %10 = load i32, i32* %count.addr, align 4
  %conv6 = sext i32 %10 to i64
  %mul7 = mul i64 %conv6, 8
  %call8 = call i32 @cudaMalloc(i8** %9, i64 %mul7)
  call void @checkCUDAError(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.3, i64 0, i64 0))
  %11 = bitcast i64** %lastKnodeD to i8**
  %12 = load i32, i32* %count.addr, align 4
  %conv9 = sext i32 %12 to i64
  %mul10 = mul i64 %conv9, 8
  %call11 = call i32 @cudaMalloc(i8** %11, i64 %mul10)
  call void @checkCUDAError(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.4, i64 0, i64 0))
  %13 = bitcast i64** %offset_2D to i8**
  %14 = load i32, i32* %count.addr, align 4
  %conv12 = sext i32 %14 to i64
  %mul13 = mul i64 %conv12, 8
  %call14 = call i32 @cudaMalloc(i8** %13, i64 %mul13)
  call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.5, i64 0, i64 0))
  ; Four i32 arrays: sext(count) * 4 bytes each.
  %15 = bitcast i32** %startD to i8**
  %16 = load i32, i32* %count.addr, align 4
  %conv15 = sext i32 %16 to i64
  %mul16 = mul i64 %conv15, 4
  %call17 = call i32 @cudaMalloc(i8** %15, i64 %mul16)
  call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.6, i64 0, i64 0))
  %17 = bitcast i32** %endD to i8**
  %18 = load i32, i32* %count.addr, align 4
  %conv18 = sext i32 %18 to i64
  %mul19 = mul i64 %conv18, 4
  %call20 = call i32 @cudaMalloc(i8** %17, i64 %mul19)
  call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.7, i64 0, i64 0))
  %19 = bitcast i32** %ansDStart to i8**
  %20 = load i32, i32* %count.addr, align 4
  %conv21 = sext i32 %20 to i64
  %mul22 = mul i64 %conv21, 4
  %call23 = call i32 @cudaMalloc(i8** %19, i64 %mul22)
  call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.8, i64 0, i64 0))
  %21 = bitcast i32** %ansDLength to i8**
  %22 = load i32, i32* %count.addr, align 4
  %conv24 = sext i32 %22 to i64
  %mul25 = mul i64 %conv24, 4
  %call26 = call i32 @cudaMalloc(i8** %21, i64 %mul25)
  call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.9, i64 0, i64 0))
  ; ---- stage 2 (time2 -> time3): copy inputs into the allocations ----
  ; Every copy here passes kind = 1 (presumably cudaMemcpyHostToDevice —
  ; confirm against the CUDA runtime headers); destination is the cudaMalloc'd
  ; pointer, source is the corresponding function parameter.
  %call27 = call i64 @get_time()
  store i64 %call27, i64* %time2, align 8
  %23 = load %struct.knode*, %struct.knode** %knodesD, align 8
  %24 = bitcast %struct.knode* %23 to i8*
  %25 = load %struct.knode*, %struct.knode** %knodes.addr, align 8
  %26 = bitcast %struct.knode* %25 to i8*
  %27 = load i64, i64* %knodes_mem.addr, align 8
  %call28 = call i32 @cudaMemcpy(i8* %24, i8* %26, i64 %27, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.10, i64 0, i64 0))
  %28 = load i64*, i64** %currKnodeD, align 8
  %29 = bitcast i64* %28 to i8*
  %30 = load i64*, i64** %currKnode.addr, align 8
  %31 = bitcast i64* %30 to i8*
  %32 = load i32, i32* %count.addr, align 4
  %conv29 = sext i32 %32 to i64
  %mul30 = mul i64 %conv29, 8
  %call31 = call i32 @cudaMemcpy(i8* %29, i8* %31, i64 %mul30, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.11, i64 0, i64 0))
  %33 = load i64*, i64** %offsetD, align 8
  %34 = bitcast i64* %33 to i8*
  %35 = load i64*, i64** %offset.addr, align 8
  %36 = bitcast i64* %35 to i8*
  %37 = load i32, i32* %count.addr, align 4
  %conv32 = sext i32 %37 to i64
  %mul33 = mul i64 %conv32, 8
  %call34 = call i32 @cudaMemcpy(i8* %34, i8* %36, i64 %mul33, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.12, i64 0, i64 0))
  %38 = load i64*, i64** %lastKnodeD, align 8
  %39 = bitcast i64* %38 to i8*
  %40 = load i64*, i64** %lastKnode.addr, align 8
  %41 = bitcast i64* %40 to i8*
  %42 = load i32, i32* %count.addr, align 4
  %conv35 = sext i32 %42 to i64
  %mul36 = mul i64 %conv35, 8
  %call37 = call i32 @cudaMemcpy(i8* %39, i8* %41, i64 %mul36, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.13, i64 0, i64 0))
  %43 = load i64*, i64** %offset_2D, align 8
  %44 = bitcast i64* %43 to i8*
  %45 = load i64*, i64** %offset_2.addr, align 8
  %46 = bitcast i64* %45 to i8*
  %47 = load i32, i32* %count.addr, align 4
  %conv38 = sext i32 %47 to i64
  %mul39 = mul i64 %conv38, 8
  %call40 = call i32 @cudaMemcpy(i8* %44, i8* %46, i64 %mul39, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.14, i64 0, i64 0))
  %48 = load i32*, i32** %startD, align 8
  %49 = bitcast i32* %48 to i8*
  %50 = load i32*, i32** %start.addr, align 8
  %51 = bitcast i32* %50 to i8*
  %52 = load i32, i32* %count.addr, align 4
  %conv41 = sext i32 %52 to i64
  %mul42 = mul i64 %conv41, 4
  %call43 = call i32 @cudaMemcpy(i8* %49, i8* %51, i64 %mul42, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.15, i64 0, i64 0))
  %53 = load i32*, i32** %endD, align 8
  %54 = bitcast i32* %53 to i8*
  %55 = load i32*, i32** %end.addr, align 8
  %56 = bitcast i32* %55 to i8*
  %57 = load i32, i32* %count.addr, align 4
  %conv44 = sext i32 %57 to i64
  %mul45 = mul i64 %conv44, 4
  %call46 = call i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul45, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.16, i64 0, i64 0))
  ; The two result buffers are also seeded from the caller's recstart / reclength.
  %58 = load i32*, i32** %ansDStart, align 8
  %59 = bitcast i32* %58 to i8*
  %60 = load i32*, i32** %recstart.addr, align 8
  %61 = bitcast i32* %60 to i8*
  %62 = load i32, i32* %count.addr, align 4
  %conv47 = sext i32 %62 to i64
  %mul48 = mul i64 %conv47, 4
  %call49 = call i32 @cudaMemcpy(i8* %59, i8* %61, i64 %mul48, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0))
  %63 = load i32*, i32** %ansDLength, align 8
  %64 = bitcast i32* %63 to i8*
  %65 = load i32*, i32** %reclength.addr, align 8
  %66 = bitcast i32* %65 to i8*
  %67 = load i32, i32* %count.addr, align 4
  %conv50 = sext i32 %67 to i64
  %mul51 = mul i64 %conv50, 4
  %call52 = call i32 @cudaMemcpy(i8* %64, i8* %66, i64 %mul51, i32 1)
  call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0))
  ; ---- stage 3 (time3 -> time4): kernel launch ----
  %call53 = call i64 @get_time()
  store i64 %call53, i64* %time3, align 8
  ; grid = dim3(numBlocks, 1, 1), block = dim3(threadsPerBlock, 1, 1)
  %68 = load i32, i32* %numBlocks, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 %68, i32 1, i32 1)
  %69 = load i32, i32* %threadsPerBlock, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp54, i32 %69, i32 1, i32 1)
  ; Reload each 12-byte dim3 as an i64 + i32 pair for the call below.
  %70 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %71 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %70, i8* align 4 %71, i64 12, i1 false)
  %72 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %73 = load i64, i64* %72, align 4
  %74 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %75 = load i32, i32* %74, align 4
  %76 = bitcast { i64, i32 }* %agg.tmp54.coerce to i8*
  %77 = bitcast %struct.dim3* %agg.tmp54 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %76, i8* align 4 %77, i64 12, i1 false)
  %78 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 0
  %79 = load i64, i64* %78, align 4
  %80 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp54.coerce, i32 0, i32 1
  %81 = load i32, i32* %80, align 4
  ; Push the configuration with the last two operands 0 and null (these line
  ; up with the shmem_size / stream outputs that the findRangeK stub pops).
  ; A non-zero result skips the kernel call entirely and falls through to
  ; kcall.end.
  %call55 = call i32 @__cudaPushCallConfiguration(i64 %73, i32 %75, i64 %79, i32 %81, i64 0, i8* null)
  %tobool = icmp ne i32 %call55, 0
  br i1 %tobool, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %cond.end
  ; findRangeK(maxheight, knodesD, knodes_elem, currKnodeD, offsetD,
  ;            lastKnodeD, offset_2D, startD, endD, ansDStart, ansDLength)
  %82 = load i64, i64* %maxheight.addr, align 8
  %83 = load %struct.knode*, %struct.knode** %knodesD, align 8
  %84 = load i64, i64* %knodes_elem.addr, align 8
  %85 = load i64*, i64** %currKnodeD, align 8
  %86 = load i64*, i64** %offsetD, align 8
  %87 = load i64*, i64** %lastKnodeD, align 8
  %88 = load i64*, i64** %offset_2D, align 8
  %89 = load i32*, i32** %startD, align 8
  %90 = load i32*, i32** %endD, align 8
  %91 = load i32*, i32** %ansDStart, align 8
  %92 = load i32*, i32** %ansDLength, align 8
  call void @findRangeK(i64 %82, %struct.knode* %83, i64 %84, i64* %85, i64* %86, i64* %87, i64* %88, i32* %89, i32* %90, i32* %91, i32* %92)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %cond.end
  ; Synchronize before taking time4 so the kernel stage is fully accounted for.
  %call56 = call i32 @cudaThreadSynchronize()
  call void @checkCUDAError(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0))
  ; ---- stage 4 (time4 -> time5): copy results back ----
  ; kind = 2 (presumably cudaMemcpyDeviceToHost — confirm); the direction is
  ; the reverse of stage 2: destination is the caller's buffer. The error
  ; labels reuse @.str.17 / @.str.18 from the copy-in stage.
  %call57 = call i64 @get_time()
  store i64 %call57, i64* %time4, align 8
  %93 = load i32*, i32** %recstart.addr, align 8
  %94 = bitcast i32* %93 to i8*
  %95 = load i32*, i32** %ansDStart, align 8
  %96 = bitcast i32* %95 to i8*
  %97 = load i32, i32* %count.addr, align 4
  %conv58 = sext i32 %97 to i64
  %mul59 = mul i64 %conv58, 4
  %call60 = call i32 @cudaMemcpy(i8* %94, i8* %96, i64 %mul59, i32 2)
  call void @checkCUDAError(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0))
  %98 = load i32*, i32** %reclength.addr, align 8
  %99 = bitcast i32* %98 to i8*
  %100 = load i32*, i32** %ansDLength, align 8
  %101 = bitcast i32* %100 to i8*
  %102 = load i32, i32* %count.addr, align 4
  %conv61 = sext i32 %102 to i64
  %mul62 = mul i64 %conv61, 4
  %call63 = call i32 @cudaMemcpy(i8* %99, i8* %101, i64 %mul62, i32 2)
  call void @checkCUDAError(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.18, i64 0, i64 0))
  ; ---- stage 5 (time5 -> time6): release all nine allocations ----
  ; No checkCUDAError call follows the frees.
  %call64 = call i64 @get_time()
  store i64 %call64, i64* %time5, align 8
  %103 = load %struct.knode*, %struct.knode** %knodesD, align 8
  %104 = bitcast %struct.knode* %103 to i8*
  %call65 = call i32 @cudaFree(i8* %104)
  %105 = load i64*, i64** %currKnodeD, align 8
  %106 = bitcast i64* %105 to i8*
  %call66 = call i32 @cudaFree(i8* %106)
  %107 = load i64*, i64** %offsetD, align 8
  %108 = bitcast i64* %107 to i8*
  %call67 = call i32 @cudaFree(i8* %108)
  %109 = load i64*, i64** %lastKnodeD, align 8
  %110 = bitcast i64* %109 to i8*
  %call68 = call i32 @cudaFree(i8* %110)
  %111 = load i64*, i64** %offset_2D, align 8
  %112 = bitcast i64* %111 to i8*
  %call69 = call i32 @cudaFree(i8* %112)
  %113 = load i32*, i32** %startD, align 8
  %114 = bitcast i32* %113 to i8*
  %call70 = call i32 @cudaFree(i8* %114)
  %115 = load i32*, i32** %endD, align 8
  %116 = bitcast i32* %115 to i8*
  %call71 = call i32 @cudaFree(i8* %116)
  %117 = load i32*, i32** %ansDStart, align 8
  %118 = bitcast i32* %117 to i8*
  %call72 = call i32 @cudaFree(i8* %118)
  %119 = load i32*, i32** %ansDLength, align 8
  %120 = bitcast i32* %119 to i8*
  %call73 = call i32 @cudaFree(i8* %120)
  ; ---- timing report ----
  ; Each line prints (tN+1 - tN) / 1e6 and 100 * (tN+1 - tN) / (time6 - time0).
  ; The divisor 1e6 with an "s" label suggests get_time() returns
  ; microseconds — TODO confirm. All arithmetic is done in single-precision
  ; float and only widened to double for printf, so the 12 decimals requested
  ; by "%15.12f" exceed the available precision.
  ; NOTE(review): the format strings @.str.21..@.str.26 contain a bare '%'
  ; followed by " :"; that looks like it was meant to be "%%" — fix in the
  ; .cu source.
  %call74 = call i64 @get_time()
  store i64 %call74, i64* %time6, align 8
  %call75 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.20, i64 0, i64 0))
  ; stage 0: time1 - time0
  %121 = load i64, i64* %time1, align 8
  %122 = load i64, i64* %time0, align 8
  %sub = sub nsw i64 %121, %122
  %conv76 = sitofp i64 %sub to float
  %div = fdiv float %conv76, 1.000000e+06
  %conv77 = fpext float %div to double
  %123 = load i64, i64* %time1, align 8
  %124 = load i64, i64* %time0, align 8
  %sub78 = sub nsw i64 %123, %124
  %conv79 = sitofp i64 %sub78 to float
  %125 = load i64, i64* %time6, align 8
  %126 = load i64, i64* %time0, align 8
  %sub80 = sub nsw i64 %125, %126
  %conv81 = sitofp i64 %sub80 to float
  %div82 = fdiv float %conv79, %conv81
  %mul83 = fmul contract float %div82, 1.000000e+02
  %conv84 = fpext float %mul83 to double
  %call85 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.21, i64 0, i64 0), double %conv77, double %conv84)
  ; stage 1: time2 - time1
  %127 = load i64, i64* %time2, align 8
  %128 = load i64, i64* %time1, align 8
  %sub86 = sub nsw i64 %127, %128
  %conv87 = sitofp i64 %sub86 to float
  %div88 = fdiv float %conv87, 1.000000e+06
  %conv89 = fpext float %div88 to double
  %129 = load i64, i64* %time2, align 8
  %130 = load i64, i64* %time1, align 8
  %sub90 = sub nsw i64 %129, %130
  %conv91 = sitofp i64 %sub90 to float
  %131 = load i64, i64* %time6, align 8
  %132 = load i64, i64* %time0, align 8
  %sub92 = sub nsw i64 %131, %132
  %conv93 = sitofp i64 %sub92 to float
  %div94 = fdiv float %conv91, %conv93
  %mul95 = fmul contract float %div94, 1.000000e+02
  %conv96 = fpext float %mul95 to double
  %call97 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.22, i64 0, i64 0), double %conv89, double %conv96)
  ; stage 2: time3 - time2
  %133 = load i64, i64* %time3, align 8
  %134 = load i64, i64* %time2, align 8
  %sub98 = sub nsw i64 %133, %134
  %conv99 = sitofp i64 %sub98 to float
  %div100 = fdiv float %conv99, 1.000000e+06
  %conv101 = fpext float %div100 to double
  %135 = load i64, i64* %time3, align 8
  %136 = load i64, i64* %time2, align 8
  %sub102 = sub nsw i64 %135, %136
  %conv103 = sitofp i64 %sub102 to float
  %137 = load i64, i64* %time6, align 8
  %138 = load i64, i64* %time0, align 8
  %sub104 = sub nsw i64 %137, %138
  %conv105 = sitofp i64 %sub104 to float
  %div106 = fdiv float %conv103, %conv105
  %mul107 = fmul contract float %div106, 1.000000e+02
  %conv108 = fpext float %mul107 to double
  %call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.23, i64 0, i64 0), double %conv101, double %conv108)
  ; stage 3: time4 - time3
  %139 = load i64, i64* %time4, align 8
  %140 = load i64, i64* %time3, align 8
  %sub110 = sub nsw i64 %139, %140
  %conv111 = sitofp i64 %sub110 to float
  %div112 = fdiv float %conv111, 1.000000e+06
  %conv113 = fpext float %div112 to double
  %141 = load i64, i64* %time4, align 8
  %142 = load i64, i64* %time3, align 8
  %sub114 = sub nsw i64 %141, %142
  %conv115 = sitofp i64 %sub114 to float
  %143 = load i64, i64* %time6, align 8
  %144 = load i64, i64* %time0, align 8
  %sub116 = sub nsw i64 %143, %144
  %conv117 = sitofp i64 %sub116 to float
  %div118 = fdiv float %conv115, %conv117
  %mul119 = fmul contract float %div118, 1.000000e+02
  %conv120 = fpext float %mul119 to double
  %call121 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.24, i64 0, i64 0), double %conv113, double %conv120)
  ; stage 4: time5 - time4
  %145 = load i64, i64* %time5, align 8
  %146 = load i64, i64* %time4, align 8
  %sub122 = sub nsw i64 %145, %146
  %conv123 = sitofp i64 %sub122 to float
  %div124 = fdiv float %conv123, 1.000000e+06
  %conv125 = fpext float %div124 to double
  %147 = load i64, i64* %time5, align 8
  %148 = load i64, i64* %time4, align 8
  %sub126 = sub nsw i64 %147, %148
  %conv127 = sitofp i64 %sub126 to float
  %149 = load i64, i64* %time6, align 8
  %150 = load i64, i64* %time0, align 8
  %sub128 = sub nsw i64 %149, %150
  %conv129 = sitofp i64 %sub128 to float
  %div130 = fdiv float %conv127, %conv129
  %mul131 = fmul contract float %div130, 1.000000e+02
  %conv132 = fpext float %mul131 to double
  %call133 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.25, i64 0, i64 0), double %conv125, double %conv132)
  ; stage 5: time6 - time5
  %151 = load i64, i64* %time6, align 8
  %152 = load i64, i64* %time5, align 8
  %sub134 = sub nsw i64 %151, %152
  %conv135 = sitofp i64 %sub134 to float
  %div136 = fdiv float %conv135, 1.000000e+06
  %conv137 = fpext float %div136 to double
  %153 = load i64, i64* %time6, align 8
  %154 = load i64, i64* %time5, align 8
  %sub138 = sub nsw i64 %153, %154
  %conv139 = sitofp i64 %sub138 to float
  %155 = load i64, i64* %time6, align 8
  %156 = load i64, i64* %time0, align 8
  %sub140 = sub nsw i64 %155, %156
  %conv141 = sitofp i64 %sub140 to float
  %div142 = fdiv float %conv139, %conv141
  %mul143 = fmul contract float %div142, 1.000000e+02
  %conv144 = fpext float %mul143 to double
  %call145 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.26, i64 0, i64 0), double %conv137, double %conv144)
  ; total: time6 - time0
  %call146 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.27, i64 0, i64 0))
  %157 = load i64, i64* %time6, align 8
  %158 = load i64, i64* %time0, align 8
  %sub147 = sub nsw i64 %157, %158
  %conv148 = sitofp i64 %sub147 to float
  %div149 = fdiv float %conv148, 1.000000e+06
  %conv150 = fpext float %div149 to double
  %call151 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.28, i64 0, i64 0), double %conv150)
  ret void
}

declare dso_local i64 @get_time() #2

declare dso_local i32 @cudaThreadSynchronize() #2

declare dso_local i32 @printf(i8*, ...)
#2 ; attribute-group reference that completes the @printf declaration begun on the preceding line

; ---------------------------------------------------------------------------
; External functions called by the host wrapper (definitions live elsewhere).
; ---------------------------------------------------------------------------
declare dso_local i32 @cudaMalloc(i8**, i64) #2

declare dso_local void @checkCUDAError(i8*) #2

declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2

declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2

; ---------------------------------------------------------------------------
; dim3::dim3(unsigned, unsigned, unsigned)  (Itanium-mangled, C2 = base-object
; constructor).  Spills every argument to a stack slot, reloads `this`, and
; then writes %vx, %vy, %vz into struct fields 0, 1 and 2 respectively.
; Emitted linkonce_odr in its own comdat so duplicate copies can be merged.
; ---------------------------------------------------------------------------
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #3 comdat align 2 {
entry:
  ; One stack slot per incoming argument.
  %self.slot = alloca %struct.dim3*, align 8
  %vx.slot = alloca i32, align 4
  %vy.slot = alloca i32, align 4
  %vz.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %self.slot, align 8
  store i32 %vx, i32* %vx.slot, align 4
  store i32 %vy, i32* %vy.slot, align 4
  store i32 %vz, i32* %vz.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %self.slot, align 8

  ; field 0 <- vx
  %vx.val = load i32, i32* %vx.slot, align 4
  %x.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  store i32 %vx.val, i32* %x.field, align 4

  ; field 1 <- vy
  %vy.val = load i32, i32* %vy.slot, align 4
  %y.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  store i32 %vy.val, i32* %y.field, align 4

  ; field 2 <- vz
  %vz.val = load i32, i32* %vz.slot, align 4
  %z.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  store i32 %vz.val, i32* %z.field, align 4

  ret void
}

declare dso_local i32 @cudaFree(i8*) #2

; ---------------------------------------------------------------------------
; Registers the host-side stub @findRangeK with the CUDA runtime for the fat
; binary identified by the incoming handle.  The same string (@.str.19) is
; passed for both name arguments; its contents are defined outside this
; section (presumably the kernel's name -- confirm against the definition).
; The i32 returned by the registration call is not inspected.
; ---------------------------------------------------------------------------
define internal void @__cuda_register_globals(i8** %fatbin.handle) {
entry:
  %reg.status = call i32 @__cudaRegisterFunction(i8** %fatbin.handle, i8* bitcast (void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK to i8*), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.19, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; ---------------------------------------------------------------------------
; Module constructor: registers @__cuda_fatbin_wrapper, stores the returned
; handle in @__cuda_gpubin_handle, registers this module's globals against
; that handle, signals end-of-registration, and finally schedules
; @__cuda_module_dtor through atexit.  The i8* argument is never read and the
; atexit result is not inspected.
; ---------------------------------------------------------------------------
define internal void @__cuda_module_ctor(i8* %unused) {
entry:
  %gpubin = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %gpubin, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %gpubin)
  call void @__cudaRegisterFatBinaryEnd(i8** %gpubin)
  %atexit.status = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

declare dso_local void @__cudaUnregisterFatBinary(i8**)

; ---------------------------------------------------------------------------
; Module destructor: reloads the handle saved in @__cuda_gpubin_handle and
; hands it to __cudaUnregisterFatBinary.  The i8* argument is never read.
; ---------------------------------------------------------------------------
define internal void @__cuda_module_dtor(i8* %unused) {
entry:
  %gpubin = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %gpubin)
  ret void
}

declare dso_local i32 @atexit(void (i8*)*)

; ---------------------------------------------------------------------------
; Attribute groups referenced as #0..#3 throughout the module.
; ---------------------------------------------------------------------------
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

; ---------------------------------------------------------------------------
; Module-level metadata.
; ---------------------------------------------------------------------------
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}