; ModuleID = 'gaussian-host-x86_64-unknown-linux-gnu.bc' source_filename = "gaussian.cu" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } %struct.timeval = type { i64, i64 } %struct.timezone = type { i32, i32 } %struct.cudaDeviceProp = type { [256 x i8], %struct.CUuuid_st, [8 x i8], i32, i64, i64, i32, i32, i64, i32, [3 x i32], [3 x i32], i32, i64, i32, i32, i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], [2 x i32], [3 x i32], [2 x i32], [3 x i32], [3 x i32], i32, [2 x i32], [3 x i32], [2 x i32], i32, [2 x i32], [3 x i32], [2 x i32], [3 x i32], i32, [2 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32 } %struct.CUuuid_st = type { [16 x i8] } %struct.dim3 = type { i32, i32, i32 } %struct.CUstream_st = type opaque $_ZSt3expf = comdat any $_ZN4dim3C2Ejjj = comdat any @Size = dso_local global i32 0, align 4 @a = dso_local global float* null, align 8 @b = dso_local global float* null, align 8 @finalVec = dso_local global float* null, align 8 @m = dso_local global float* null, align 8 @fp = dso_local global %struct._IO_FILE* null, align 8 @totalKernelTime = dso_local global i32 0, align 4 @.str = private unnamed_addr constant [56 x i8] c"WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\0A\00", align 1 @.str.1 = private unnamed_addr constant [45 x i8] c"Usage: gaussian -f filename / -s size [-q]\0A\0A\00", align 1 @.str.2 = private unnamed_addr constant [62 x i8] c"-q (quiet) suppresses printing the matrix and result values.\0A\00", align 1 @.str.3 = private 
unnamed_addr constant [34 x i8] c"-f (filename) path of input file\0A\00", align 1 @.str.4 = private unnamed_addr constant [66 x i8] c"-s (size) size of matrix. Create matrix and rhs in this program \0A\00", align 1 @.str.5 = private unnamed_addr constant [68 x i8] c"The first line of the file contains the dimension of the matrix, n.\00", align 1 @.str.6 = private unnamed_addr constant [43 x i8] c"The second line of the file is a newline.\0A\00", align 1 @.str.7 = private unnamed_addr constant [64 x i8] c"The next n lines contain n tab separated values for the matrix.\00", align 1 @.str.8 = private unnamed_addr constant [41 x i8] c"The next line of the file is a newline.\0A\00", align 1 @.str.9 = private unnamed_addr constant [70 x i8] c"The next line of the file is a 1xn vector with tab separated values.\0A\00", align 1 @.str.10 = private unnamed_addr constant [52 x i8] c"The next line of the file is a newline. (optional)\0A\00", align 1 @.str.11 = private unnamed_addr constant [69 x i8] c"The final line of the file is the pre-computed solution. 
(optional)\0A\00", align 1 @.str.12 = private unnamed_addr constant [23 x i8] c"Example: matrix4.txt:\0A\00", align 1 @.str.13 = private unnamed_addr constant [3 x i8] c"4\0A\00", align 1 @.str.14 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 @.str.15 = private unnamed_addr constant [19 x i8] c"-0.6\09-0.5\090.7\090.3\0A\00", align 1 @.str.16 = private unnamed_addr constant [19 x i8] c"-0.3\09-0.9\090.3\090.7\0A\00", align 1 @.str.17 = private unnamed_addr constant [21 x i8] c"-0.4\09-0.5\09-0.3\09-0.8\0A\00", align 1 @.str.18 = private unnamed_addr constant [18 x i8] c"0.0\09-0.1\090.2\090.9\0A\00", align 1 @.str.19 = private unnamed_addr constant [24 x i8] c"-0.85\09-0.68\090.24\09-0.53\0A\00", align 1 @.str.20 = private unnamed_addr constant [19 x i8] c"0.7\090.0\09-0.4\09-0.5\0A\00", align 1 @.str.21 = private unnamed_addr constant [47 x i8] c"Create matrix internally in parse, size = %d \0A\00", align 1 @.str.22 = private unnamed_addr constant [20 x i8] c"Read file from %s \0A\00", align 1 @.str.23 = private unnamed_addr constant [15 x i8] c"Matrix m is: \0A\00", align 1 @.str.24 = private unnamed_addr constant [15 x i8] c"Matrix a is: \0A\00", align 1 @.str.25 = private unnamed_addr constant [14 x i8] c"Array b is: \0A\00", align 1 @.str.26 = private unnamed_addr constant [25 x i8] c"The final solution is: \0A\00", align 1 @.str.27 = private unnamed_addr constant [49 x i8] c"\0ATime total (including memory transfers)\09%f sec\0A\00", align 1 @.str.28 = private unnamed_addr constant [31 x i8] c"Time for CUDA kernels:\09%f sec\0A\00", align 1 @.str.29 = private unnamed_addr constant [23 x i8] c"Total Device found: %d\00", align 1 @.str.30 = private unnamed_addr constant [22 x i8] c"\0ADevice Name \09\09 - %s \00", align 1 @.str.31 = private unnamed_addr constant [40 x i8] c"\0A**************************************\00", align 1 @.str.32 = private unnamed_addr constant [33 x i8] c"\0ATotal Global Memory\09\09\09 - %lu KB\00", align 1 @.str.33 = 
private unnamed_addr constant [46 x i8] c"\0AShared memory available per block \09 - %lu KB\00", align 1 @.str.34 = private unnamed_addr constant [45 x i8] c"\0ANumber of registers per thread block \09 - %d\00", align 1 @.str.35 = private unnamed_addr constant [31 x i8] c"\0AWarp size in threads \09\09\09 - %d\00", align 1 @.str.36 = private unnamed_addr constant [31 x i8] c"\0AMemory Pitch \09\09\09\09 - %zu bytes\00", align 1 @.str.37 = private unnamed_addr constant [35 x i8] c"\0AMaximum threads per block \09\09 - %d\00", align 1 @.str.38 = private unnamed_addr constant [47 x i8] c"\0AMaximum Thread Dimension (block) \09 - %d %d %d\00", align 1 @.str.39 = private unnamed_addr constant [46 x i8] c"\0AMaximum Thread Dimension (grid) \09 - %d %d %d\00", align 1 @.str.40 = private unnamed_addr constant [39 x i8] c"\0ATotal constant memory \09\09\09 - %zu bytes\00", align 1 @.str.41 = private unnamed_addr constant [23 x i8] c"\0ACUDA ver \09\09\09\09 - %d.%d\00", align 1 @.str.42 = private unnamed_addr constant [26 x i8] c"\0AClock rate \09\09\09\09 - %d KHz\00", align 1 @.str.43 = private unnamed_addr constant [35 x i8] c"\0ATexture Alignment \09\09\09 - %zu bytes\00", align 1 @.str.44 = private unnamed_addr constant [26 x i8] c"\0ADevice Overlap \09\09\09\09 - %s\00", align 1 @.str.45 = private unnamed_addr constant [8 x i8] c"Allowed\00", align 1 @.str.46 = private unnamed_addr constant [12 x i8] c"Not Allowed\00", align 1 @.str.47 = private unnamed_addr constant [38 x i8] c"\0ANumber of Multi processors \09\09 - %d\0A\0A\00", align 1 @.str.48 = private unnamed_addr constant [4 x i8] c"\0A%s\00", align 1 @.str.49 = private unnamed_addr constant [22 x i8] c"The file name is: %s\0A\00", align 1 @.str.50 = private unnamed_addr constant [2 x i8] c"r\00", align 1 @.str.51 = private unnamed_addr constant [3 x i8] c"%d\00", align 1 @.str.52 = private unnamed_addr constant [24 x i8] c"The input matrix a is:\0A\00", align 1 @.str.53 = private unnamed_addr constant [23 x 
i8] c"The input array b is:\0A\00", align 1 @.str.54 = private unnamed_addr constant [18 x i8] c"1d grid size: %d\0A\00", align 1 @.str.55 = private unnamed_addr constant [14 x i8] c"BlockXY: %d \0A\00", align 1 @.str.56 = private unnamed_addr constant [32 x i8] c"first grid size: %d second: %d\0A\00", align 1 @.str.57 = private unnamed_addr constant [5 x i8] c"Fan2\00", align 1 @.str.58 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 @.str.59 = private unnamed_addr constant [6 x i8] c"%.2f \00", align 1 @.str.60 = private unnamed_addr constant [3 x i8] c"\0A\0A\00", align 1 @stderr = external dso_local global %struct._IO_FILE*, align 8 @.str.61 = private unnamed_addr constant [21 x i8] c"Cuda error: %s: %s.\0A\00", align 1 @0 = private unnamed_addr constant [14 x i8] c"_Z4Fan1PfS_ii\00", align 1 @1 = private unnamed_addr constant [17 x i8] c"_Z4Fan2PfS_S_iii\00", align 1 @2 = private constant [16065 x i8] c"P\EDU\BA\01\00\10\00\B0>\00\00\00\00\00\00\02\00\01\01@\00\00\00h4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\C03\00\00\00\00\00\00\C00\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z4Fan2PfS_S_iii\00.nv.info._Z4Fan2PfS_S_iii\00.nv.shared._Z4Fan2PfS_S_iii\00.nv.global\00.nv.constant0._Z4Fan2PfS_S_iii\00.text._Z4Fan1PfS_ii\00.nv.info._Z4Fan1PfS_ii\00.nv.shared._Z4Fan1PfS_ii\00.nv.constant0._Z4Fan1PfS_ii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z4Fan2PfS_S_iii\00.text._Z4Fan2PfS_S_iii\00.nv.info._Z4Fan2PfS_S_iii\00.nv.shared._Z4Fan2PfS_S_iii\00.nv.global\00threadIdx\00blockIdx\00blockDim\00.nv.constant0._Z4Fan2PfS_S_iii\00_param\00_Z4Fan1PfS_ii\00.text._Z4Fan1PfS_ii\00.nv.info._Z4Fan1PfS_ii\00.nv.shared._Z4Fan1PfS_ii\00$_Z4Fan1PfS_ii$__cuda_sm3x_div_rn_n
oftz_f32\00$_Z4Fan1PfS_ii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00.nv.constant0._Z4Fan1PfS_ii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00C\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\90\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9B\00\00\00\01\00\0B\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\A5\00\00\00\01\00\0B\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\AE\00\00\00\01\00\0B\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B7\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EB\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00/\01\00\00\22\00\0A\00\D8\09\00\00\00\00\00\00`\01\00\00\00\00\00\00[\01\00\00\22\00\0A\008\0B\00\00\00\00\00\00H\08\00\00\00\00\00\00\90\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\14\00\00\00\00\00\00\DD\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\13\00\00\00\00\00\00\04/\08\00\0C\00\00\00\13\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00\18\00\00\00\04\11\08\00\0C\00\00\00\18\00\00\00\04/\08\00\0B\00\00\00\0F\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\000\00\00\00\04\11\08\00\0B\00\00\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\05\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\10\00\F8\04\00\00h\06\00\00\90\07\00\008\08\00\00\04\1C\04\00p\14\00\00\04\1E\04\00 
\00\00\00\010\00\00\01*\00\00\04\0A\08\00\0A\00\00\00@\01\18\00\03\19\18\00\04\17\0C\00\00\00\00\00\03\00\14\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0\11\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00\98\03\00\00\C8\05\00\00\04\1C\04\00\D0\09\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03!\00\D3rd<3>;\0A\0Amov.u2\00\1B,e\00b;\0Acvta\8D\00\04%\00\13,\\\00\22ld\C8\00\02\18\00nrd2, [\CE\00\1E])\00\1F1)\00\01a0];\0Ast#\00\81[%SP+0],,\00\0A\16\00\128\16\00\222;\B6\00\01\D8\00a1, 999(\00\02g\00\00\EF\00\18[\9E\01\03M\00\AF1;\0Aret;\0A\0A}\D6\01\1A\FE\02FuncGetAttributes\E1\01\0D#\00\0E\EC\01\0F+\00\06\0F\F7\01\1B\1F1\F7\01Q\1F1\F7\01!\0E\D9\00\0F\02\02\0F\0E8\01\0F\0D\02\8DhDevice\B4\00\0E\0E\02\0E$\00\0F\0F\02\00/32,\00\0B\1F1,\00\18\1F2<\02\13\1F2<\02\1F\1D4<\02\1F2<\02\0C\1F2<\02\13\01_\00\04;\02\0F\D9\00\07\1D]4\00\1F14\00\06\0Fp\02\10\0E\9A\01\0Fq\02\12(32q\02\0B\15\00!12\16\00\09\86\02\1F3\86\02\15\1F3\86\02#2Get\CB\00\0E}\02\05\1B\00\04\DA\00\0F\1C\02\13?3[8W\04.\0F\1B\02\0D\1F3W\04\19\04\B3\01\0D\D0\00\0F\AA\01\06\0F\05\04W\F0\04OccupancyMaxActiveB3\08\FE\03sPerMultiprocessor\9F\01\0F;\00\16\0EB\06\0FC\00%\0EJ\04\0FC\00\1E\1F2\86\00/\1F3\88\02\13O4[32\89\02\1C\1D3\89\02\1F4\89\02\0C\1F4\89\02\19\133\89\02\0F\F1\00\1E\0F\BC\04\00\0FK\00$/2]w\07\00\0FL\00$\0F\1F\05\01\0F\98\00%\0F\A7\07\1D\097\05\186M\05\04,\00\2224-\00\183\CF\03\1F2\CF\03\15\1F2\CF\03L\9FWithFlags\D8\03(\05D\00\0E\E1\03\0FL\00'\0F\EA\030\0CL\00\1F2\98\008\1F3\98\008\1F4H\04\13O5[40\EC\08.\0FH\04\0D\1F5\EC\08\1C\0F\F9\00+\1F]\9C\040\0D\9A\01\0F\A5\040\0D:\02\0F\AE\041\0D\DB\02\0F\B7\041\0D|\03\0F\C0\04I\08-\00\1F3$\0A6\F6\0Dvisible .entry _Z4Fan1PfS_ii\96\04\00\90\00\09\1B\00\0Em\04\0F#\00\05\07D\04\00\A8\00\0F#\00\01\1F2#\00\08\0F\EE\07\1BO6[24\A6\03\15wpred 
%px\0A\10fH\01\18f\B8\03\03\9A\0A\0E\12\08/21\CB\03\0C\1F6\CB\03\1C\0E\F5\00\0FM\03\06\0EC\01\0F$\03\07\0E\92\01\0F\FB\02\07\0F\E1\01\02\13]\C8\00#to\DB\12\07+\04\02\C5\02\01\9E\0D\0A\1C\00\144\B4\02\0F;\00\03\145\16\03\0F;\00\00\116\1C\00\1F5H\03\02\1F6H\03\02\1F4H\03\09\04\16\00/201\03\02h%tid.x\15\00\00\BB\00\0A\1F2;\0A\02\03~\00\179~\00$3,@\0AS;\0Aneg\16\00\114\B7\0AV;\0Afma\CC\0A$5,\1A\00\132l\0A\0E\81\0A\175s\01(43s\01\01j\04\22ne\C1\003p3,!\00\02g\04\163g\04\1B6g\04\135O\04\185\B3\0D/21)\03\01/44)\03\02/45)\03\02/46)\03\03347,5\00\00$\00\09\1A\00\02u\01\1E7)\03349,\80\00\00&\00\07e\00/50w\02\03351,\1E\00\0Ce\00552,Q\00-51\F9\01\02\EE\0C\1A5\F9\01\02\F6\04\01\1C\00\0B\F9\01$4,/\01\01'\00\07\F9\01\136\F9\01\194Z\03(25\F7\0E\0C|\00\146\AF\00\08|\00$7,\1C\00\0B|\00$8,R\00\01'\00\07|\00\137|\00\1F8d\00\00\02J\01\0A0\0E\02I\05\01\1C\00\0Ad\00(31d\00'30d\00\128d\00)31\D9\02\00^\00+f6\D9\02\01y\04\02\1B\00\00\CD\00)f8\DA\02!31\DA\02\00h\08\0F\E8\06\02\B06:\0Aret;\0A\0A}\0A\00\00\00\00\00\00", section ".nv_fatbin", align 8
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([16065 x i8], [16065 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
@__cuda_gpubin_handle = internal global i8** null, align 8
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]

; create_matrix(float* m, i32 size)
;
; Fills the size*size float buffer %m (indexed as m[i*size + j]) from a
; temporary coefficient table `vla` of 2*size-1 floats kept on the stack:
;   pass 1: for i in [0,size): c = 10.0 * expf(lamda * i), lamda = 0xBF847AE140000000 (-0.01 as float)
;           vla[size-1+i] = c ; vla[size-1-i] = c
;   pass 2: for i,j in [0,size): m[i*size + j] = vla[size-1-i+j]
;
; Parameters:
;   %m    - destination buffer; the code writes indices 0 .. size*size-1.
;   %size - matrix dimension.
;
; A non-positive %size returns immediately without touching %m. Previously
; 2*size-1 went negative and was zero-extended to a ~4G-element dynamic alloca.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z13create_matrixPfi(float* %m, i32 %size) #0 {
entry:
  %m.addr = alloca float*, align 8
  %size.addr = alloca i32, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %lamda = alloca float, align 4
  %saved_stack = alloca i8*, align 8
  %__vla_expr0 = alloca i64, align 8
  %coe_i = alloca float, align 4
  store float* %m, float** %m.addr, align 8
  store i32 %size, i32* %size.addr, align 4
  store float 0xBF847AE140000000, float* %lamda, align 4
  ; Guard: nothing to generate (and no valid table length) for size <= 0.
  %size.is.nonpositive = icmp sle i32 %size, 0
  br i1 %size.is.nonpositive, label %size.nonpositive, label %size.positive

size.positive:                                    ; preds = %entry
  ; Table length = 2*size - 1, allocated dynamically; the stack pointer is
  ; saved here and restored in for.end28.
  %0 = load i32, i32* %size.addr, align 4
  %mul = mul nsw i32 2, %0
  %sub = sub nsw i32 %mul, 1
  %1 = zext i32 %sub to i64
  %2 = call i8* @llvm.stacksave()
  store i8* %2, i8** %saved_stack, align 8
  %vla = alloca float, i64 %1, align 16
  store i64 %1, i64* %__vla_expr0, align 8
  store float 0.000000e+00, float* %coe_i, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

; ---- pass 1: build the coefficient table, mirrored around index size-1 ----
for.cond:                                         ; preds = %for.inc, %size.positive
  %3 = load i32, i32* %i, align 4
  %4 = load i32, i32* %size.addr, align 4
  %cmp = icmp slt i32 %3, %4
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; coe_i = 10.0 * expf(lamda * (float)i)
  %5 = load float, float* %lamda, align 4
  %6 = load i32, i32* %i, align 4
  %conv = sitofp i32 %6 to float
  %mul1 = fmul contract float %5, %conv
  %call = call float @_ZSt3expf(float %mul1)
  %mul2 = fmul contract float 1.000000e+01, %call
  store float %mul2, float* %coe_i, align 4
  ; vla[size-1+i] = coe_i
  %7 = load i32, i32* %size.addr, align 4
  %sub3 = sub nsw i32 %7, 1
  %8 = load i32, i32* %i, align 4
  %add = add nsw i32 %sub3, %8
  store i32 %add, i32* %j, align 4
  %9 = load float, float* %coe_i, align 4
  %10 = load i32, i32* %j, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %vla, i64 %idxprom
  store float %9, float* %arrayidx, align 4
  ; vla[size-1-i] = coe_i
  %11 = load i32, i32* %size.addr, align 4
  %sub4 = sub nsw i32 %11, 1
  %12 = load i32, i32* %i, align 4
  %sub5 = sub nsw i32 %sub4, %12
  store i32 %sub5, i32* %j, align 4
  %13 = load float, float* %coe_i, align 4
  %14 = load i32, i32* %j, align 4
  %idxprom6 = sext i32 %14 to i64
  %arrayidx7 = getelementptr inbounds float, float* %vla, i64 %idxprom6
  store float %13, float* %arrayidx7, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %15 = load i32, i32* %i, align 4
  %inc = add nsw i32 %15, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  store i32 0, i32* %i, align 4
  br label %for.cond8

; ---- pass 2: m[i*size + j] = vla[size-1-i+j] ----
for.cond8:                                        ; preds = %for.inc26, %for.end
  %16 = load i32, i32* %i, align 4
  %17 = load i32, i32* %size.addr, align 4
  %cmp9 = icmp slt i32 %16, %17
  br i1 %cmp9, label %for.body10, label %for.end28

for.body10:                                       ; preds = %for.cond8
  store i32 0, i32* %j, align 4
  br label %for.cond11

for.cond11:                                       ; preds = %for.inc23, %for.body10
  %18 = load i32, i32* %j, align 4
  %19 = load i32, i32* %size.addr, align 4
  %cmp12 = icmp slt i32 %18, %19
  br i1 %cmp12, label %for.body13, label %for.end25

for.body13:                                       ; preds = %for.cond11
  %20 = load i32, i32* %size.addr, align 4
  %sub14 = sub nsw i32 %20, 1
  %21 = load i32, i32* %i, align 4
  %sub15 = sub nsw i32 %sub14, %21
  %22 = load i32, i32* %j, align 4
  %add16 = add nsw i32 %sub15, %22
  %idxprom17 = sext i32 %add16 to i64
  %arrayidx18 = getelementptr inbounds float, float* %vla, i64 %idxprom17
  %23 = load float, float* %arrayidx18, align 4
  %24 = load float*, float** %m.addr, align 8
  %25 = load i32, i32* %i, align 4
  %26 = load i32, i32* %size.addr, align 4
  %mul19 = mul nsw i32 %25, %26
  %27 = load i32, i32* %j, align 4
  %add20 = add nsw i32 %mul19, %27
  %idxprom21 = sext i32 %add20 to i64
  %arrayidx22 = getelementptr inbounds float, float* %24, i64 %idxprom21
  store float %23, float* %arrayidx22, align 4
  br label %for.inc23

for.inc23:                                        ; preds = %for.body13
  %28 = load i32, i32* %j, align 4
  %inc24 = add nsw i32 %28, 1
  store i32 %inc24, i32* %j, align 4
  br label %for.cond11

for.end25:                                        ; preds = %for.cond11
  br label %for.inc26

for.inc26:                                        ; preds = %for.end25
  %29 = load i32, i32* %i, align 4
  %inc27 = add nsw i32 %29, 1
  store i32 %inc27, i32* %i, align 4
  br label %for.cond8

for.end28:                                        ; preds = %for.cond8
  ; Release the dynamic table by restoring the saved stack pointer.
  %30 = load i8*, i8** %saved_stack, align 8
  call void @llvm.stackrestore(i8* %30)
  ret void

size.nonpositive:                                 ; preds = %entry
  ret void
}

; Function Attrs: nounwind
declare i8* @llvm.stacksave() #1

; std::exp(float): spills the argument to a stack slot and forwards it to expf.
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local float @_ZSt3expf(float %__x) #2 comdat {
entry:
  %__x.addr = alloca float, align 4
  store float %__x, float* %__x.addr, align 4
  %0 = load float, float* %__x.addr, align 4
  %call = call float @expf(float %0) #1
  ret float %call
}

; Function Attrs: nounwind
declare void
@llvm.stackrestore(i8*) #1 ; Function Attrs: noinline norecurse optnone uwtable define dso_local i32 @main(i32 %argc, i8** %argv) #3 { entry: %retval = alloca i32, align 4 %argc.addr = alloca i32, align 4 %argv.addr = alloca i8**, align 8 %verbose = alloca i32, align 4 %i = alloca i32, align 4 %j = alloca i32, align 4 %flag = alloca i8, align 1 %time_start = alloca %struct.timeval, align 8 %time_end = alloca %struct.timeval, align 8 %time_total = alloca i32, align 4 store i32 0, i32* %retval, align 4 store i32 %argc, i32* %argc.addr, align 4 store i8** %argv, i8*** %argv.addr, align 8 %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([56 x i8], [56 x i8]* @.str, i64 0, i64 0), i32 512, i32 1, i32 1) store i32 1, i32* %verbose, align 4 %0 = load i32, i32* %argc.addr, align 4 %cmp = icmp slt i32 %0, 2 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @.str.1, i64 0, i64 0)) %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str.2, i64 0, i64 0)) %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.3, i64 0, i64 0)) %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([66 x i8], [66 x i8]* @.str.4, i64 0, i64 0)) %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([68 x i8], [68 x i8]* @.str.5, i64 0, i64 0)) %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.6, i64 0, i64 0)) %call7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([64 x i8], [64 x i8]* @.str.7, i64 0, i64 0)) %call8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.8, i64 0, i64 0)) %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([70 x i8], [70 x i8]* @.str.9, i64 0, i64 0)) %call10 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([52 x i8], [52 x i8]* @.str.10, i64 0, i64 0)) %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([69 x i8], [69 x i8]* @.str.11, i64 0, i64 0)) %call12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.12, i64 0, i64 0)) %call13 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.13, i64 0, i64 0)) %call14 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) %call15 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.15, i64 0, i64 0)) %call16 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.16, i64 0, i64 0)) %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.17, i64 0, i64 0)) %call18 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.18, i64 0, i64 0)) %call19 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) %call20 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.19, i64 0, i64 0)) %call21 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.14, i64 0, i64 0)) %call22 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.20, i64 0, i64 0)) call void @exit(i32 0) #9 unreachable if.end: ; preds = %entry %call23 = call i32 @cudaSetDevice(i32 0) call void @_Z21PrintDevicePropertiesv() store i32 1, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc61, %if.end %1 = load i32, i32* %i, align 4 %2 = load i32, i32* %argc.addr, align 4 %cmp24 = icmp slt i32 %1, %2 br i1 %cmp24, label %for.body, label %for.end63 for.body: ; preds = %for.cond %3 = load i8**, i8*** %argv.addr, align 8 %4 = load i32, i32* %i, align 4 %idxprom = sext i32 %4 to i64 %arrayidx = getelementptr inbounds i8*, i8** %3, i64 %idxprom %5 = load i8*, i8** %arrayidx, align 8 %arrayidx25 = getelementptr inbounds i8, i8* %5, i64 0 %6 = load i8, i8* %arrayidx25, align 1 %conv = sext i8 %6 to i32 %cmp26 = icmp eq i32 %conv, 45 br i1 %cmp26, label %if.then27, label %if.end60 if.then27: ; preds = %for.body %7 = load i8**, i8*** %argv.addr, align 8 %8 = load i32, i32* %i, align 4 %idxprom28 = sext i32 %8 to i64 %arrayidx29 = getelementptr inbounds i8*, i8** %7, i64 %idxprom28 %9 = load i8*, i8** %arrayidx29, align 8 %arrayidx30 = getelementptr inbounds i8, i8* %9, i64 1 %10 = load i8, i8* %arrayidx30, align 1 store i8 %10, i8* %flag, align 1 %11 = load i8, i8* %flag, align 1 %conv31 = sext i8 %11 to i32 switch i32 %conv31, label %sw.epilog [ i32 115, label %sw.bb i32 102, label %sw.bb52 i32 113, label %sw.bb59 ] sw.bb: ; preds = %if.then27 %12 = load i32, i32* %i, align 4 %inc = add nsw i32 %12, 1 store i32 %inc, i32* %i, align 4 %13 = load i8**, i8*** %argv.addr, align 8 %14 = load i32, i32* %i, align 4 %idxprom32 = sext i32 %14 to i64 %arrayidx33 = getelementptr inbounds i8*, i8** %13, i64 %idxprom32 %15 = load i8*, i8** %arrayidx33, align 8 %call34 = call i32 @atoi(i8* %15) #10 store i32 %call34, i32* @Size, align 4 %16 = load i32, i32* @Size, align 4 %call35 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.21, i64 0, i64 0), i32 %16) %17 = load i32, i32* @Size, align 4 %18 = load i32, i32* @Size, align 4 %mul = mul nsw i32 %17, %18 %conv36 = sext i32 %mul to i64 %mul37 = mul i64 %conv36, 4 %call38 = call noalias i8* @malloc(i64 %mul37) #1 %19 = bitcast i8* %call38 to float* store float* %19, float** @a, align 8 %20 = load float*, float** @a, align 8 %21 = load i32, i32* @Size, align 4 call void @_Z13create_matrixPfi(float* %20, i32 %21) %22 = load i32, i32* @Size, align 4 %conv39 = sext i32 %22 to i64 %mul40 = mul i64 %conv39, 4 %call41 = call noalias i8* @malloc(i64 %mul40) #1 %23 = bitcast i8* %call41 to float* store float* %23, float** @b, align 8 store i32 0, i32* %j, align 4 br label %for.cond42 for.cond42: ; preds = %for.inc, %sw.bb %24 = load i32, i32* %j, align 4 %25 = load i32, i32* @Size, align 4 %cmp43 = icmp slt i32 %24, %25 br i1 %cmp43, label %for.body44, label %for.end for.body44: ; preds = %for.cond42 %26 = load float*, float** @b, align 8 %27 = load i32, i32* %j, align 4 %idxprom45 = sext i32 %27 to i64 %arrayidx46 = getelementptr inbounds float, float* %26, i64 %idxprom45 store float 1.000000e+00, float* %arrayidx46, align 4 br label %for.inc for.inc: ; preds = %for.body44 %28 = load i32, i32* %j, align 4 %inc47 = add nsw i32 %28, 1 store i32 %inc47, i32* %j, align 4 br label %for.cond42 for.end: ; preds = %for.cond42 %29 = load i32, i32* @Size, align 4 %30 = load i32, i32* @Size, align 4 %mul48 = mul nsw i32 %29, %30 %conv49 = sext i32 %mul48 to i64 %mul50 = mul i64 %conv49, 4 %call51 = call noalias i8* @malloc(i64 %mul50) #1 %31 = bitcast i8* %call51 to float* store float* %31, float** @m, align 8 br label %sw.epilog sw.bb52: ; preds = %if.then27 %32 = load i32, i32* %i, align 4 %inc53 = add nsw i32 %32, 1 store i32 %inc53, i32* %i, align 4 %33 = load i8**, i8*** %argv.addr, align 8 %34 = load i32, i32* %i, align 4 %idxprom54 = sext i32 %34 to i64 %arrayidx55 = getelementptr inbounds 
i8*, i8** %33, i64 %idxprom54 %35 = load i8*, i8** %arrayidx55, align 8 %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.22, i64 0, i64 0), i8* %35) %36 = load i8**, i8*** %argv.addr, align 8 %37 = load i32, i32* %i, align 4 %idxprom57 = sext i32 %37 to i64 %arrayidx58 = getelementptr inbounds i8*, i8** %36, i64 %idxprom57 %38 = load i8*, i8** %arrayidx58, align 8 call void @_Z15InitProblemOncePc(i8* %38) br label %sw.epilog sw.bb59: ; preds = %if.then27 store i32 1, i32* %verbose, align 4 br label %sw.epilog sw.epilog: ; preds = %if.then27, %sw.bb59, %sw.bb52, %for.end br label %if.end60 if.end60: ; preds = %sw.epilog, %for.body br label %for.inc61 for.inc61: ; preds = %if.end60 %39 = load i32, i32* %i, align 4 %inc62 = add nsw i32 %39, 1 store i32 %inc62, i32* %i, align 4 br label %for.cond for.end63: ; preds = %for.cond call void @_Z10InitPerRunv() %call64 = call i32 @gettimeofday(%struct.timeval* %time_start, %struct.timezone* null) #1 call void @_Z10ForwardSubv() %call65 = call i32 @gettimeofday(%struct.timeval* %time_end, %struct.timezone* null) #1 %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 0 %40 = load i64, i64* %tv_sec, align 8 %mul66 = mul nsw i64 %40, 1000000 %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 1 %41 = load i64, i64* %tv_usec, align 8 %add = add nsw i64 %mul66, %41 %tv_sec67 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 0 %42 = load i64, i64* %tv_sec67, align 8 %mul68 = mul nsw i64 %42, 1000000 %tv_usec69 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 1 %43 = load i64, i64* %tv_usec69, align 8 %add70 = add nsw i64 %mul68, %43 %sub = sub nsw i64 %add, %add70 %conv71 = trunc i64 %sub to i32 store i32 %conv71, i32* %time_total, align 4 %44 = load i32, i32* %verbose, align 4 %tobool = icmp ne i32 %44, 0 br i1 %tobool, label %if.then72, label 
%if.end76 if.then72: ; preds = %for.end63 %call73 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.23, i64 0, i64 0)) %45 = load float*, float** @m, align 8 %46 = load i32, i32* @Size, align 4 %47 = load i32, i32* @Size, align 4 call void @_Z8PrintMatPfii(float* %45, i32 %46, i32 %47) %call74 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.24, i64 0, i64 0)) %48 = load float*, float** @a, align 8 %49 = load i32, i32* @Size, align 4 %50 = load i32, i32* @Size, align 4 call void @_Z8PrintMatPfii(float* %48, i32 %49, i32 %50) %call75 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.25, i64 0, i64 0)) %51 = load float*, float** @b, align 8 %52 = load i32, i32* @Size, align 4 call void @_Z8PrintAryPfi(float* %51, i32 %52) br label %if.end76 if.end76: ; preds = %if.then72, %for.end63 call void @_Z7BackSubv() %53 = load i32, i32* %verbose, align 4 %tobool77 = icmp ne i32 %53, 0 br i1 %tobool77, label %if.then78, label %if.end80 if.then78: ; preds = %if.end76 %call79 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.26, i64 0, i64 0)) %54 = load float*, float** @finalVec, align 8 %55 = load i32, i32* @Size, align 4 call void @_Z8PrintAryPfi(float* %54, i32 %55) br label %if.end80 if.end80: ; preds = %if.then78, %if.end76 %56 = load i32, i32* %time_total, align 4 %conv81 = uitofp i32 %56 to double %mul82 = fmul contract double %conv81, 0x3EB0C6F7A0B5ED8D %call83 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.27, i64 0, i64 0), double %mul82) %57 = load i32, i32* @totalKernelTime, align 4 %conv84 = uitofp i32 %57 to double %mul85 = fmul contract double %conv84, 0x3EB0C6F7A0B5ED8D %call86 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.28, i64 0, i64 0), double %mul85)
  ; Tail of main: release the host buffers held in @m, @a and @b, then
  ; return whatever is stored in %retval.
  ; NOTE(review): @finalVec (malloc'd in _Z7BackSubv) is not freed in this
  ; visible tail - confirm whether that is intentional.
  %58 = load float*, float** @m, align 8
  %59 = bitcast float* %58 to i8*
  call void @free(i8* %59) #1
  %60 = load float*, float** @a, align 8
  %61 = bitcast float* %60 to i8*
  call void @free(i8* %61) #1
  %62 = load float*, float** @b, align 8
  %63 = bitcast float* %62 to i8*
  call void @free(i8* %63) #1
  %64 = load i32, i32* %retval, align 4
  ret i32 %64
}

declare dso_local i32 @printf(i8*, ...) #4

; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #5

declare dso_local i32 @cudaSetDevice(i32) #4

; _Z21PrintDevicePropertiesv: asks the CUDA runtime for the device count,
; prints it, then for each device index fetches a cudaDeviceProp record and
; prints a selection of its fields with printf. When
; cudaGetDeviceProperties returns non-zero, the error string for
; cudaGetLastError() is printed instead. Takes no arguments, returns void.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z21PrintDevicePropertiesv() #0 {
entry:
  %deviceProp = alloca %struct.cudaDeviceProp, align 8
  %nDevCount = alloca i32, align 4
  %nDeviceIdx = alloca i32, align 4
  ; nDevCount is pre-set to 0, so if cudaGetDeviceCount fails without
  ; writing it the loop below runs zero times (its return value %call is
  ; not inspected).
  store i32 0, i32* %nDevCount, align 4
  %call = call i32 @cudaGetDeviceCount(i32* %nDevCount)
  %0 = load i32, i32* %nDevCount, align 4
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.29, i64 0, i64 0), i32 %0)
  store i32 0, i32* %nDeviceIdx, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while nDeviceIdx < nDevCount (signed compare).
  %1 = load i32, i32* %nDeviceIdx, align 4
  %2 = load i32, i32* %nDevCount, align 4
  %cmp = icmp slt i32 %1, %2
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Zero-fill 712 bytes of deviceProp before querying (presumably
  ; sizeof(cudaDeviceProp) - TODO confirm), then branch on a zero return.
  %3 = bitcast %struct.cudaDeviceProp* %deviceProp to i8*
  call void @llvm.memset.p0i8.i64(i8* align 8 %3, i8 0, i64 712, i1 false)
  %4 = load i32, i32* %nDeviceIdx, align 4
  %call2 = call i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp* %deviceProp, i32 %4)
  %cmp3 = icmp eq i32 0, %call2
  br i1 %cmp3, label %if.then, label %if.else

if.then:                                          ; preds = %for.body
  ; Field 0: the [256 x i8] name buffer, passed to printf as i8*.
  %name = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 0
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %name, i64 0, i64 0
  %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.30, i64 0, i64 0), i8* %arraydecay)
  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.31, i64 0, i64 0))
  ; Field 4 (totalGlobalMem), printed after an unsigned divide by 1024.
  %totalGlobalMem = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 4
  %5 = load i64, i64* %totalGlobalMem, align 8
  %div = udiv i64 %5, 1024
  %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.32, i64 0, i64 0), i64 %div)
  ; Field 5 (sharedMemPerBlock), also divided by 1024 before printing.
  %sharedMemPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 5
  %6 = load i64, i64* %sharedMemPerBlock, align 8
  %div7 = udiv i64 %6, 1024
  %call8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.33, i64 0, i64 0), i64 %div7)
  ; Field 6 (regsPerBlock).
  %regsPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 6
  %7 = load i32, i32* %regsPerBlock, align 8
  %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @.str.34, i64 0, i64 0), i32 %7)
  ; Field 7 (warpSize).
  %warpSize = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 7
  %8 = load i32, i32* %warpSize, align 4
  %call10 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.35, i64 0, i64 0), i32 %8)
  ; Field 8 (memPitch), printed as a raw i64.
  %memPitch = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 8
  %9 = load i64, i64* %memPitch, align 8
  %call11 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.36, i64 0, i64 0), i64 %9)
  ; Field 9 (maxThreadsPerBlock).
  %maxThreadsPerBlock = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 9
  %10 = load i32, i32* %maxThreadsPerBlock, align 8
  %call12 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.37, i64 0, i64 0), i32 %10)
  ; Field 10 (maxThreadsDim): elements 0, 1 and 2 of the [3 x i32] array.
  %maxThreadsDim = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10
  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim, i64 0, i64 0
  %11 = load i32, i32* %arrayidx, align 4
  %maxThreadsDim13 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10
  %arrayidx14 = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim13, i64 0, i64 1
  %12 = load i32, i32* %arrayidx14, align 4
  %maxThreadsDim15 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 10
  %arrayidx16 = getelementptr inbounds [3 x i32], [3 x i32]* %maxThreadsDim15, i64 0, i64 2
  %13 = load i32, i32* %arrayidx16, align 4
  %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.38, i64 0, i64 0), i32 %11, i32 %12, i32 %13)
  ; Field 11 (maxGridSize): elements 0, 1 and 2 of the [3 x i32] array.
  %maxGridSize = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11
  %arrayidx18 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize, i64 0, i64 0
  %14 = load i32, i32* %arrayidx18, align 8
  %maxGridSize19 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11
  %arrayidx20 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize19, i64 0, i64 1
  %15 = load i32, i32* %arrayidx20, align 4
  %maxGridSize21 = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 11
  %arrayidx22 = getelementptr inbounds [3 x i32], [3 x i32]* %maxGridSize21, i64 0, i64 2
  %16 = load i32, i32* %arrayidx22, align 8
  %call23 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.39, i64 0, i64 0), i32 %14, i32 %15, i32 %16)
  ; Field 13 (totalConstMem).
  %totalConstMem = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 13
  %17 = load i64, i64* %totalConstMem, align 8
  %call24 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.40, i64 0, i64 0), i64 %17)
  ; Fields 14 and 15 (major, minor) are printed together.
  %major = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 14
  %18 = load i32, i32* %major, align 8
  %minor = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 15
  %19 = load i32, i32* %minor, align 4
  %call25 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.41, i64 0, i64 0), i32 %18, i32 %19)
  ; Field 12 (clockRate) is printed after major/minor, out of index order.
  %clockRate = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 12
  %20 = load i32, i32* %clockRate, align 4
  %call26 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.42, i64 0, i64 0), i32 %20)
  ; Field 16 (textureAlignment).
  %textureAlignment = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 16
  %21 = load i64, i64* %textureAlignment, align 8
  %call27 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.43, i64 0, i64 0), i64 %21)
  ; Field 18 (deviceOverlap): non-zero selects @.str.45, zero selects
  ; @.str.46; the chosen string is the %s-style argument of the next printf.
  ; %23 is computed but never used.
  %deviceOverlap = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 18
  %22 = load i32, i32* %deviceOverlap, align 8
  %tobool = icmp ne i32 %22, 0
  %23 = zext i1 %tobool to i64
  %cond = select i1 %tobool, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.45, i64 0, i64 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.46, i64 0, i64 0)
  %call28 = call i32 (i8*, ...)
@printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.44, i64 0, i64 0), i8* %cond)
  ; Remainder of _Z21PrintDevicePropertiesv.
  ; Field 19 (multiProcessorCount) is the last property printed.
  %multiProcessorCount = getelementptr inbounds %struct.cudaDeviceProp, %struct.cudaDeviceProp* %deviceProp, i32 0, i32 19
  %24 = load i32, i32* %multiProcessorCount, align 4
  %call29 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.47, i64 0, i64 0), i32 %24)
  br label %if.end

if.else:                                          ; preds = %for.body
  ; cudaGetDeviceProperties returned non-zero: print the runtime's text for
  ; the last recorded error.
  %call30 = call i32 @cudaGetLastError()
  %call31 = call i8* @cudaGetErrorString(i32 %call30)
  %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.48, i64 0, i64 0), i8* %call31)
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  br label %for.inc

for.inc:                                          ; preds = %if.end
  %25 = load i32, i32* %nDeviceIdx, align 4
  %inc = add nsw i32 %25, 1
  store i32 %inc, i32* %nDeviceIdx, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #6

; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #7

; Diagnostic printed by _Z15InitProblemOncePc when the input file cannot be
; opened; its single %s receives the file name.
@.str.fopen.fail = private unnamed_addr constant [34 x i8] c"Error: cannot open input file %s\0A\00", align 1

; _Z15InitProblemOncePc(filename): opens `filename`, stores the stream in
; @fp, reads the problem dimension into @Size, then allocates and fills the
; host buffers:
;   @a - Size*Size floats, filled by _Z7InitMatPfii
;   @b - Size floats,      filled by _Z7InitAryPfi
;   @m - Size*Size floats, allocated only (zeroed later by _Z10InitPerRunv)
; If fopen returns null, a diagnostic is printed and the process exits with
; status 1 rather than handing a null FILE* to fscanf.
; NOTE(review): the fscanf return value, the value read into @Size and the
; malloc results are still unchecked, and @fp is not closed here.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z15InitProblemOncePc(i8* %filename) #0 {
entry:
  %filename.addr = alloca i8*, align 8
  store i8* %filename, i8** %filename.addr, align 8
  %0 = load i8*, i8** %filename.addr, align 8
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.49, i64 0, i64 0), i8* %0)
  %1 = load i8*, i8** %filename.addr, align 8
  %call1 = call %struct._IO_FILE* @fopen(i8* %1, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.50, i64 0, i64 0))
  store %struct._IO_FILE* %call1, %struct._IO_FILE** @fp, align 8
  ; fopen yields null on failure; reading through it would be undefined.
  %fopen.isnull = icmp eq %struct._IO_FILE* %call1, null
  br i1 %fopen.isnull, label %fopen.fail, label %fopen.ok

fopen.fail:                                       ; preds = %entry
  %call.fail = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.fopen.fail, i64 0, i64 0), i8* %1)
  call void @exit(i32 1)
  unreachable

fopen.ok:                                         ; preds = %entry
  ; Read the dimension into the global @Size.
  %2 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.51, i64 0, i64 0), i32* @Size)
  ; @a = malloc(Size*Size*4). The Size*Size product is formed in i32 and
  ; only then widened to i64.
  %3 = load i32, i32* @Size, align 4
  %4 = load i32, i32* @Size, align 4
  %mul = mul nsw i32 %3, %4
  %conv = sext i32 %mul to i64
  %mul3 = mul i64 %conv, 4
  %call4 = call noalias i8* @malloc(i64 %mul3) #1
  %5 = bitcast i8* %call4 to float*
  store float* %5, float** @a, align 8
  %6 = load float*, float** @a, align 8
  %7 = load i32, i32* @Size, align 4
  %8 = load i32, i32* @Size, align 4
  call void @_Z7InitMatPfii(float* %6, i32 %7, i32 %8)
  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.52, i64 0, i64 0))
  %9 = load float*, float** @a, align 8
  %10 = load i32, i32* @Size, align 4
  %11 = load i32, i32* @Size, align 4
  call void @_Z8PrintMatPfii(float* %9, i32 %10, i32 %11)
  ; @b = malloc(Size*4), then filled from the same stream.
  %12 = load i32, i32* @Size, align 4
  %conv6 = sext i32 %12 to i64
  %mul7 = mul i64 %conv6, 4
  %call8 = call noalias i8* @malloc(i64 %mul7) #1
  %13 = bitcast i8* %call8 to float*
  store float* %13, float** @b, align 8
  %14 = load float*, float** @b, align 8
  %15 = load i32, i32* @Size, align 4
  call void @_Z7InitAryPfi(float* %14, i32 %15)
  %call9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.53, i64 0, i64 0))
  %16 = load float*, float** @b, align 8
  %17 = load i32, i32* @Size, align 4
  call void @_Z8PrintAryPfi(float* %16, i32 %17)
  ; @m = malloc(Size*Size*4); contents are left uninitialised here.
  %18 = load i32, i32* @Size, align 4
  %19 = load i32, i32* @Size, align 4
  %mul10 = mul nsw i32 %18, %19
  %conv11 = sext i32 %mul10 to i64
  %mul12 = mul i64 %conv11, 4
  %call13 = call noalias i8* @malloc(i64 %mul12) #1
  %20 = bitcast i8* %call13 to float*
  store float* %20, float** @m, align 8
  ret void
}

; _Z10InitPerRunv: stores 0.0f into each of the first Size*Size floats of
; the buffer pointed to by @m. @Size and @m are re-loaded on every
; iteration (optnone code).
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z10InitPerRunv() #2 {
entry:
  %i = alloca i32, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Continue while i < Size*Size (signed, product computed in i32).
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* @Size, align 4
  %2 = load i32, i32* @Size, align 4
  %mul = mul nsw i32 %1, %2
  %cmp = icmp slt i32 %0, %mul
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; m[i] = 0.0f
  %3 = load float*, float** @m, align 8
  %4 = load i32, i32* %i, align 4
  %idx.ext = sext i32 %4 to i64
  %add.ptr = getelementptr inbounds float, float* %3, i64 %idx.ext
  store float 0.000000e+00, float* %add.ptr, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %5 = load i32, i32* %i, align 4
  %inc = add nsw i32 %5, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; Function Attrs: nounwind
declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #7

; Function Attrs: noinline optnone uwtable
define dso_local void @_Z10ForwardSubv() #0 {
entry:
  %t = alloca i32, align 4
  %m_cuda = alloca float*, align 8
  %a_cuda = alloca float*, align 8
  %b_cuda = alloca float*, align 8
  %A = alloca i32, align 4
  %B = alloca i32, align 4
  %C = alloca i32, align 4
  %D = alloca i32, align 4
  %E = alloca i32, align 4
  %F = alloca i32, align 4
  %block_size = alloca i32, align 4
  %grid_size = alloca i32, align 4
  %dimBlock = alloca %struct.dim3, align 4
  %dimGrid = alloca %struct.dim3, align 4
  %blockSize2d =
alloca i32, align 4
  ; Continuation of the entry block of _Z10ForwardSubv (host-side forward
  ; elimination driver). From here on the function:
  ;   1. cudaMalloc's m_cuda / a_cuda (Size*Size floats) and b_cuda (Size),
  ;   2. copies @m, @a, @b into them (cudaMemcpy kind 1),
  ;   3. for t = 0 .. Size-2 launches the Fan1 stub then the Fan2 stub, with
  ;      a cudaDeviceSynchronize after each,
  ;   4. stores the elapsed loop time in microseconds in @totalKernelTime,
  ;   5. copies the three buffers back (cudaMemcpy kind 2) and cudaFree's them.
  ; NOTE(review): every cudaMalloc/cudaMemcpy/cudaFree/cudaDeviceSynchronize
  ; return value is discarded; only _Z14checkCUDAErrorPKc is consulted, once
  ; per iteration.
  %gridSize2d = alloca i32, align 4
  %dimBlockXY = alloca %struct.dim3, align 4
  %dimGridXY = alloca %struct.dim3, align 4
  %time_start = alloca %struct.timeval, align 8
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp32 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp32.coerce = alloca { i64, i32 }, align 4
  %agg.tmp36 = alloca %struct.dim3, align 4
  %agg.tmp37 = alloca %struct.dim3, align 4
  %agg.tmp36.coerce = alloca { i64, i32 }, align 4
  %agg.tmp37.coerce = alloca { i64, i32 }, align 4
  %time_end = alloca %struct.timeval, align 8
  ; A..F are written here and never read again in this function.
  store i32 1, i32* %A, align 4
  store i32 2, i32* %B, align 4
  store i32 3, i32* %C, align 4
  store i32 4, i32* %D, align 4
  store i32 5, i32* %E, align 4
  store i32 6, i32* %F, align 4
  ; cudaMalloc(&m_cuda, Size*Size*4)
  %0 = bitcast float** %m_cuda to i8**
  %1 = load i32, i32* @Size, align 4
  %2 = load i32, i32* @Size, align 4
  %mul = mul nsw i32 %1, %2
  %conv = sext i32 %mul to i64
  %mul1 = mul i64 %conv, 4
  %call = call i32 @cudaMalloc(i8** %0, i64 %mul1)
  ; cudaMalloc(&a_cuda, Size*Size*4)
  %3 = bitcast float** %a_cuda to i8**
  %4 = load i32, i32* @Size, align 4
  %5 = load i32, i32* @Size, align 4
  %mul2 = mul nsw i32 %4, %5
  %conv3 = sext i32 %mul2 to i64
  %mul4 = mul i64 %conv3, 4
  %call5 = call i32 @cudaMalloc(i8** %3, i64 %mul4)
  ; cudaMalloc(&b_cuda, Size*4)
  %6 = bitcast float** %b_cuda to i8**
  %7 = load i32, i32* @Size, align 4
  %conv6 = sext i32 %7 to i64
  %mul7 = mul i64 %conv6, 4
  %call8 = call i32 @cudaMalloc(i8** %6, i64 %mul7)
  ; cudaMemcpy(m_cuda <- @m, Size*Size*4, kind 1). Kind 1 is
  ; cudaMemcpyHostToDevice in the CUDA runtime enum.
  %8 = load float*, float** %m_cuda, align 8
  %9 = bitcast float* %8 to i8*
  %10 = load float*, float** @m, align 8
  %11 = bitcast float* %10 to i8*
  %12 = load i32, i32* @Size, align 4
  %13 = load i32, i32* @Size, align 4
  %mul9 = mul nsw i32 %12, %13
  %conv10 = sext i32 %mul9 to i64
  %mul11 = mul i64 %conv10, 4
  %call12 = call i32 @cudaMemcpy(i8* %9, i8* %11, i64 %mul11, i32 1)
  ; cudaMemcpy(a_cuda <- @a, Size*Size*4, kind 1)
  %14 = load float*, float** %a_cuda, align 8
  %15 = bitcast float* %14 to i8*
  %16 = load float*, float** @a, align 8
  %17 = bitcast float* %16 to i8*
  %18 = load i32, i32* @Size, align 4
  %19 = load i32, i32* @Size, align 4
  %mul13 = mul nsw i32 %18, %19
  %conv14 = sext i32 %mul13 to i64
  %mul15 = mul i64 %conv14, 4
  %call16 = call i32 @cudaMemcpy(i8* %15, i8* %17, i64 %mul15, i32 1)
  ; cudaMemcpy(b_cuda <- @b, Size*4, kind 1)
  %20 = load float*, float** %b_cuda, align 8
  %21 = bitcast float* %20 to i8*
  %22 = load float*, float** @b, align 8
  %23 = bitcast float* %22 to i8*
  %24 = load i32, i32* @Size, align 4
  %conv17 = sext i32 %24 to i64
  %mul18 = mul i64 %conv17, 4
  %call19 = call i32 @cudaMemcpy(i8* %21, i8* %23, i64 %mul18, i32 1)
  ; 1-D launch shape: block_size = 512 and
  ; grid_size = Size/512 + (Size % 512 != 0 ? 1 : 0).
  ; %29 is computed but never used.
  store i32 512, i32* %block_size, align 4
  %25 = load i32, i32* @Size, align 4
  %26 = load i32, i32* %block_size, align 4
  %div = sdiv i32 %25, %26
  %27 = load i32, i32* @Size, align 4
  %28 = load i32, i32* %block_size, align 4
  %rem = srem i32 %27, %28
  %tobool = icmp ne i32 %rem, 0
  %lnot = xor i1 %tobool, true
  %29 = zext i1 %lnot to i64
  %cond = select i1 %lnot, i32 0, i32 1
  %add = add nsw i32 %div, %cond
  store i32 %add, i32* %grid_size, align 4
  %30 = load i32, i32* %grid_size, align 4
  %call20 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.54, i64 0, i64 0), i32 %30)
  ; dimBlock = dim3(block_size, 1, 1); dimGrid = dim3(grid_size, 1, 1)
  %31 = load i32, i32* %block_size, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 %31, i32 1, i32 1)
  %32 = load i32, i32* %grid_size, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %32, i32 1, i32 1)
  ; 2-D launch shape: blockSize2d is the constant 1, so
  ; gridSize2d = Size/1 + (Size % 1 != 0 ? 1 : 0). The select/icmp/xor
  ; chain below is a double negation of %tobool23. %37 is never used.
  store i32 1, i32* %blockSize2d, align 4
  %33 = load i32, i32* @Size, align 4
  %34 = load i32, i32* %blockSize2d, align 4
  %div21 = sdiv i32 %33, %34
  %35 = load i32, i32* @Size, align 4
  %36 = load i32, i32* %blockSize2d, align 4
  %rem22 = srem i32 %35, %36
  %tobool23 = icmp ne i32 %rem22, 0
  %37 = zext i1 %tobool23 to i64
  %cond24 = select i1 %tobool23, i32 0, i32 1
  %tobool25 = icmp ne i32 %cond24, 0
  %lnot26 = xor i1 %tobool25, true
  %conv27 = zext i1 %lnot26 to i32
  %add28 = add nsw i32 %div21, %conv27
  store i32 %add28, i32* %gridSize2d, align 4
  ; dimBlockXY = dim3(blockSize2d, blockSize2d, 1)
  %38 = load i32, i32* %blockSize2d, align 4
  %39 = load i32, i32* %blockSize2d, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlockXY, i32 %38, i32 %39, i32 1)
  %40 = load i32, i32* %blockSize2d, align 4
  %call29 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.55, i64 0, i64 0), i32 %40)
  ; dimGridXY = dim3(gridSize2d, gridSize2d, 1)
  %41 = load i32, i32* %gridSize2d, align 4
  %42 = load i32, i32* %gridSize2d, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGridXY, i32 %41, i32 %42, i32 1)
  %43 = load i32, i32* %grid_size, align 4
  %44 = load i32, i32* %gridSize2d, align 4
  %call30 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.56, i64 0, i64 0), i32 %43, i32 %44)
  ; Start of the timed region.
  %call31 = call i32 @gettimeofday(%struct.timeval* %time_start, %struct.timezone* null) #1
  store i32 0, i32* %t, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while t < Size - 1.
  %45 = load i32, i32* %t, align 4
  %46 = load i32, i32* @Size, align 4
  %sub = sub nsw i32 %46, 1
  %cmp = icmp slt i32 %45, %sub
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Launch configuration for Fan1: grid = dimGrid, block = dimBlock. Each
  ; 12-byte dim3 is copied to a temporary, then re-read as an { i64, i32 }
  ; pair so it can be passed as two scalar arguments.
  %47 = bitcast %struct.dim3* %agg.tmp to i8*
  %48 = bitcast %struct.dim3* %dimGrid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %47, i8* align 4 %48, i64 12, i1 false)
  %49 = bitcast %struct.dim3* %agg.tmp32 to i8*
  %50 = bitcast %struct.dim3* %dimBlock to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %49, i8* align 4 %50, i64 12, i1 false)
  %51 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %52 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %51, i8* align 4 %52, i64 12, i1 false)
  %53 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %54 = load i64, i64* %53, align 4
  %55 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %56 = load i32, i32* %55, align 4
  %57 = bitcast { i64, i32 }* %agg.tmp32.coerce to i8*
  %58 = bitcast %struct.dim3* %agg.tmp32 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %57, i8* align 4 %58, i64 12, i1 false)
  %59 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 0
  %60 = load i64, i64* %59, align 4
  %61 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp32.coerce, i32 0, i32 1
  %62 = load i32, i32* %61, align 4
  ; Push (grid, block, shared-mem size 0, null stream). A non-zero return
  ; skips the Fan1 stub call.
  %call33 = call i32 @__cudaPushCallConfiguration(i64 %54, i32 %56, i64 %60, i32 %62, i64 0, i8* null)
  %tobool34 = icmp ne i32 %call33, 0
  br i1 %tobool34, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %for.body
  ; Fan1(m_cuda, a_cuda, Size, t)
  %63 = load float*, float** %m_cuda, align 8
  %64 = load float*, float** %a_cuda, align 8
  %65 = load i32, i32* @Size, align 4
  %66 = load i32, i32* %t, align 4
  call void @_Z4Fan1PfS_ii(float* %63, float* %64, i32 %65, i32 %66)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %for.body
  %call35 = call i32 @cudaDeviceSynchronize()
  ; Launch configuration for Fan2: grid = dimGridXY, block = dimBlockXY,
  ; coerced the same way as above.
  %67 = bitcast %struct.dim3* %agg.tmp36 to i8*
  %68 = bitcast %struct.dim3* %dimGridXY to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %67, i8* align 4 %68, i64 12, i1 false)
  %69 = bitcast %struct.dim3* %agg.tmp37 to i8*
  %70 = bitcast %struct.dim3* %dimBlockXY to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %69, i8* align 4 %70, i64 12, i1 false)
  %71 = bitcast { i64, i32 }* %agg.tmp36.coerce to i8*
  %72 = bitcast %struct.dim3* %agg.tmp36 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %71, i8* align 4 %72, i64 12, i1 false)
  %73 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 0
  %74 = load i64, i64* %73, align 4
  %75 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp36.coerce, i32 0, i32 1
  %76 = load i32, i32* %75, align 4
  %77 = bitcast { i64, i32 }* %agg.tmp37.coerce to i8*
  %78 = bitcast %struct.dim3* %agg.tmp37 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %77, i8* align 4 %78, i64 12, i1 false)
  %79 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp37.coerce, i32 0, i32 0
  %80 = load i64, i64* %79, align 4
  %81 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp37.coerce, i32 0, i32 1
  %82 = load i32, i32* %81, align 4
  %call38 = call i32 @__cudaPushCallConfiguration(i64 %74, i32 %76, i64 %80, i32 %82, i64 0, i8* null)
  %tobool39 = icmp ne i32 %call38, 0
  br i1 %tobool39, label %kcall.end42, label %kcall.configok40

kcall.configok40:                                 ; preds = %kcall.end
  ; Fan2(m_cuda, a_cuda, b_cuda, Size, Size - t, t)
  %83 = load float*, float** %m_cuda, align 8
  %84 = load float*, float** %a_cuda, align 8
  %85 = load float*, float** %b_cuda, align 8
  %86 = load i32, i32* @Size, align 4
  %87 = load i32, i32* @Size, align 4
  %88 = load i32, i32* %t, align 4
  %sub41 = sub nsw i32 %87, %88
  %89 = load i32, i32* %t, align 4
  call void @_Z4Fan2PfS_S_iii(float* %83, float* %84, float* %85, i32 %86, i32 %sub41, i32 %89)
  br label %kcall.end42

kcall.end42:                                      ; preds = %kcall.configok40, %kcall.end
  %call43 = call i32 @cudaDeviceSynchronize()
  call void @_Z14checkCUDAErrorPKc(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.57, i64 0, i64 0))
  br label %for.inc

for.inc:                                          ; preds = %kcall.end42
  %90 = load i32, i32* %t, align 4
  %inc = add nsw i32 %90, 1
  store i32 %inc, i32* %t, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ; End of the timed region. Elapsed time is
  ; (end.sec*1000000 + end.usec) - (start.sec*1000000 + start.usec),
  ; truncated from i64 to i32 and stored in @totalKernelTime.
  %call44 = call i32 @gettimeofday(%struct.timeval* %time_end, %struct.timezone* null) #1
  %tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 0
  %91 = load i64, i64* %tv_sec, align 8
  %mul45 = mul nsw i64 %91, 1000000
  %tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %time_end, i32 0, i32 1
  %92 = load i64, i64* %tv_usec, align 8
  %add46 = add nsw i64 %mul45, %92
  %tv_sec47 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 0
  %93 = load i64, i64* %tv_sec47, align 8
  %mul48 = mul nsw i64 %93, 1000000
  %tv_usec49 = getelementptr inbounds %struct.timeval, %struct.timeval* %time_start, i32 0, i32 1
  %94 = load i64, i64* %tv_usec49, align 8
  %add50 = add nsw i64 %mul48, %94
  %sub51 = sub nsw i64 %add46, %add50
  %conv52 = trunc i64 %sub51 to i32
  store i32 %conv52, i32* @totalKernelTime, align 4
  ; cudaMemcpy(@m <- m_cuda, Size*Size*4, kind 2). Kind 2 is
  ; cudaMemcpyDeviceToHost in the CUDA runtime enum.
  %95 = load float*, float** @m, align 8
  %96 = bitcast float* %95 to i8*
  %97 = load float*, float** %m_cuda, align 8
  %98 = bitcast float* %97 to i8*
  %99 = load i32, i32* @Size, align 4
  %100 = load i32, i32* @Size, align 4
  %mul53 = mul nsw i32 %99, %100
  %conv54 = sext i32 %mul53 to i64
  %mul55 = mul i64 %conv54, 4
  %call56 = call i32 @cudaMemcpy(i8* %96, i8* %98, i64 %mul55, i32 2)
  ; cudaMemcpy(@a <- a_cuda, Size*Size*4, kind 2)
  %101 = load float*, float** @a, align 8
  %102 = bitcast float* %101 to i8*
  %103 = load float*, float** %a_cuda, align 8
  %104 = bitcast float* %103 to i8*
  %105 = load i32, i32* @Size, align 4
  %106 = load i32, i32* @Size, align 4
  %mul57 = mul nsw i32 %105, %106
  %conv58 = sext i32 %mul57 to i64
  %mul59 = mul i64 %conv58, 4
  %call60 = call i32 @cudaMemcpy(i8* %102, i8* %104, i64 %mul59, i32 2)
  ; cudaMemcpy(@b <- b_cuda, Size*4, kind 2)
  %107 = load float*, float** @b, align 8
  %108 = bitcast float* %107 to i8*
  %109 = load float*, float** %b_cuda, align 8
  %110 = bitcast float* %109 to i8*
  %111 = load i32, i32* @Size, align 4
  %conv61 = sext i32 %111 to i64
  %mul62 = mul i64 %conv61, 4
  %call63 = call i32 @cudaMemcpy(i8* %108, i8* %110, i64 %mul62, i32 2)
  ; Release the three buffers obtained from cudaMalloc above.
  %112 = load float*, float** %m_cuda, align 8
  %113 = bitcast float* %112 to i8*
  %call64 = call i32 @cudaFree(i8* %113)
  %114 = load float*, float** %a_cuda, align 8
  %115 = bitcast float* %114 to i8*
  %call65 = call i32 @cudaFree(i8* %115)
  %116 = load float*, float** %b_cuda, align 8
  %117 = bitcast float* %116 to i8*
  %call66 = call i32 @cudaFree(i8* %117)
  ret void
}

; _Z8PrintMatPfii(ary, nrow, ncol): as compiled here the body only spills
; its three arguments to stack slots and returns; nothing is printed and
; the %i / %j slots are never used.
; NOTE(review): callers invoke this between "matrix" printf headers, so the
; empty body looks like a deliberately disabled print loop - confirm against
; gaussian.cu.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z8PrintMatPfii(float* %ary, i32 %nrow, i32 %ncol) #2 {
entry:
  %ary.addr = alloca float*, align 8
  %nrow.addr = alloca i32, align 4
  %ncol.addr = alloca i32, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  store float* %ary, float** %ary.addr, align 8
  store i32 %nrow, i32* %nrow.addr, align 4
  store i32 %ncol, i32* %ncol.addr, align 4
  ret void
}

; _Z8PrintAryPfi(ary, ary_size): prints ary[0 .. ary_size-1], each element
; widened to double and formatted with @.str.59, followed by one printf of
; @.str.60.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z8PrintAryPfi(float* %ary, i32 %ary_size) #0 {
entry:
  %ary.addr = alloca float*, align 8
  %ary_size.addr = alloca i32, align 4
  %i = alloca i32,
align 4
  ; Continuation of the entry block of _Z8PrintAryPfi: spill the arguments
  ; and start the element loop at i = 0.
  store float* %ary, float** %ary.addr, align 8
  store i32 %ary_size, i32* %ary_size.addr, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < ary_size.
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* %ary_size.addr, align 4
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; printf(@.str.59, (double)ary[i]). The float is widened because it is
  ; passed through a variadic call.
  %2 = load float*, float** %ary.addr, align 8
  %3 = load i32, i32* %i, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds float, float* %2, i64 %idxprom
  %4 = load float, float* %arrayidx, align 4
  %conv = fpext float %4 to double
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.59, i64 0, i64 0), double %conv)
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %5 = load i32, i32* %i, align 4
  %inc = add nsw i32 %5, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ; Trailer printed once after all elements.
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.60, i64 0, i64 0))
  ret void
}

; _Z7BackSubv: host-side back substitution. Allocates @finalVec
; (Size floats) and, for i = 0 .. Size-1 with r = Size-i-1, computes
;   finalVec[r] = b[r]
;   for j = 0 .. i-1:  finalVec[r] -= a[Size*r + (Size-j-1)] * finalVec[Size-j-1]
;   finalVec[r] /= a[Size*r + r]
; i.e. rows are processed from the last to the first.
; NOTE(review): the malloc result is not checked, and each call overwrites
; @finalVec without freeing a previous allocation.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z7BackSubv() #2 {
entry:
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  ; finalVec = malloc(Size * 4)
  %0 = load i32, i32* @Size, align 4
  %conv = sext i32 %0 to i64
  %mul = mul i64 %conv, 4
  %call = call noalias i8* @malloc(i64 %mul) #1
  %1 = bitcast i8* %call to float*
  store float* %1, float** @finalVec, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc43, %entry
  ; Outer loop while i < Size.
  %2 = load i32, i32* %i, align 4
  %3 = load i32, i32* @Size, align 4
  %cmp = icmp slt i32 %2, %3
  br i1 %cmp, label %for.body, label %for.end45

for.body:                                         ; preds = %for.cond
  ; finalVec[Size-i-1] = b[Size-i-1]
  %4 = load float*, float** @b, align 8
  %5 = load i32, i32* @Size, align 4
  %6 = load i32, i32* %i, align 4
  %sub = sub nsw i32 %5, %6
  %sub1 = sub nsw i32 %sub, 1
  %idxprom = sext i32 %sub1 to i64
  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
  %7 = load float, float* %arrayidx, align 4
  %8 = load float*, float** @finalVec, align 8
  %9 = load i32, i32* @Size, align 4
  %10 = load i32, i32* %i, align 4
  %sub2 = sub nsw i32 %9, %10
  %sub3 = sub nsw i32 %sub2, 1
  %idxprom4 = sext i32 %sub3 to i64
  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
  store float %7, float* %arrayidx5, align 4
  store i32 0, i32* %j, align 4
  br label %for.cond6

for.cond6:                                        ; preds = %for.inc, %for.body
  ; Inner loop while j < i.
  %11 = load i32, i32* %j, align 4
  %12 = load i32, i32* %i, align 4
  %cmp7 = icmp slt i32 %11, %12
  br i1 %cmp7, label %for.body8, label %for.end

for.body8:                                        ; preds = %for.cond6
  ; %19 = a[Size*(Size-i-1) + (Size-j-1)]
  %13 = load float*, float** @a, align 8
  %14 = load i32, i32* @Size, align 4
  %15 = load i32, i32* @Size, align 4
  %16 = load i32, i32* %i, align 4
  %sub9 = sub nsw i32 %15, %16
  %sub10 = sub nsw i32 %sub9, 1
  %mul11 = mul nsw i32 %14, %sub10
  %idx.ext = sext i32 %mul11 to i64
  %add.ptr = getelementptr inbounds float, float* %13, i64 %idx.ext
  %17 = load i32, i32* @Size, align 4
  %18 = load i32, i32* %j, align 4
  %sub12 = sub nsw i32 %17, %18
  %sub13 = sub nsw i32 %sub12, 1
  %idx.ext14 = sext i32 %sub13 to i64
  %add.ptr15 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext14
  %19 = load float, float* %add.ptr15, align 4
  ; %23 = finalVec[Size-j-1] (a component solved in an earlier outer pass)
  %20 = load float*, float** @finalVec, align 8
  %21 = load i32, i32* @Size, align 4
  %22 = load i32, i32* %j, align 4
  %sub16 = sub nsw i32 %21, %22
  %sub17 = sub nsw i32 %sub16, 1
  %idxprom18 = sext i32 %sub17 to i64
  %arrayidx19 = getelementptr inbounds float, float* %20, i64 %idxprom18
  %23 = load float, float* %arrayidx19, align 4
  %mul20 = fmul contract float %19, %23
  ; finalVec[Size-i-1] -= %19 * %23
  %24 = load float*, float** @finalVec, align 8
  %25 = load i32, i32* @Size, align 4
  %26 = load i32, i32* %i, align 4
  %sub21 = sub nsw i32 %25, %26
  %sub22 = sub nsw i32 %sub21, 1
  %idxprom23 = sext i32 %sub22 to i64
  %arrayidx24 = getelementptr inbounds float, float* %24, i64 %idxprom23
  %27 = load float, float* %arrayidx24, align 4
  %sub25 = fsub contract float %27, %mul20
  store float %sub25, float* %arrayidx24, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body8
  %28 = load i32, i32* %j, align 4
  %inc = add nsw i32 %28, 1
  store i32 %inc, i32* %j, align 4
  br label %for.cond6

for.end:                                          ; preds = %for.cond6
  ; finalVec[r] = finalVec[r] / a[Size*r + r], with r = Size-i-1. There is
  ; no guard against a zero diagonal element.
  %29 = load float*, float** @finalVec, align 8
  %30 = load i32, i32* @Size, align 4
  %31 = load i32, i32* %i, align 4
  %sub26 = sub nsw i32 %30, %31
  %sub27 = sub nsw i32 %sub26, 1
  %idxprom28 = sext i32 %sub27 to i64
  %arrayidx29 = getelementptr inbounds float, float* %29, i64 %idxprom28
  %32 = load float, float* %arrayidx29, align 4
  %33 = load float*, float** @a, align 8
  %34 = load i32, i32* @Size, align 4
  %35 = load i32, i32* @Size, align 4
  %36 = load i32, i32* %i, align 4
  %sub30 = sub nsw i32 %35, %36
  %sub31 = sub nsw i32 %sub30, 1
  %mul32 = mul nsw i32 %34, %sub31
  %idx.ext33 = sext i32 %mul32 to i64
  %add.ptr34 = getelementptr inbounds float, float* %33, i64 %idx.ext33
  %37 = load i32, i32* @Size, align 4
  %38 = load i32, i32* %i, align 4
  %sub35 = sub nsw i32 %37, %38
  %sub36 = sub nsw i32 %sub35, 1
  %idx.ext37 = sext i32 %sub36 to i64
  %add.ptr38 = getelementptr inbounds float, float* %add.ptr34, i64 %idx.ext37
  %39 = load float, float* %add.ptr38, align 4
  %div = fdiv float %32, %39
  %40 = load float*, float** @finalVec, align 8
  %41 = load i32, i32* @Size, align 4
  %42 = load i32, i32* %i, align 4
  %sub39 = sub nsw i32 %41, %42
  %sub40 = sub nsw i32 %sub39, 1
  %idxprom41 = sext i32 %sub40 to i64
  %arrayidx42 = getelementptr inbounds float, float* %40, i64 %idxprom41
  store float %div, float* %arrayidx42, align 4
  br label %for.inc43

for.inc43:                                        ; preds = %for.end
  %43 = load i32, i32* %i, align 4
  %inc44 = add nsw i32 %43, 1
  store i32 %inc44, i32* %i, align 4
  br label %for.cond

for.end45:                                        ; preds = %for.cond
  ret void
}

; Function Attrs: nounwind
declare dso_local void @free(i8*) #7

declare dso_local i32 @cudaGetDeviceCount(i32*) #4

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #8

declare dso_local i32 @cudaGetDeviceProperties(%struct.cudaDeviceProp*,
i32) #4

declare dso_local i8* @cudaGetErrorString(i32) #4

declare dso_local i32 @cudaGetLastError() #4

declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #4

declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) #4

; _Z7InitMatPfii(ary, nrow, ncol): reads nrow*ncol values from the stream
; in @fp with fscanf(@.str.58), storing the value for row i, column j at
; ary[Size*i + j].
; NOTE(review): the row stride is the global @Size rather than %ncol. The
; visible caller passes Size for both dimensions, so the two coincide
; there - confirm before calling with ncol != Size.
; NOTE(review): the fscanf return value is ignored, so a short or malformed
; file leaves the remaining elements unwritten.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z7InitMatPfii(float* %ary, i32 %nrow, i32 %ncol) #0 {
entry:
  %ary.addr = alloca float*, align 8
  %nrow.addr = alloca i32, align 4
  %ncol.addr = alloca i32, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  store float* %ary, float** %ary.addr, align 8
  store i32 %nrow, i32* %nrow.addr, align 4
  store i32 %ncol, i32* %ncol.addr, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc6, %entry
  ; Row loop while i < nrow.
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* %nrow.addr, align 4
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %for.body, label %for.end8

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond1

for.cond1:                                        ; preds = %for.inc, %for.body
  ; Column loop while j < ncol.
  %2 = load i32, i32* %j, align 4
  %3 = load i32, i32* %ncol.addr, align 4
  %cmp2 = icmp slt i32 %2, %3
  br i1 %cmp2, label %for.body3, label %for.end

for.body3:                                        ; preds = %for.cond1
  ; fscanf(fp, @.str.58, ary + Size*i + j)
  %4 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %5 = load float*, float** %ary.addr, align 8
  %6 = load i32, i32* @Size, align 4
  %7 = load i32, i32* %i, align 4
  %mul = mul nsw i32 %6, %7
  %idx.ext = sext i32 %mul to i64
  %add.ptr = getelementptr inbounds float, float* %5, i64 %idx.ext
  %8 = load i32, i32* %j, align 4
  %idx.ext4 = sext i32 %8 to i64
  %add.ptr5 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext4
  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %4, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.58, i64 0, i64 0), float* %add.ptr5)
  br label %for.inc

for.inc:                                          ; preds = %for.body3
  %9 = load i32, i32* %j, align 4
  %inc = add nsw i32 %9, 1
  store i32 %inc, i32* %j, align 4
  br label %for.cond1

for.end:                                          ; preds = %for.cond1
  br label %for.inc6

for.inc6:                                         ; preds = %for.end
  %10 = load i32, i32* %i, align 4
  %inc7 = add nsw i32 %10, 1
  store i32 %inc7, i32* %i, align 4
  br label %for.cond

for.end8:                                         ; preds = %for.cond
  ret void
}

; _Z7InitAryPfi(ary, ary_size): reads ary_size values from the stream in
; @fp with fscanf(@.str.58) into ary[0 .. ary_size-1]. The fscanf return
; value is ignored.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z7InitAryPfi(float* %ary, i32 %ary_size) #0 {
entry:
  %ary.addr = alloca float*, align 8
  %ary_size.addr = alloca i32, align 4
  %i = alloca i32, align 4
  store float* %ary, float** %ary.addr, align 8
  store i32 %ary_size, i32* %ary_size.addr, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < ary_size.
  %0 = load i32, i32* %i, align 4
  %1 = load i32, i32* %ary_size.addr, align 4
  %cmp = icmp slt i32 %0, %1
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; fscanf(fp, @.str.58, &ary[i])
  %2 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %3 = load float*, float** %ary.addr, align 8
  %4 = load i32, i32* %i, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds float, float* %3, i64 %idxprom
  %call = call i32 (%struct._IO_FILE*, i8*, ...)
@fscanf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.58, i64 0, i64 0), float* %arrayidx) br label %for.inc for.inc: ; preds = %for.body %5 = load i32, i32* %i, align 4 %inc = add nsw i32 %5, 1 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; Function Attrs: noinline optnone uwtable define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 { entry: %m_cuda.addr = alloca float*, align 8 %a_cuda.addr = alloca float*, align 8 %Size.addr = alloca i32, align 4 %t.addr = alloca i32, align 4 %grid_dim = alloca %struct.dim3, align 8 %block_dim = alloca %struct.dim3, align 8 %shmem_size = alloca i64, align 8 %stream = alloca i8*, align 8 %grid_dim.coerce = alloca { i64, i32 }, align 8 %block_dim.coerce = alloca { i64, i32 }, align 8 store float* %m_cuda, float** %m_cuda.addr, align 8 store float* %a_cuda, float** %a_cuda.addr, align 8 store i32 %Size, i32* %Size.addr, align 4 store i32 %t, i32* %t.addr, align 4 %kernel_args = alloca i8*, i64 4, align 16 %0 = bitcast float** %m_cuda.addr to i8* %1 = getelementptr i8*, i8** %kernel_args, i32 0 store i8* %0, i8** %1 %2 = bitcast float** %a_cuda.addr to i8* %3 = getelementptr i8*, i8** %kernel_args, i32 1 store i8* %2, i8** %3 %4 = bitcast i32* %Size.addr to i8* %5 = getelementptr i8*, i8** %kernel_args, i32 2 store i8* %4, i8** %5 %6 = bitcast i32* %t.addr to i8* %7 = getelementptr i8*, i8** %kernel_args, i32 3 store i8* %6, i8** %7 %8 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) %9 = load i64, i64* %shmem_size, align 8 %10 = load i8*, i8** %stream, align 8 %11 = bitcast { i64, i32 }* %grid_dim.coerce to i8* %12 = bitcast %struct.dim3* %grid_dim to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 12, i1 false) %13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 %14 = load i64, 
i64* %13, align 8 %15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 %16 = load i32, i32* %15, align 8 %17 = bitcast { i64, i32 }* %block_dim.coerce to i8* %18 = bitcast %struct.dim3* %block_dim to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 12, i1 false) %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 %20 = load i64, i64* %19, align 8 %21 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 %22 = load i32, i32* %21, align 8 %23 = bitcast i8* %10 to %struct.CUstream_st* %call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i64 %14, i32 %16, i64 %20, i32 %22, i8** %kernel_args, i64 %9, %struct.CUstream_st* %23) br label %setup.end setup.end: ; preds = %entry ret void } declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #8 ; Function Attrs: noinline optnone uwtable define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 { entry: %m_cuda.addr = alloca float*, align 8 %a_cuda.addr = alloca float*, align 8 %b_cuda.addr = alloca float*, align 8 %Size.addr = alloca i32, align 4 %j1.addr = alloca i32, align 4 %t.addr = alloca i32, align 4 %grid_dim = alloca %struct.dim3, align 8 %block_dim = alloca %struct.dim3, align 8 %shmem_size = alloca i64, align 8 %stream = alloca i8*, align 8 %grid_dim.coerce = alloca { i64, i32 }, align 8 %block_dim.coerce = alloca { i64, i32 }, align 8 store float* %m_cuda, float** %m_cuda.addr, align 8 store float* %a_cuda, float** %a_cuda.addr, align 8 store float* %b_cuda, float** 
%b_cuda.addr, align 8 store i32 %Size, i32* %Size.addr, align 4 store i32 %j1, i32* %j1.addr, align 4 store i32 %t, i32* %t.addr, align 4 %kernel_args = alloca i8*, i64 6, align 16 %0 = bitcast float** %m_cuda.addr to i8* %1 = getelementptr i8*, i8** %kernel_args, i32 0 store i8* %0, i8** %1 %2 = bitcast float** %a_cuda.addr to i8* %3 = getelementptr i8*, i8** %kernel_args, i32 1 store i8* %2, i8** %3 %4 = bitcast float** %b_cuda.addr to i8* %5 = getelementptr i8*, i8** %kernel_args, i32 2 store i8* %4, i8** %5 %6 = bitcast i32* %Size.addr to i8* %7 = getelementptr i8*, i8** %kernel_args, i32 3 store i8* %6, i8** %7 %8 = bitcast i32* %j1.addr to i8* %9 = getelementptr i8*, i8** %kernel_args, i32 4 store i8* %8, i8** %9 %10 = bitcast i32* %t.addr to i8* %11 = getelementptr i8*, i8** %kernel_args, i32 5 store i8* %10, i8** %11 %12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream) %13 = load i64, i64* %shmem_size, align 8 %14 = load i8*, i8** %stream, align 8 %15 = bitcast { i64, i32 }* %grid_dim.coerce to i8* %16 = bitcast %struct.dim3* %grid_dim to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false) %17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0 %18 = load i64, i64* %17, align 8 %19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1 %20 = load i32, i32* %19, align 8 %21 = bitcast { i64, i32 }* %block_dim.coerce to i8* %22 = bitcast %struct.dim3* %block_dim to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false) %23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0 %24 = load i64, i64* %23, align 8 %25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1 %26 = load i32, i32* %25, align 8 %27 = bitcast i8* %14 to %struct.CUstream_st* %call = call i32 @cudaLaunchKernel(i8* bitcast 
(void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27) br label %setup.end setup.end: ; preds = %entry ret void } declare dso_local i32 @cudaMalloc(i8**, i64) #4 declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #4 ; Function Attrs: noinline nounwind optnone uwtable define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 { entry: %this.addr = alloca %struct.dim3*, align 8 %vx.addr = alloca i32, align 4 %vy.addr = alloca i32, align 4 %vz.addr = alloca i32, align 4 store %struct.dim3* %this, %struct.dim3** %this.addr, align 8 store i32 %vx, i32* %vx.addr, align 4 store i32 %vy, i32* %vy.addr, align 4 store i32 %vz, i32* %vz.addr, align 4 %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8 %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0 %0 = load i32, i32* %vx.addr, align 4 store i32 %0, i32* %x, align 4 %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1 %1 = load i32, i32* %vy.addr, align 4 store i32 %1, i32* %y, align 4 %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2 %2 = load i32, i32* %vz.addr, align 4 store i32 %2, i32* %z, align 4 ret void } declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4 declare dso_local i32 @cudaDeviceSynchronize() #4 ; Function Attrs: noinline optnone uwtable define dso_local void @_Z14checkCUDAErrorPKc(i8* %msg) #0 { entry: %msg.addr = alloca i8*, align 8 %err = alloca i32, align 4 store i8* %msg, i8** %msg.addr, align 8 %call = call i32 @cudaGetLastError() store i32 %call, i32* %err, align 4 %0 = load i32, i32* %err, align 4 %cmp = icmp ne i32 0, %0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry %1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 %2 = load i8*, i8** %msg.addr, align 8 %3 = 
load i32, i32* %err, align 4 %call1 = call i8* @cudaGetErrorString(i32 %3) %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.61, i64 0, i64 0), i8* %2, i8* %call1) call void @exit(i32 1) #9 unreachable if.end: ; preds = %entry ret void } declare dso_local i32 @cudaFree(i8*) #4 declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #4 ; Function Attrs: nounwind declare dso_local float @expf(float) #7 define internal void @__cuda_register_globals(i8** %0) { entry: %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii to i8*), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null) ret void } declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*) declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32) declare dso_local i8** @__cudaRegisterFatBinary(i8*) define internal void @__cuda_module_ctor(i8* %0) { entry: %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*)) store i8** %1, i8*** @__cuda_gpubin_handle, align 8 call void @__cuda_register_globals(i8** %1) call void @__cudaRegisterFatBinaryEnd(i8** %1) %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor) ret void } declare dso_local void @__cudaRegisterFatBinaryEnd(i8**) declare dso_local void @__cudaUnregisterFatBinary(i8**) define internal void @__cuda_module_dtor(i8* %0) { entry: %1 = 
load i8**, i8*** @__cuda_gpubin_handle, align 8 call void @__cudaUnregisterFatBinary(i8** %1) ret void } declare dso_local i32 @atexit(void (i8*)*) attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #3 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" 
"target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #6 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #7 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #8 = { argmemonly nounwind willreturn } attributes #9 = { noreturn nounwind } attributes #10 = { nounwind readonly } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]} !1 = !{i32 1, !"wchar_size", i32 4} !2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}