; CuPBoP/examples/srad_v2/srad-host-x86_64-unknown-li...
;
; 963 lines
; 291 KiB
; LLVM
; Raw Normal View History
;
; 2022-05-04 20:59:38 +08:00
; ModuleID = 'srad-host-x86_64-unknown-linux-gnu.bc'
; Host-side module: the original translation unit is srad.cu and the code
; targets x86_64 Linux (see the datalayout/triple below).
source_filename = "srad.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Named struct types used by the functions in this module.
; %struct._IO_FILE / %struct._IO_marker: layout behind the FILE* held in @stderr.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; %struct.dim3: three i32 fields (12 bytes); the launch code copies it into
; { i64, i32 } temporaries before passing it by value.
%struct.dim3 = type { i32, i32, i32 }
; Opaque: only ever used through a pointer in this module.
%struct.CUstream_st = type opaque
; COMDAT groups for the two linkonce-style helpers (std::exp(float) and the
; dim3 constructor) whose definitions lie further down in the file.
$_ZSt3expf = comdat any
$_ZN4dim3C2Ejjj = comdat any
; External FILE* used as the destination of every fprintf in this module.
@stderr = external dso_local global %struct._IO_FILE*, align 8
; @.str .. @.str.8: usage text printed by @_Z5usageiPPc; @.str takes one %s.
; NOTE(review): the visible payloads of @.str.1, @.str.2, @.str.4, @.str.5,
; @.str.6, @.str.7 and @.str.8 are shorter than their declared [N x i8]
; lengths (e.g. @.str.1 shows 26 bytes but is declared as 28). Runs of spaces
; appear to have been collapsed when this text was captured -- restore the
; literals from the original bitcode before assembling; TODO confirm.
@.str = private unnamed_addr constant [67 x i8] c"Usage: %s <rows> <cols> <y1> <y2> <x1> <x2> <lamda> <no. of iter>\0A\00", align 1
@.str.1 = private unnamed_addr constant [28 x i8] c"\09<rows> - number of rows\0A\00", align 1
@.str.2 = private unnamed_addr constant [29 x i8] c"\09<cols> - number of cols\0A\00", align 1
@.str.3 = private unnamed_addr constant [35 x i8] c"\09<y1> \09 - y1 value of the speckle\0A\00", align 1
@.str.4 = private unnamed_addr constant [38 x i8] c"\09<y2> - y2 value of the speckle\0A\00", align 1
@.str.5 = private unnamed_addr constant [39 x i8] c"\09<x1> - x1 value of the speckle\0A\00", align 1
@.str.6 = private unnamed_addr constant [39 x i8] c"\09<x2> - x2 value of the speckle\0A\00", align 1
@.str.7 = private unnamed_addr constant [27 x i8] c"\09<lamda> - lambda (0,1)\0A\00", align 1
@.str.8 = private unnamed_addr constant [41 x i8] c"\09<no. of iter> - number of iterations\0A\00", align 1
; @.str.9: banner printed by @main with two i32 arguments (16, 16).
@.str.9 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1
; @.str.10: error printed by @_Z7runTestiPPc when rows or cols is not a multiple of 16.
@.str.10 = private unnamed_addr constant [39 x i8] c"rows and cols must be multiples of 16\0A\00", align 1
; @.str.11 .. @.str.16: progress and output format strings.
@.str.11 = private unnamed_addr constant [30 x i8] c"Randomizing the input matrix\0A\00", align 1
@.str.12 = private unnamed_addr constant [26 x i8] c"Start the SRAD main loop\0A\00", align 1
@.str.13 = private unnamed_addr constant [18 x i8] c"Printing Output:\0A\00", align 1
@.str.14 = private unnamed_addr constant [6 x i8] c"%.5f \00", align 1
@.str.15 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.16 = private unnamed_addr constant [18 x i8] c"Computation Done\0A\00", align 1
; @0 / @1: NUL-terminated mangled names of the two kernels whose host stubs
; are defined in this module.
@0 = private unnamed_addr constant [31 x i8] c"_Z11srad_cuda_1PfS_S_S_S_S_iif\00", align 1
@1 = private unnamed_addr constant [32 x i8] c"_Z11srad_cuda_2PfS_S_S_S_S_iiff\00", align 1
@2 = private constant [94817 x i8] c"P\EDU\BA\01\00\10\00Pr\01\00\00\00\00\00\02\00\01\01@\00\00\00HG\01\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\A0F\01\00\00\00\00\00\E0B\01\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0F\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00blockIdx\00threadIdx\00gridDim\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c__1225\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c__1227\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp__1229\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result__1231\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp__1233\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00_param\00_Z11srad_cuda_1PfS_S_S_S_S_iif\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00__ocg_const\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm20_dblrcp_rn_slowpath_v3\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp__199\00$___ZZ11srad_cuda_1PfS_S_S_S_S_
iifE11temp_result__201\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north__203\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south__205\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east__207\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west__209\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif`\01\00\00\00\00\00\00|x\00\00\00\04\11\08\00\0F\00\00\00x\00\00\00\010\00\00\01*\00\00
; Descriptor placed in section ".nvFatBinSegment": { i32 1180844977 (0x466243B1),
; i32 1, pointer to the first byte of the 94817-byte array @2, null }.
; NOTE(review): the first two fields look like the fatbin wrapper magic and
; version expected by the CUDA runtime -- confirm against the runtime headers.
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([94817 x i8], [94817 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
; Module-private i8** slot, null until something stores to it (no store is
; visible in this part of the file).
@__cuda_gpubin_handle = internal global i8** null, align 8
; Single static-constructor entry: priority 65535, calling @__cuda_module_ctor
; (declared as void (i8*) and cast to void ()), with no associated data.
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]
; Host-side launch stub for srad_cuda_1(float*, float*, float*, float*,
; float*, float*, int, int, float).
;
; Spills the nine arguments to stack slots, builds a 9-entry array holding the
; address of each slot, retrieves the launch configuration with
; __cudaPopCallConfiguration, and calls cudaLaunchKernel using this function's
; own address as the kernel handle. The i32 results of both runtime calls are
; left unused, and the function returns void.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %q0sqr) #0 {
entry:
; One stack slot per incoming argument.
%E_C.addr = alloca float*, align 8
%W_C.addr = alloca float*, align 8
%N_C.addr = alloca float*, align 8
%S_C.addr = alloca float*, align 8
%J_cuda.addr = alloca float*, align 8
%C_cuda.addr = alloca float*, align 8
%cols.addr = alloca i32, align 4
%rows.addr = alloca i32, align 4
%q0sqr.addr = alloca float, align 4
; Out-parameters written by __cudaPopCallConfiguration.
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
; Scratch used to split each 12-byte dim3 into an (i64, i32) pair.
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
; Copy the arguments into their slots so their addresses can be taken.
store float* %E_C, float** %E_C.addr, align 8
store float* %W_C, float** %W_C.addr, align 8
store float* %N_C, float** %N_C.addr, align 8
store float* %S_C, float** %S_C.addr, align 8
store float* %J_cuda, float** %J_cuda.addr, align 8
store float* %C_cuda, float** %C_cuda.addr, align 8
store i32 %cols, i32* %cols.addr, align 4
store i32 %rows, i32* %rows.addr, align 4
store float %q0sqr, float* %q0sqr.addr, align 4
; kernel_args[i] = address of the i-th argument slot, cast to i8*.
%kernel_args = alloca i8*, i64 9, align 16
; [0] E_C
%0 = bitcast float** %E_C.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
; [1] W_C
%2 = bitcast float** %W_C.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
; [2] N_C
%4 = bitcast float** %N_C.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
; [3] S_C
%6 = bitcast float** %S_C.addr to i8*
%7 = getelementptr i8*, i8** %kernel_args, i32 3
store i8* %6, i8** %7
; [4] J_cuda
%8 = bitcast float** %J_cuda.addr to i8*
%9 = getelementptr i8*, i8** %kernel_args, i32 4
store i8* %8, i8** %9
; [5] C_cuda
%10 = bitcast float** %C_cuda.addr to i8*
%11 = getelementptr i8*, i8** %kernel_args, i32 5
store i8* %10, i8** %11
; [6] cols
%12 = bitcast i32* %cols.addr to i8*
%13 = getelementptr i8*, i8** %kernel_args, i32 6
store i8* %12, i8** %13
; [7] rows
%14 = bitcast i32* %rows.addr to i8*
%15 = getelementptr i8*, i8** %kernel_args, i32 7
store i8* %14, i8** %15
; [8] q0sqr
%16 = bitcast float* %q0sqr.addr to i8*
%17 = getelementptr i8*, i8** %kernel_args, i32 8
store i8* %16, i8** %17
; Retrieve grid/block dimensions, shared-memory size and stream. The i32
; result (%18) is never inspected.
%18 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%19 = load i64, i64* %shmem_size, align 8
%20 = load i8*, i8** %stream, align 8
; grid_dim -> (i64 %24, i32 %26) via a 12-byte copy into the coerce temporary.
%21 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%22 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false)
%23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%24 = load i64, i64* %23, align 8
%25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%26 = load i32, i32* %25, align 8
; block_dim -> (i64 %30, i32 %32), same scheme.
%27 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%28 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %27, i8* align 8 %28, i64 12, i1 false)
%29 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%30 = load i64, i64* %29, align 8
%31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%32 = load i32, i32* %31, align 8
; The stream comes back as i8*; cast it to the pointer type cudaLaunchKernel takes.
%33 = bitcast i8* %20 to %struct.CUstream_st*
; Launch: the first operand is this stub's own address; %call is unused.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif to i8*), i64 %24, i32 %26, i64 %30, i32 %32, i8** %kernel_args, i64 %19, %struct.CUstream_st* %33)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
; External runtime entry points used by the kernel launch stubs; they are
; declared here without bodies.
; Fills grid dim, block dim, shared-memory size and stream through the four pointers.
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)
; Operands as passed by the stubs: kernel handle, grid (i64, i32), block
; (i64, i32), argument-pointer array, shared-memory size, stream.
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
; Host-side launch stub for srad_cuda_2(float*, float*, float*, float*,
; float*, float*, int, int, float, float).
;
; Same structure as the srad_cuda_1 stub but with ten arguments (lambda is
; passed before q0sqr): spill the arguments, build a 10-entry array of slot
; addresses, pop the launch configuration, and call cudaLaunchKernel with this
; function's own address as the kernel handle. The i32 results of both runtime
; calls are left unused.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %lambda, float %q0sqr) #0 {
entry:
; One stack slot per incoming argument.
%E_C.addr = alloca float*, align 8
%W_C.addr = alloca float*, align 8
%N_C.addr = alloca float*, align 8
%S_C.addr = alloca float*, align 8
%J_cuda.addr = alloca float*, align 8
%C_cuda.addr = alloca float*, align 8
%cols.addr = alloca i32, align 4
%rows.addr = alloca i32, align 4
%lambda.addr = alloca float, align 4
%q0sqr.addr = alloca float, align 4
; Out-parameters written by __cudaPopCallConfiguration.
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
; Scratch used to split each 12-byte dim3 into an (i64, i32) pair.
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
; Copy the arguments into their slots so their addresses can be taken.
store float* %E_C, float** %E_C.addr, align 8
store float* %W_C, float** %W_C.addr, align 8
store float* %N_C, float** %N_C.addr, align 8
store float* %S_C, float** %S_C.addr, align 8
store float* %J_cuda, float** %J_cuda.addr, align 8
store float* %C_cuda, float** %C_cuda.addr, align 8
store i32 %cols, i32* %cols.addr, align 4
store i32 %rows, i32* %rows.addr, align 4
store float %lambda, float* %lambda.addr, align 4
store float %q0sqr, float* %q0sqr.addr, align 4
; kernel_args[i] = address of the i-th argument slot, cast to i8*.
%kernel_args = alloca i8*, i64 10, align 16
; [0] E_C
%0 = bitcast float** %E_C.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
; [1] W_C
%2 = bitcast float** %W_C.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
; [2] N_C
%4 = bitcast float** %N_C.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
; [3] S_C
%6 = bitcast float** %S_C.addr to i8*
%7 = getelementptr i8*, i8** %kernel_args, i32 3
store i8* %6, i8** %7
; [4] J_cuda
%8 = bitcast float** %J_cuda.addr to i8*
%9 = getelementptr i8*, i8** %kernel_args, i32 4
store i8* %8, i8** %9
; [5] C_cuda
%10 = bitcast float** %C_cuda.addr to i8*
%11 = getelementptr i8*, i8** %kernel_args, i32 5
store i8* %10, i8** %11
; [6] cols
%12 = bitcast i32* %cols.addr to i8*
%13 = getelementptr i8*, i8** %kernel_args, i32 6
store i8* %12, i8** %13
; [7] rows
%14 = bitcast i32* %rows.addr to i8*
%15 = getelementptr i8*, i8** %kernel_args, i32 7
store i8* %14, i8** %15
; [8] lambda
%16 = bitcast float* %lambda.addr to i8*
%17 = getelementptr i8*, i8** %kernel_args, i32 8
store i8* %16, i8** %17
; [9] q0sqr
%18 = bitcast float* %q0sqr.addr to i8*
%19 = getelementptr i8*, i8** %kernel_args, i32 9
store i8* %18, i8** %19
; Retrieve grid/block dimensions, shared-memory size and stream. The i32
; result (%20) is never inspected.
%20 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%21 = load i64, i64* %shmem_size, align 8
%22 = load i8*, i8** %stream, align 8
; grid_dim -> (i64 %26, i32 %28) via a 12-byte copy into the coerce temporary.
%23 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%24 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %23, i8* align 8 %24, i64 12, i1 false)
%25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%26 = load i64, i64* %25, align 8
%27 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%28 = load i32, i32* %27, align 8
; block_dim -> (i64 %32, i32 %34), same scheme.
%29 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%30 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %29, i8* align 8 %30, i64 12, i1 false)
%31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%32 = load i64, i64* %31, align 8
%33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%34 = load i32, i32* %33, align 8
; The stream comes back as i8*; cast it to the pointer type cudaLaunchKernel takes.
%35 = bitcast i8* %22 to %struct.CUstream_st*
; Launch: the first operand is this stub's own address; %call is unused.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff to i8*), i64 %26, i32 %28, i64 %32, i32 %34, i8** %kernel_args, i64 %21, %struct.CUstream_st* %35)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
; usage(int argc, char **argv)
;
; Writes the nine-line usage summary to stderr -- the first line is formatted
; with argv[0], the remaining eight are fixed text -- and then calls exit(1).
; Control never returns to the caller. %argc is stored but otherwise unused.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8

  ; Headline: "Usage: %s ..." with argv[0] substituted for %s.
  %err.head = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %argv.cur = load i8**, i8*** %argv.slot, align 8
  %prog.slot = getelementptr inbounds i8*, i8** %argv.cur, i64 0
  %prog.name = load i8*, i8** %prog.slot, align 8
  %wrote.head = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.head, i8* getelementptr inbounds ([67 x i8], [67 x i8]* @.str, i64 0, i64 0), i8* %prog.name)

  ; One fprintf per parameter description; stderr is re-read before each call.
  %err.rows = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.rows = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.rows, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i64 0, i64 0))
  %err.cols = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.cols = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.cols, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.2, i64 0, i64 0))
  %err.y1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.y1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.y1, i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.3, i64 0, i64 0))
  %err.y2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.y2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.y2, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.4, i64 0, i64 0))
  %err.x1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.x1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.x1, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.5, i64 0, i64 0))
  %err.x2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.x2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.x2, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.6, i64 0, i64 0))
  %err.lambda = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.lambda = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.lambda, i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.7, i64 0, i64 0))
  %err.iters = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %wrote.iters = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.iters, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.8, i64 0, i64 0))

  ; Terminate with status 1; exit does not return.
  call void @exit(i32 1) #8
  unreachable
}
; External C library routines used for diagnostics and early termination;
; declared here without bodies.
declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #2
; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #3
; Program entry point.
;
; Calls cudaSetDevice(0), prints the work-group banner with 16 and 16 as its
; two integer arguments, forwards argc/argv to runTest, and returns 0. The
; results of cudaSetDevice and printf are not examined.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #4 {
entry:
  %result.slot = alloca i32, align 4
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 0, i32* %result.slot, align 4
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8

  ; Device selection and banner.
  %set.status = call i32 @cudaSetDevice(i32 0)
  %banner.len = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.9, i64 0, i64 0), i32 16, i32 16)

  ; Hand the original command line to the test driver.
  %argc.cur = load i32, i32* %argc.slot, align 4
  %argv.cur = load i8**, i8*** %argv.slot, align 8
  call void @_Z7runTestiPPc(i32 %argc.cur, i8** %argv.cur)
  ret i32 0
}
; External routines called from @main; declared here without bodies.
declare dso_local i32 @cudaSetDevice(i32) #2
declare dso_local i32 @printf(i8*, ...) #2
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z7runTestiPPc(i32 %argc, i8** %argv) #0 {
entry:
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
%rows = alloca i32, align 4
%cols = alloca i32, align 4
%size_I = alloca i32, align 4
%size_R = alloca i32, align 4
%niter = alloca i32, align 4
%iter = alloca i32, align 4
%I = alloca float*, align 8
%J = alloca float*, align 8
%lambda = alloca float, align 4
%q0sqr = alloca float, align 4
%sum = alloca float, align 4
%sum2 = alloca float, align 4
%tmp = alloca float, align 4
%meanROI = alloca float, align 4
%varROI = alloca float, align 4
%J_cuda = alloca float*, align 8
%C_cuda = alloca float*, align 8
%E_C = alloca float*, align 8
%W_C = alloca float*, align 8
%N_C = alloca float*, align 8
%S_C = alloca float*, align 8
%r1 = alloca i32, align 4
%r2 = alloca i32, align 4
%c1 = alloca i32, align 4
%c2 = alloca i32, align 4
%c = alloca float*, align 8
%k = alloca i32, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%block_x = alloca i32, align 4
%block_y = alloca i32, align 4
%dimBlock = alloca %struct.dim3, align 4
%dimGrid = alloca %struct.dim3, align 4
%agg.tmp = alloca %struct.dim3, align 4
%agg.tmp92 = alloca %struct.dim3, align 4
%agg.tmp.coerce = alloca { i64, i32 }, align 4
%agg.tmp92.coerce = alloca { i64, i32 }, align 4
%agg.tmp95 = alloca %struct.dim3, align 4
%agg.tmp96 = alloca %struct.dim3, align 4
%agg.tmp95.coerce = alloca { i64, i32 }, align 4
%agg.tmp96.coerce = alloca { i64, i32 }, align 4
%i110 = alloca i32, align 4
%j114 = alloca i32, align 4
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
store i32 10, i32* %niter, align 4
%0 = load i32, i32* %argc.addr, align 4
%cmp = icmp eq i32 %0, 9
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
%1 = load i8**, i8*** %argv.addr, align 8
%arrayidx = getelementptr inbounds i8*, i8** %1, i64 1
%2 = load i8*, i8** %arrayidx, align 8
%call = call i32 @atoi(i8* %2) #9
store i32 %call, i32* %rows, align 4
%3 = load i8**, i8*** %argv.addr, align 8
%arrayidx1 = getelementptr inbounds i8*, i8** %3, i64 2
%4 = load i8*, i8** %arrayidx1, align 8
%call2 = call i32 @atoi(i8* %4) #9
store i32 %call2, i32* %cols, align 4
%5 = load i32, i32* %rows, align 4
%rem = srem i32 %5, 16
%cmp3 = icmp ne i32 %rem, 0
br i1 %cmp3, label %if.then6, label %lor.lhs.false
lor.lhs.false: ; preds = %if.then
%6 = load i32, i32* %cols, align 4
%rem4 = srem i32 %6, 16
%cmp5 = icmp ne i32 %rem4, 0
br i1 %cmp5, label %if.then6, label %if.end
if.then6: ; preds = %lor.lhs.false, %if.then
%7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
%call7 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.10, i64 0, i64 0))
call void @exit(i32 1) #8
unreachable
if.end: ; preds = %lor.lhs.false
%8 = load i8**, i8*** %argv.addr, align 8
%arrayidx8 = getelementptr inbounds i8*, i8** %8, i64 3
%9 = load i8*, i8** %arrayidx8, align 8
%call9 = call i32 @atoi(i8* %9) #9
store i32 %call9, i32* %r1, align 4
%10 = load i8**, i8*** %argv.addr, align 8
%arrayidx10 = getelementptr inbounds i8*, i8** %10, i64 4
%11 = load i8*, i8** %arrayidx10, align 8
%call11 = call i32 @atoi(i8* %11) #9
store i32 %call11, i32* %r2, align 4
%12 = load i8**, i8*** %argv.addr, align 8
%arrayidx12 = getelementptr inbounds i8*, i8** %12, i64 5
%13 = load i8*, i8** %arrayidx12, align 8
%call13 = call i32 @atoi(i8* %13) #9
store i32 %call13, i32* %c1, align 4
%14 = load i8**, i8*** %argv.addr, align 8
%arrayidx14 = getelementptr inbounds i8*, i8** %14, i64 6
%15 = load i8*, i8** %arrayidx14, align 8
%call15 = call i32 @atoi(i8* %15) #9
store i32 %call15, i32* %c2, align 4
%16 = load i8**, i8*** %argv.addr, align 8
%arrayidx16 = getelementptr inbounds i8*, i8** %16, i64 7
%17 = load i8*, i8** %arrayidx16, align 8
%call17 = call double @atof(i8* %17) #9
%conv = fptrunc double %call17 to float
store float %conv, float* %lambda, align 4
%18 = load i8**, i8*** %argv.addr, align 8
%arrayidx18 = getelementptr inbounds i8*, i8** %18, i64 8
%19 = load i8*, i8** %arrayidx18, align 8
%call19 = call i32 @atoi(i8* %19) #9
store i32 %call19, i32* %niter, align 4
br label %if.end20
if.else: ; preds = %entry
%20 = load i32, i32* %argc.addr, align 4
%21 = load i8**, i8*** %argv.addr, align 8
call void @_Z5usageiPPc(i32 %20, i8** %21)
br label %if.end20
if.end20: ; preds = %if.else, %if.end
%22 = load i32, i32* %cols, align 4
%23 = load i32, i32* %rows, align 4
%mul = mul nsw i32 %22, %23
store i32 %mul, i32* %size_I, align 4
%24 = load i32, i32* %r2, align 4
%25 = load i32, i32* %r1, align 4
%sub = sub i32 %24, %25
%add = add i32 %sub, 1
%26 = load i32, i32* %c2, align 4
%27 = load i32, i32* %c1, align 4
%sub21 = sub i32 %26, %27
%add22 = add i32 %sub21, 1
%mul23 = mul i32 %add, %add22
store i32 %mul23, i32* %size_R, align 4
%28 = load i32, i32* %size_I, align 4
%conv24 = sext i32 %28 to i64
%mul25 = mul i64 %conv24, 4
%call26 = call noalias i8* @malloc(i64 %mul25) #10
%29 = bitcast i8* %call26 to float*
store float* %29, float** %I, align 8
%30 = load i32, i32* %size_I, align 4
%conv27 = sext i32 %30 to i64
%mul28 = mul i64 %conv27, 4
%call29 = call noalias i8* @malloc(i64 %mul28) #10
%31 = bitcast i8* %call29 to float*
store float* %31, float** %J, align 8
%32 = load i32, i32* %size_I, align 4
%conv30 = sext i32 %32 to i64
%mul31 = mul i64 4, %conv30
%call32 = call noalias i8* @malloc(i64 %mul31) #10
%33 = bitcast i8* %call32 to float*
store float* %33, float** %c, align 8
%34 = bitcast float** %J_cuda to i8**
%35 = load i32, i32* %size_I, align 4
%conv33 = sext i32 %35 to i64
%mul34 = mul i64 4, %conv33
%call35 = call i32 @cudaMalloc(i8** %34, i64 %mul34)
%36 = bitcast float** %C_cuda to i8**
%37 = load i32, i32* %size_I, align 4
%conv36 = sext i32 %37 to i64
%mul37 = mul i64 4, %conv36
%call38 = call i32 @cudaMalloc(i8** %36, i64 %mul37)
%38 = bitcast float** %E_C to i8**
%39 = load i32, i32* %size_I, align 4
%conv39 = sext i32 %39 to i64
%mul40 = mul i64 4, %conv39
%call41 = call i32 @cudaMalloc(i8** %38, i64 %mul40)
%40 = bitcast float** %W_C to i8**
%41 = load i32, i32* %size_I, align 4
%conv42 = sext i32 %41 to i64
%mul43 = mul i64 4, %conv42
%call44 = call i32 @cudaMalloc(i8** %40, i64 %mul43)
%42 = bitcast float** %S_C to i8**
%43 = load i32, i32* %size_I, align 4
%conv45 = sext i32 %43 to i64
%mul46 = mul i64 4, %conv45
%call47 = call i32 @cudaMalloc(i8** %42, i64 %mul46)
%44 = bitcast float** %N_C to i8**
%45 = load i32, i32* %size_I, align 4
%conv48 = sext i32 %45 to i64
%mul49 = mul i64 4, %conv48
%call50 = call i32 @cudaMalloc(i8** %44, i64 %mul49)
%call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.11, i64 0, i64 0))
%46 = load float*, float** %I, align 8
%47 = load i32, i32* %rows, align 4
%48 = load i32, i32* %cols, align 4
call void @_Z13random_matrixPfii(float* %46, i32 %47, i32 %48)
store i32 0, i32* %k, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %if.end20
%49 = load i32, i32* %k, align 4
%50 = load i32, i32* %size_I, align 4
%cmp52 = icmp slt i32 %49, %50
br i1 %cmp52, label %for.body, label %for.end
for.body: ; preds = %for.cond
%51 = load float*, float** %I, align 8
%52 = load i32, i32* %k, align 4
%idxprom = sext i32 %52 to i64
%arrayidx53 = getelementptr inbounds float, float* %51, i64 %idxprom
%53 = load float, float* %arrayidx53, align 4
%call54 = call float @_ZSt3expf(float %53)
%54 = load float*, float** %J, align 8
%55 = load i32, i32* %k, align 4
%idxprom55 = sext i32 %55 to i64
%arrayidx56 = getelementptr inbounds float, float* %54, i64 %idxprom55
store float %call54, float* %arrayidx56, align 4
br label %for.inc
for.inc: ; preds = %for.body
%56 = load i32, i32* %k, align 4
%inc = add nsw i32 %56, 1
store i32 %inc, i32* %k, align 4
br label %for.cond
for.end: ; preds = %for.cond
%call57 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.12, i64 0, i64 0))
store i32 0, i32* %iter, align 4
br label %for.cond58
for.cond58: ; preds = %for.inc105, %for.end
%57 = load i32, i32* %iter, align 4
%58 = load i32, i32* %niter, align 4
%cmp59 = icmp slt i32 %57, %58
br i1 %cmp59, label %for.body60, label %for.end107
for.body60: ; preds = %for.cond58
store float 0.000000e+00, float* %sum, align 4
store float 0.000000e+00, float* %sum2, align 4
%59 = load i32, i32* %r1, align 4
store i32 %59, i32* %i, align 4
br label %for.cond61
for.cond61: ; preds = %for.inc77, %for.body60
%60 = load i32, i32* %i, align 4
%61 = load i32, i32* %r2, align 4
%cmp62 = icmp ule i32 %60, %61
br i1 %cmp62, label %for.body63, label %for.end79
for.body63: ; preds = %for.cond61
%62 = load i32, i32* %c1, align 4
store i32 %62, i32* %j, align 4
br label %for.cond64
for.cond64: ; preds = %for.inc74, %for.body63
%63 = load i32, i32* %j, align 4
%64 = load i32, i32* %c2, align 4
%cmp65 = icmp ule i32 %63, %64
br i1 %cmp65, label %for.body66, label %for.end76
for.body66: ; preds = %for.cond64
%65 = load float*, float** %J, align 8
%66 = load i32, i32* %i, align 4
%67 = load i32, i32* %cols, align 4
%mul67 = mul nsw i32 %66, %67
%68 = load i32, i32* %j, align 4
%add68 = add nsw i32 %mul67, %68
%idxprom69 = sext i32 %add68 to i64
%arrayidx70 = getelementptr inbounds float, float* %65, i64 %idxprom69
%69 = load float, float* %arrayidx70, align 4
store float %69, float* %tmp, align 4
%70 = load float, float* %tmp, align 4
%71 = load float, float* %sum, align 4
%add71 = fadd contract float %71, %70
store float %add71, float* %sum, align 4
%72 = load float, float* %tmp, align 4
%73 = load float, float* %tmp, align 4
%mul72 = fmul contract float %72, %73
%74 = load float, float* %sum2, align 4
%add73 = fadd contract float %74, %mul72
store float %add73, float* %sum2, align 4
br label %for.inc74
for.inc74: ; preds = %for.body66
%75 = load i32, i32* %j, align 4
%inc75 = add nsw i32 %75, 1
store i32 %inc75, i32* %j, align 4
br label %for.cond64
for.end76: ; preds = %for.cond64
br label %for.inc77
for.inc77: ; preds = %for.end76
%76 = load i32, i32* %i, align 4
%inc78 = add nsw i32 %76, 1
store i32 %inc78, i32* %i, align 4
br label %for.cond61
for.end79: ; preds = %for.cond61
%77 = load float, float* %sum, align 4
%78 = load i32, i32* %size_R, align 4
%conv80 = sitofp i32 %78 to float
%div = fdiv float %77, %conv80
store float %div, float* %meanROI, align 4
%79 = load float, float* %sum2, align 4
%80 = load i32, i32* %size_R, align 4
%conv81 = sitofp i32 %80 to float
%div82 = fdiv float %79, %conv81
%81 = load float, float* %meanROI, align 4
%82 = load float, float* %meanROI, align 4
%mul83 = fmul contract float %81, %82
%sub84 = fsub contract float %div82, %mul83
store float %sub84, float* %varROI, align 4
%83 = load float, float* %varROI, align 4
%84 = load float, float* %meanROI, align 4
%85 = load float, float* %meanROI, align 4
%mul85 = fmul contract float %84, %85
%div86 = fdiv float %83, %mul85
store float %div86, float* %q0sqr, align 4
%86 = load i32, i32* %cols, align 4
%div87 = sdiv i32 %86, 16
store i32 %div87, i32* %block_x, align 4
%87 = load i32, i32* %rows, align 4
%div88 = sdiv i32 %87, 16
store i32 %div88, i32* %block_y, align 4
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1)
%88 = load i32, i32* %block_x, align 4
%89 = load i32, i32* %block_y, align 4
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %88, i32 %89, i32 1)
%90 = load float*, float** %J_cuda, align 8
%91 = bitcast float* %90 to i8*
%92 = load float*, float** %J, align 8
%93 = bitcast float* %92 to i8*
%94 = load i32, i32* %size_I, align 4
%conv89 = sext i32 %94 to i64
%mul90 = mul i64 4, %conv89
%call91 = call i32 @cudaMemcpy(i8* %91, i8* %93, i64 %mul90, i32 1)
%95 = bitcast %struct.dim3* %agg.tmp to i8*
%96 = bitcast %struct.dim3* %dimGrid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %95, i8* align 4 %96, i64 12, i1 false)
%97 = bitcast %struct.dim3* %agg.tmp92 to i8*
%98 = bitcast %struct.dim3* %dimBlock to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %97, i8* align 4 %98, i64 12, i1 false)
%99 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
%100 = bitcast %struct.dim3* %agg.tmp to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %99, i8* align 4 %100, i64 12, i1 false)
%101 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
%102 = load i64, i64* %101, align 4
%103 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
%104 = load i32, i32* %103, align 4
%105 = bitcast { i64, i32 }* %agg.tmp92.coerce to i8*
%106 = bitcast %struct.dim3* %agg.tmp92 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %105, i8* align 4 %106, i64 12, i1 false)
%107 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp92.coerce, i32 0, i32 0
%108 = load i64, i64* %107, align 4
%109 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp92.coerce, i32 0, i32 1
%110 = load i32, i32* %109, align 4
%call93 = call i32 @__cudaPushCallConfiguration(i64 %102, i32 %104, i64 %108, i32 %110, i64 0, i8* null)
%tobool = icmp ne i32 %call93, 0
br i1 %tobool, label %kcall.end, label %kcall.configok
kcall.configok: ; preds = %for.end79
%111 = load float*, float** %E_C, align 8
%112 = load float*, float** %W_C, align 8
%113 = load float*, float** %N_C, align 8
%114 = load float*, float** %S_C, align 8
%115 = load float*, float** %J_cuda, align 8
%116 = load float*, float** %C_cuda, align 8
%117 = load i32, i32* %cols, align 4
%118 = load i32, i32* %rows, align 4
%119 = load float, float* %q0sqr, align 4
call void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %111, float* %112, float* %113, float* %114, float* %115, float* %116, i32 %117, i32 %118, float %119)
br label %kcall.end
kcall.end: ; preds = %kcall.configok, %for.end79
%call94 = call i32 @cudaThreadSynchronize()
%120 = bitcast %struct.dim3* %agg.tmp95 to i8*
%121 = bitcast %struct.dim3* %dimGrid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %120, i8* align 4 %121, i64 12, i1 false)
%122 = bitcast %struct.dim3* %agg.tmp96 to i8*
%123 = bitcast %struct.dim3* %dimBlock to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %122, i8* align 4 %123, i64 12, i1 false)
%124 = bitcast { i64, i32 }* %agg.tmp95.coerce to i8*
%125 = bitcast %struct.dim3* %agg.tmp95 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %124, i8* align 4 %125, i64 12, i1 false)
%126 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp95.coerce, i32 0, i32 0
%127 = load i64, i64* %126, align 4
%128 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp95.coerce, i32 0, i32 1
%129 = load i32, i32* %128, align 4
%130 = bitcast { i64, i32 }* %agg.tmp96.coerce to i8*
%131 = bitcast %struct.dim3* %agg.tmp96 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %130, i8* align 4 %131, i64 12, i1 false)
%132 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp96.coerce, i32 0, i32 0
%133 = load i64, i64* %132, align 4
%134 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp96.coerce, i32 0, i32 1
%135 = load i32, i32* %134, align 4
%call97 = call i32 @__cudaPushCallConfiguration(i64 %127, i32 %129, i64 %133, i32 %135, i64 0, i8* null)
%tobool98 = icmp ne i32 %call97, 0
br i1 %tobool98, label %kcall.end100, label %kcall.configok99
kcall.configok99: ; preds = %kcall.end
%136 = load float*, float** %E_C, align 8
%137 = load float*, float** %W_C, align 8
%138 = load float*, float** %N_C, align 8
%139 = load float*, float** %S_C, align 8
%140 = load float*, float** %J_cuda, align 8
%141 = load float*, float** %C_cuda, align 8
%142 = load i32, i32* %cols, align 4
%143 = load i32, i32* %rows, align 4
%144 = load float, float* %lambda, align 4
%145 = load float, float* %q0sqr, align 4
call void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %136, float* %137, float* %138, float* %139, float* %140, float* %141, i32 %142, i32 %143, float %144, float %145)
br label %kcall.end100
kcall.end100: ; preds = %kcall.configok99, %kcall.end
%call101 = call i32 @cudaThreadSynchronize()
%146 = load float*, float** %J, align 8
%147 = bitcast float* %146 to i8*
%148 = load float*, float** %J_cuda, align 8
%149 = bitcast float* %148 to i8*
%150 = load i32, i32* %size_I, align 4
%conv102 = sext i32 %150 to i64
%mul103 = mul i64 4, %conv102
%call104 = call i32 @cudaMemcpy(i8* %147, i8* %149, i64 %mul103, i32 2)
br label %for.inc105
for.inc105: ; preds = %kcall.end100
%151 = load i32, i32* %iter, align 4
%inc106 = add nsw i32 %151, 1
store i32 %inc106, i32* %iter, align 4
br label %for.cond58
for.end107: ; preds = %for.cond58
%call108 = call i32 @cudaThreadSynchronize()
%call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.13, i64 0, i64 0))
store i32 0, i32* %i110, align 4
br label %for.cond111
for.cond111: ; preds = %for.inc128, %for.end107
%152 = load i32, i32* %i110, align 4
%cmp112 = icmp slt i32 %152, 20
br i1 %cmp112, label %for.body113, label %for.end130
for.body113: ; preds = %for.cond111
store i32 0, i32* %j114, align 4
br label %for.cond115
for.cond115: ; preds = %for.inc124, %for.body113
%153 = load i32, i32* %j114, align 4
%cmp116 = icmp slt i32 %153, 20
br i1 %cmp116, label %for.body117, label %for.end126
for.body117: ; preds = %for.cond115
%154 = load float*, float** %J, align 8
%155 = load i32, i32* %i110, align 4
%156 = load i32, i32* %cols, align 4
%mul118 = mul nsw i32 %155, %156
%157 = load i32, i32* %j114, align 4
%add119 = add nsw i32 %mul118, %157
%idxprom120 = sext i32 %add119 to i64
%arrayidx121 = getelementptr inbounds float, float* %154, i64 %idxprom120
%158 = load float, float* %arrayidx121, align 4
%conv122 = fpext float %158 to double
%call123 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.14, i64 0, i64 0), double %conv122)
br label %for.inc124
for.inc124: ; preds = %for.body117
%159 = load i32, i32* %j114, align 4
%inc125 = add nsw i32 %159, 1
store i32 %inc125, i32* %j114, align 4
br label %for.cond115
for.end126: ; preds = %for.cond115
%call127 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.15, i64 0, i64 0))
br label %for.inc128
for.inc128: ; preds = %for.end126
%160 = load i32, i32* %i110, align 4
%inc129 = add nsw i32 %160, 1
store i32 %inc129, i32* %i110, align 4
br label %for.cond111
for.end130: ; preds = %for.cond111
%call131 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.16, i64 0, i64 0))
%161 = load float*, float** %I, align 8
%162 = bitcast float* %161 to i8*
call void @free(i8* %162) #10
%163 = load float*, float** %J, align 8
%164 = bitcast float* %163 to i8*
call void @free(i8* %164) #10
%165 = load float*, float** %C_cuda, align 8
%166 = bitcast float* %165 to i8*
%call132 = call i32 @cudaFree(i8* %166)
%167 = load float*, float** %J_cuda, align 8
%168 = bitcast float* %167 to i8*
%call133 = call i32 @cudaFree(i8* %168)
%169 = load float*, float** %E_C, align 8
%170 = bitcast float* %169 to i8*
%call134 = call i32 @cudaFree(i8* %170)
%171 = load float*, float** %W_C, align 8
%172 = bitcast float* %171 to i8*
%call135 = call i32 @cudaFree(i8* %172)
%173 = load float*, float** %N_C, align 8
%174 = bitcast float* %173 to i8*
%call136 = call i32 @cudaFree(i8* %174)
%175 = load float*, float** %S_C, align 8
%176 = bitcast float* %175 to i8*
%call137 = call i32 @cudaFree(i8* %176)
%177 = load float*, float** %c, align 8
%178 = bitcast float* %177 to i8*
call void @free(i8* %178) #10
ret void
}
; External declarations: each function below is only declared in this module
; (no body), so its definition has to be supplied at link time.
; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #5
; Function Attrs: nounwind readonly
declare dso_local double @atof(i8*) #5
; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #6
; Takes a pointer-to-pointer and a 64-bit integer and returns an i32.
declare dso_local i32 @cudaMalloc(i8**, i64) #2
; Function Attrs: noinline nounwind optnone uwtable
;
; Fills a row-major float buffer with pseudo-random values.
;
;   %I    - destination buffer; element (row, col) lives at %I[row * %cols + col]
;   %rows - number of rows to fill (signed compare, so <= 0 writes nothing)
;   %cols - number of columns per row (signed compare, so <= 0 writes nothing)
;
; The generator is seeded once with srand(7). Each element receives
; rand() converted to float and divided by 2147483648.0 (0x41E0000000000000).
; Loop counters are carried in phi nodes instead of stack slots.
define dso_local void @_Z13random_matrixPfii(float* %I, i32 %rows, i32 %cols) #7 {
entry:
  call void @srand(i32 7) #10
  br label %row.check

row.check:                                        ; preds = %row.latch, %entry
  %row = phi i32 [ 0, %entry ], [ %row.next, %row.latch ]
  %row.in.range = icmp slt i32 %row, %rows
  br i1 %row.in.range, label %col.check, label %done

col.check:                                        ; preds = %col.body, %row.check
  %col = phi i32 [ 0, %row.check ], [ %col.next, %col.body ]
  %col.in.range = icmp slt i32 %col, %cols
  br i1 %col.in.range, label %col.body, label %row.latch

col.body:                                         ; preds = %col.check
  ; Draw one sample and scale it by 1 / 2^31.
  %sample = call i32 @rand() #10
  %sample.f = sitofp i32 %sample to float
  %scaled = fdiv float %sample.f, 0x41E0000000000000
  ; Flat index = row * cols + col, computed in i32 and then sign-extended.
  %row.base = mul nsw i32 %row, %cols
  %flat = add nsw i32 %row.base, %col
  %flat.ext = sext i32 %flat to i64
  %slot = getelementptr inbounds float, float* %I, i64 %flat.ext
  store float %scaled, float* %slot, align 4
  %col.next = add nsw i32 %col, 1
  br label %col.check

row.latch:                                        ; preds = %col.check
  %row.next = add nsw i32 %row, 1
  br label %row.check

done:                                             ; preds = %row.check
  ret void
}
; Function Attrs: noinline nounwind optnone uwtable
;
; Thin forwarding wrapper: passes %__x unchanged to @expf and returns
; whatever @expf returns.
define linkonce_odr dso_local float @_ZSt3expf(float %__x) #7 comdat {
entry:
  %result = call float @expf(float %__x) #10
  ret float %result
}
; Function Attrs: noinline nounwind optnone uwtable
;
; Initialises the three i32 fields of the %struct.dim3 pointed to by %this:
;   field 0 <- %vx, field 1 <- %vy, field 2 <- %vz (written in that order).
; Returns nothing.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #7 comdat align 2 {
entry:
  %x.slot = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 0
  store i32 %vx, i32* %x.slot, align 4
  %y.slot = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 1
  store i32 %vy, i32* %y.slot, align 4
  %z.slot = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 2
  store i32 %vz, i32* %z.slot, align 4
  ret void
}
; External declarations (no bodies in this module) for functions that are
; called by the function definitions earlier in this file.
declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2
; Its i32 result is compared against 0 by the host code to decide whether the
; following kernel stub call is executed or skipped.
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2
declare dso_local i32 @cudaThreadSynchronize() #2
; Function Attrs: nounwind
declare dso_local void @free(i8*) #6
declare dso_local i32 @cudaFree(i8*) #2
; Function Attrs: nounwind
declare dso_local void @srand(i32) #6
; Function Attrs: nounwind
declare dso_local i32 @rand() #6
; Function Attrs: nounwind
declare dso_local float @expf(float) #6
; Registers the two host-side kernel entry points with the runtime.
;
;   %fatbin_handle - handle forwarded as the first argument of every
;                    @__cudaRegisterFunction call.
;
; For each kernel the same global (@0 or @1) is passed for both name-pointer
; arguments; the remaining arguments are -1 followed by null pointers.
; The i32 results of the registration calls are ignored.
define internal void @__cuda_register_globals(i8** %fatbin_handle) {
entry:
  %reg.srad1 = call i32 @__cudaRegisterFunction(
      i8** %fatbin_handle,
      i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif to i8*),
      i8* getelementptr inbounds ([31 x i8], [31 x i8]* @0, i64 0, i64 0),
      i8* getelementptr inbounds ([31 x i8], [31 x i8]* @0, i64 0, i64 0),
      i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  %reg.srad2 = call i32 @__cudaRegisterFunction(
      i8** %fatbin_handle,
      i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff to i8*),
      i8* getelementptr inbounds ([32 x i8], [32 x i8]* @1, i64 0, i64 0),
      i8* getelementptr inbounds ([32 x i8], [32 x i8]* @1, i64 0, i64 0),
      i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}
; External registration entry points (declared only; no bodies in this module).
; Called twice by @__cuda_register_globals, once per kernel stub.
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)
; NOTE(review): no call to this is visible in the neighbouring functions;
; confirm whether it is used elsewhere in the module.
declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)
; Called by @__cuda_module_ctor; the returned i8** is stored in
; @__cuda_gpubin_handle.
declare dso_local i8** @__cudaRegisterFatBinary(i8*)
; Module set-up routine for the embedded GPU binary.
;
; Steps, in order:
;   1. Pass @__cuda_fatbin_wrapper to @__cudaRegisterFatBinary and keep the
;      returned handle.
;   2. Save the handle in @__cuda_gpubin_handle (read back later by
;      @__cuda_module_dtor).
;   3. Register the kernel stubs through @__cuda_register_globals.
;   4. Call @__cudaRegisterFatBinaryEnd with the same handle.
;   5. Hand @__cuda_module_dtor to @atexit; the i32 result is ignored.
;
; The i8* parameter is not used.
define internal void @__cuda_module_ctor(i8* %unused) {
entry:
  %handle = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %handle, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %handle)
  call void @__cudaRegisterFatBinaryEnd(i8** %handle)
  %atexit.status = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}
; External declarations (no bodies in this module).
; Called by @__cuda_module_ctor after the kernel stubs have been registered.
declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)
; Called by @__cuda_module_dtor with the handle loaded from
; @__cuda_gpubin_handle.
declare dso_local void @__cudaUnregisterFatBinary(i8**)
; Module tear-down routine: reloads the handle saved in
; @__cuda_gpubin_handle and passes it to @__cudaUnregisterFatBinary.
; The i8* parameter is not used.
define internal void @__cuda_module_dtor(i8* %unused) {
entry:
  %handle = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %handle)
  ret void
}
; External declaration. The callback parameter is typed `void (i8*)*` here,
; which matches the signature of @__cuda_module_dtor that @__cuda_module_ctor
; passes in.
declare dso_local i32 @atexit(void (i8*)*)
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { noreturn nounwind }
attributes #9 = { nounwind readonly }
attributes #10 = { nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}