CuPBoP/examples/srad_v2/srad-host-x86_64-unknown-li...

; ModuleID = 'srad-host-x86_64-unknown-linux-gnu.bc'
source_filename = "srad.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Named aggregate types used by this module.
; %struct._IO_FILE / %struct._IO_marker: layout of the objects reached through
; the external @stderr pointer; the two types refer to each other by pointer.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; %struct.dim3: three i32 fields (12 bytes); passed by pointer to
; __cudaPopCallConfiguration and constructed by _ZN4dim3C2Ejjj.
%struct.dim3 = type { i32, i32, i32 }
; %struct.CUstream_st: opaque here; only ever handled through a pointer
; (last parameter of cudaLaunchKernel).
%struct.CUstream_st = type opaque

; Comdat groups with selection kind "any": if several object files provide
; the same group, the linker may keep whichever single copy it chooses.
$_ZSt3expf = comdat any

$_ZN4dim3C2Ejjj = comdat any

; External FILE* supplied by another object; loaded before every fprintf call.
@stderr = external dso_local global %struct._IO_FILE*, align 8
; @.str .. @.str.8: the usage text printed line by line by @_Z5usageiPPc.
; @.str takes one %s argument (argv[0] at the call site).
@.str = private unnamed_addr constant [67 x i8] c"Usage: %s <rows> <cols> <y1> <y2> <x1> <x2> <lamda> <no. of iter>\0A\00", align 1
@.str.1 = private unnamed_addr constant [28 x i8] c"\09<rows>   - number of rows\0A\00", align 1
@.str.2 = private unnamed_addr constant [29 x i8] c"\09<cols>    - number of cols\0A\00", align 1
@.str.3 = private unnamed_addr constant [35 x i8] c"\09<y1> \09 - y1 value of the speckle\0A\00", align 1
@.str.4 = private unnamed_addr constant [38 x i8] c"\09<y2>      - y2 value of the speckle\0A\00", align 1
@.str.5 = private unnamed_addr constant [39 x i8] c"\09<x1>       - x1 value of the speckle\0A\00", align 1
@.str.6 = private unnamed_addr constant [39 x i8] c"\09<x2>       - x2 value of the speckle\0A\00", align 1
@.str.7 = private unnamed_addr constant [27 x i8] c"\09<lamda>   - lambda (0,1)\0A\00", align 1
@.str.8 = private unnamed_addr constant [41 x i8] c"\09<no. of iter>   - number of iterations\0A\00", align 1
; @.str.9: banner printed by @main with the two i32 arguments 16 and 16.
@.str.9 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1
; @.str.10: error printed by @_Z7runTestiPPc when rows or cols is not a
; multiple of 16 (srem ... 16 != 0), just before exit(1).
@.str.10 = private unnamed_addr constant [39 x i8] c"rows and cols must be multiples of 16\0A\00", align 1
; @.str.11 / @.str.12: progress messages printed by @_Z7runTestiPPc.
@.str.11 = private unnamed_addr constant [30 x i8] c"Randomizing the input matrix\0A\00", align 1
@.str.12 = private unnamed_addr constant [26 x i8] c"Start the SRAD main loop\0A\00", align 1
; @.str.13 .. @.str.16: not referenced in the visible part of the module.
@.str.13 = private unnamed_addr constant [18 x i8] c"Printing Output:\0A\00", align 1
@.str.14 = private unnamed_addr constant [6 x i8] c"%.5f \00", align 1
@.str.15 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@.str.16 = private unnamed_addr constant [18 x i8] c"Computation Done\0A\00", align 1
; @0 / @1: NUL-terminated copies of the symbol names of the two launch stubs
; defined below. NOTE(review): presumably consumed by the kernel registration
; code, which is outside the visible part of the file - confirm.
@0 = private unnamed_addr constant [31 x i8] c"_Z11srad_cuda_1PfS_S_S_S_S_iif\00", align 1
@1 = private unnamed_addr constant [32 x i8] c"_Z11srad_cuda_2PfS_S_S_S_S_iiff\00", align 1
@2 = private constant [94817 x i8] c"P\EDU\BA\01\00\10\00Pr\01\00\00\00\00\00\02\00\01\01@\00\00\00HG\01\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\A0F\01\00\00\00\00\00\E0B\01\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0F\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z11srad_cuda_2PfS_S_S_S_S_iiff\00.text._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.info._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.shared._Z11srad_cuda_2PfS_S_S_S_S_iiff\00.nv.global\00blockIdx\00threadIdx\00gridDim\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c__1225\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c__1227\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp__1229\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result__1231\00$___ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp__1233\00.nv.constant0._Z11srad_cuda_2PfS_S_S_S_S_iiff\00_param\00_Z11srad_cuda_1PfS_S_S_S_S_iif\00.text._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.info._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.shared._Z11srad_cuda_1PfS_S_S_S_S_iif\00.nv.constant2._Z11srad_cuda_1PfS_S_S_S_S_iif\00__ocg_const\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm20_dblrcp_rn_slowpath_v3\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32\00$_Z11srad_cuda_1PfS_S_S_S_S_iif$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp__199\00$___ZZ11srad_cuda_1PfS_S_S_S_S_
iifE11temp_result__201\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north__203\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south__205\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east__207\00$___ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west__209\00.nv.constant0._Z11srad_cuda_1PfS_S_S_S_S_iif\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00R\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A1\00\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\CC\00\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D7\00\00\00\01\00\0D\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\E0\00\00\00\01\00\0D\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\EA\00\00\00\01\00\0D\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F9\01\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00M\02\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9A\02\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C4\02\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\FD\02\00\00\22\00\0B\00\90\CE\00\00\00\00\00\00H\05\00\00\00\00\00\00?\03\00\00\22\00\0B\00\D8\D3\00\00\00\00\00\00`\01\00\00\00\00\00\00|\03\00\00\22\00\0B\008\D5\00\00\00\00\00\00H\08\00\00\00\00\00\00\E0\04\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00@W\00\00\00\00\00\00.\02\00\00\12\10\0B\00\00\00\00\00\00\00\00\00\80\DD\00\00\00\00\00\00\04/\08\00\10\00\00\00\15\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00\00\00\00\00\04\11\08\00\0D\00\00\00\00\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00\00\00\00\00\04\11\08\00\0C\00\00\00\00\00\00\00\04#\08\00\0B\00\00\00\00\00\00\00\04\12\08\00\0B\00\00\00\00\00\00\00\04\11\08\00\0B\00\00\00\00\00\00\00\04#\08\00\10\00\00\00\00\00\00\00\04\12\08\00\10\00\00\00\90\00\00\00\04\11\08\00\10\00\00\00\90\00\00\00\04/\08\00\0F\00\00\00\16\00\00\00\04#\08\00\0F\00\00\00\00\0
0\00\00\04\12\08\00\0F\00\00\00x\00\00\00\04\11\08\00\0F\00\00\00x\00\00\00\010\00\00\01*\00\00
; Descriptor for the embedded device image, emitted into the
; ".nvFatBinSegment" section. Fields in order: i32 1180844977 (0x466243B1),
; i32 1, a pointer to the first byte of @2, and a null pointer.
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([94817 x i8], [94817 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
; Module-private i8** slot, initially null.
; NOTE(review): its writer is not in the visible part of the file; presumably
; set by @__cuda_module_ctor - confirm.
@__cuda_gpubin_handle = internal global i8** null, align 8
; Static-constructor table: one entry, priority 65535, no associated data.
; @__cuda_module_ctor has type void (i8*) and is cast to void () for the table.
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]

; Launch stub for @_Z11srad_cuda_1PfS_S_S_S_S_iif.
; Spills the nine incoming arguments to stack slots, records the address of
; every slot in a nine-entry i8* array, pops the pending call configuration
; into local storage and forwards everything to cudaLaunchKernel, passing this
; function's own address as the first argument. The i32 results of both
; runtime calls are left unchecked.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %q0sqr) #0 {
entry:
  ; Stack homes for the incoming arguments.
  %E_C.slot = alloca float*, align 8
  %W_C.slot = alloca float*, align 8
  %N_C.slot = alloca float*, align 8
  %S_C.slot = alloca float*, align 8
  %J_cuda.slot = alloca float*, align 8
  %C_cuda.slot = alloca float*, align 8
  %cols.slot = alloca i32, align 4
  %rows.slot = alloca i32, align 4
  %q0sqr.slot = alloca float, align 4
  ; Out-parameters filled in by __cudaPopCallConfiguration.
  %grid = alloca %struct.dim3, align 8
  %block = alloca %struct.dim3, align 8
  %shmem.slot = alloca i64, align 8
  %stream.slot = alloca i8*, align 8
  ; Scratch copies of the two dim3 values viewed as { i64, i32 }.
  %grid.abi = alloca { i64, i32 }, align 8
  %block.abi = alloca { i64, i32 }, align 8
  store float* %E_C, float** %E_C.slot, align 8
  store float* %W_C, float** %W_C.slot, align 8
  store float* %N_C, float** %N_C.slot, align 8
  store float* %S_C, float** %S_C.slot, align 8
  store float* %J_cuda, float** %J_cuda.slot, align 8
  store float* %C_cuda, float** %C_cuda.slot, align 8
  store i32 %cols, i32* %cols.slot, align 4
  store i32 %rows, i32* %rows.slot, align 4
  store float %q0sqr, float* %q0sqr.slot, align 4
  ; Array of nine i8* entries: entry k holds the address of argument k's slot.
  %arg.ptrs = alloca i8*, i64 9, align 16
  %p.E_C = bitcast float** %E_C.slot to i8*
  %e0 = getelementptr i8*, i8** %arg.ptrs, i32 0
  store i8* %p.E_C, i8** %e0
  %p.W_C = bitcast float** %W_C.slot to i8*
  %e1 = getelementptr i8*, i8** %arg.ptrs, i32 1
  store i8* %p.W_C, i8** %e1
  %p.N_C = bitcast float** %N_C.slot to i8*
  %e2 = getelementptr i8*, i8** %arg.ptrs, i32 2
  store i8* %p.N_C, i8** %e2
  %p.S_C = bitcast float** %S_C.slot to i8*
  %e3 = getelementptr i8*, i8** %arg.ptrs, i32 3
  store i8* %p.S_C, i8** %e3
  %p.J_cuda = bitcast float** %J_cuda.slot to i8*
  %e4 = getelementptr i8*, i8** %arg.ptrs, i32 4
  store i8* %p.J_cuda, i8** %e4
  %p.C_cuda = bitcast float** %C_cuda.slot to i8*
  %e5 = getelementptr i8*, i8** %arg.ptrs, i32 5
  store i8* %p.C_cuda, i8** %e5
  %p.cols = bitcast i32* %cols.slot to i8*
  %e6 = getelementptr i8*, i8** %arg.ptrs, i32 6
  store i8* %p.cols, i8** %e6
  %p.rows = bitcast i32* %rows.slot to i8*
  %e7 = getelementptr i8*, i8** %arg.ptrs, i32 7
  store i8* %p.rows, i8** %e7
  %p.q0sqr = bitcast float* %q0sqr.slot to i8*
  %e8 = getelementptr i8*, i8** %arg.ptrs, i32 8
  store i8* %p.q0sqr, i8** %e8
  ; Fetch grid, block, shared-memory size and stream into the local slots.
  %pop.status = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid, %struct.dim3* %block, i64* %shmem.slot, i8** %stream.slot)
  ; Copy the 12-byte grid dim3 into its { i64, i32 } view and read both parts.
  %grid.dst = bitcast { i64, i32 }* %grid.abi to i8*
  %grid.src = bitcast %struct.dim3* %grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %grid.dst, i8* align 8 %grid.src, i64 12, i1 false)
  %grid.lo64.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.abi, i32 0, i32 0
  %grid.lo64 = load i64, i64* %grid.lo64.ptr, align 8
  %grid.hi32.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.abi, i32 0, i32 1
  %grid.hi32 = load i32, i32* %grid.hi32.ptr, align 8
  ; Same treatment for the block dim3.
  %block.dst = bitcast { i64, i32 }* %block.abi to i8*
  %block.src = bitcast %struct.dim3* %block to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %block.dst, i8* align 8 %block.src, i64 12, i1 false)
  %block.lo64.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.abi, i32 0, i32 0
  %block.lo64 = load i64, i64* %block.lo64.ptr, align 8
  %block.hi32.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.abi, i32 0, i32 1
  %block.hi32 = load i32, i32* %block.hi32.ptr, align 8
  ; Remaining launch parameters, then the launch itself.
  %shmem = load i64, i64* %shmem.slot, align 8
  %stream.raw = load i8*, i8** %stream.slot, align 8
  %stream.typed = bitcast i8* %stream.raw to %struct.CUstream_st*
  %launch.status = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif to i8*), i64 %grid.lo64, i32 %grid.hi32, i64 %block.lo64, i32 %block.hi32, i8** %arg.ptrs, i64 %shmem, %struct.CUstream_st* %stream.typed)
  br label %launch.done

launch.done:                                      ; preds = %entry
  ret void
}

; Writes the pending launch configuration through its four pointer arguments
; (grid dim3, block dim3, i64 shared-memory size, i8* stream), as used by the
; launch stubs in this module. Returns an i32 the stubs do not inspect.
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)

; Arguments as passed by the stubs: function handle, grid as (i64, i32),
; block as (i64, i32), array of argument addresses, shared-memory size, stream.
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)

; LLVM memcpy intrinsic: (dest, src, byte count, is-volatile flag).
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

; Launch stub for @_Z11srad_cuda_2PfS_S_S_S_S_iiff.
; Spills the ten incoming arguments to stack slots, records the address of
; every slot in a ten-entry i8* array, pops the pending call configuration
; into local storage and forwards everything to cudaLaunchKernel, passing this
; function's own address as the first argument. The i32 results of both
; runtime calls are left unchecked.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %lambda, float %q0sqr) #0 {
entry:
  ; Stack homes for the incoming arguments.
  %E_C.slot = alloca float*, align 8
  %W_C.slot = alloca float*, align 8
  %N_C.slot = alloca float*, align 8
  %S_C.slot = alloca float*, align 8
  %J_cuda.slot = alloca float*, align 8
  %C_cuda.slot = alloca float*, align 8
  %cols.slot = alloca i32, align 4
  %rows.slot = alloca i32, align 4
  %lambda.slot = alloca float, align 4
  %q0sqr.slot = alloca float, align 4
  ; Out-parameters filled in by __cudaPopCallConfiguration.
  %grid = alloca %struct.dim3, align 8
  %block = alloca %struct.dim3, align 8
  %shmem.slot = alloca i64, align 8
  %stream.slot = alloca i8*, align 8
  ; Scratch copies of the two dim3 values viewed as { i64, i32 }.
  %grid.abi = alloca { i64, i32 }, align 8
  %block.abi = alloca { i64, i32 }, align 8
  store float* %E_C, float** %E_C.slot, align 8
  store float* %W_C, float** %W_C.slot, align 8
  store float* %N_C, float** %N_C.slot, align 8
  store float* %S_C, float** %S_C.slot, align 8
  store float* %J_cuda, float** %J_cuda.slot, align 8
  store float* %C_cuda, float** %C_cuda.slot, align 8
  store i32 %cols, i32* %cols.slot, align 4
  store i32 %rows, i32* %rows.slot, align 4
  store float %lambda, float* %lambda.slot, align 4
  store float %q0sqr, float* %q0sqr.slot, align 4
  ; Array of ten i8* entries: entry k holds the address of argument k's slot.
  %arg.ptrs = alloca i8*, i64 10, align 16
  %p.E_C = bitcast float** %E_C.slot to i8*
  %e0 = getelementptr i8*, i8** %arg.ptrs, i32 0
  store i8* %p.E_C, i8** %e0
  %p.W_C = bitcast float** %W_C.slot to i8*
  %e1 = getelementptr i8*, i8** %arg.ptrs, i32 1
  store i8* %p.W_C, i8** %e1
  %p.N_C = bitcast float** %N_C.slot to i8*
  %e2 = getelementptr i8*, i8** %arg.ptrs, i32 2
  store i8* %p.N_C, i8** %e2
  %p.S_C = bitcast float** %S_C.slot to i8*
  %e3 = getelementptr i8*, i8** %arg.ptrs, i32 3
  store i8* %p.S_C, i8** %e3
  %p.J_cuda = bitcast float** %J_cuda.slot to i8*
  %e4 = getelementptr i8*, i8** %arg.ptrs, i32 4
  store i8* %p.J_cuda, i8** %e4
  %p.C_cuda = bitcast float** %C_cuda.slot to i8*
  %e5 = getelementptr i8*, i8** %arg.ptrs, i32 5
  store i8* %p.C_cuda, i8** %e5
  %p.cols = bitcast i32* %cols.slot to i8*
  %e6 = getelementptr i8*, i8** %arg.ptrs, i32 6
  store i8* %p.cols, i8** %e6
  %p.rows = bitcast i32* %rows.slot to i8*
  %e7 = getelementptr i8*, i8** %arg.ptrs, i32 7
  store i8* %p.rows, i8** %e7
  %p.lambda = bitcast float* %lambda.slot to i8*
  %e8 = getelementptr i8*, i8** %arg.ptrs, i32 8
  store i8* %p.lambda, i8** %e8
  %p.q0sqr = bitcast float* %q0sqr.slot to i8*
  %e9 = getelementptr i8*, i8** %arg.ptrs, i32 9
  store i8* %p.q0sqr, i8** %e9
  ; Fetch grid, block, shared-memory size and stream into the local slots.
  %pop.status = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid, %struct.dim3* %block, i64* %shmem.slot, i8** %stream.slot)
  ; Copy the 12-byte grid dim3 into its { i64, i32 } view and read both parts.
  %grid.dst = bitcast { i64, i32 }* %grid.abi to i8*
  %grid.src = bitcast %struct.dim3* %grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %grid.dst, i8* align 8 %grid.src, i64 12, i1 false)
  %grid.lo64.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.abi, i32 0, i32 0
  %grid.lo64 = load i64, i64* %grid.lo64.ptr, align 8
  %grid.hi32.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.abi, i32 0, i32 1
  %grid.hi32 = load i32, i32* %grid.hi32.ptr, align 8
  ; Same treatment for the block dim3.
  %block.dst = bitcast { i64, i32 }* %block.abi to i8*
  %block.src = bitcast %struct.dim3* %block to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %block.dst, i8* align 8 %block.src, i64 12, i1 false)
  %block.lo64.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.abi, i32 0, i32 0
  %block.lo64 = load i64, i64* %block.lo64.ptr, align 8
  %block.hi32.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.abi, i32 0, i32 1
  %block.hi32 = load i32, i32* %block.hi32.ptr, align 8
  ; Remaining launch parameters, then the launch itself.
  %shmem = load i64, i64* %shmem.slot, align 8
  %stream.raw = load i8*, i8** %stream.slot, align 8
  %stream.typed = bitcast i8* %stream.raw to %struct.CUstream_st*
  %launch.status = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff to i8*), i64 %grid.lo64, i32 %grid.hi32, i64 %block.lo64, i32 %block.hi32, i8** %arg.ptrs, i64 %shmem, %struct.CUstream_st* %stream.typed)
  br label %launch.done

launch.done:                                      ; preds = %entry
  ret void
}

; Prints the usage text (@.str through @.str.8) to stderr, one fprintf call per
; line, then calls exit(1); control never returns to the caller.
; %argc is stored to a stack slot but not read again; %argv is read only for
; element 0, which fills the %s in the first line.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  ; Header line: "Usage: %s ..." with argv[0].
  %err.hdr = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %argv.cur = load i8**, i8*** %argv.slot, align 8
  %prog.ptr = getelementptr inbounds i8*, i8** %argv.cur, i64 0
  %prog = load i8*, i8** %prog.ptr, align 8
  %n.hdr = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.hdr, i8* getelementptr inbounds ([67 x i8], [67 x i8]* @.str, i64 0, i64 0), i8* %prog)
  ; One line per parameter; stderr is reloaded before every call.
  %err.rows = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.rows = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.rows, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i64 0, i64 0))
  %err.cols = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.cols = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.cols, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.2, i64 0, i64 0))
  %err.y1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.y1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.y1, i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.3, i64 0, i64 0))
  %err.y2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.y2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.y2, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.4, i64 0, i64 0))
  %err.x1 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.x1 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.x1, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.5, i64 0, i64 0))
  %err.x2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.x2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.x2, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.6, i64 0, i64 0))
  %err.lambda = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.lambda = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.lambda, i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.7, i64 0, i64 0))
  %err.iter = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %n.iter = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.iter, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.8, i64 0, i64 0))
  ; Terminate the process with status 1.
  call void @exit(i32 1) #8
  unreachable
}

; Variadic formatted output to a FILE*: (stream, format, ...).
declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #2

; Process termination with the given status; call sites are followed by
; `unreachable`.
; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #3

; Program entry point: calls cudaSetDevice(0), prints the @.str.9 banner with
; the values 16 and 16, passes argc/argv on to @_Z7runTestiPPc and returns 0.
; The results of cudaSetDevice and printf are not examined.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #4 {
entry:
  %ret.slot = alloca i32, align 4
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 0, i32* %ret.slot, align 4
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  %dev.status = call i32 @cudaSetDevice(i32 0)
  %banner.len = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.9, i64 0, i64 0), i32 16, i32 16)
  ; Forward the command line unchanged.
  %argc.val = load i32, i32* %argc.slot, align 4
  %argv.val = load i8**, i8*** %argv.slot, align 8
  call void @_Z7runTestiPPc(i32 %argc.val, i8** %argv.val)
  ret i32 0
}

; Takes an i32 device index and returns an i32; @main calls it with 0.
declare dso_local i32 @cudaSetDevice(i32) #2

; Variadic formatted output to standard output: (format, ...).
declare dso_local i32 @printf(i8*, ...) #2

; Function Attrs: noinline optnone uwtable
define dso_local void @_Z7runTestiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %rows = alloca i32, align 4
  %cols = alloca i32, align 4
  %size_I = alloca i32, align 4
  %size_R = alloca i32, align 4
  %niter = alloca i32, align 4
  %iter = alloca i32, align 4
  %I = alloca float*, align 8
  %J = alloca float*, align 8
  %lambda = alloca float, align 4
  %q0sqr = alloca float, align 4
  %sum = alloca float, align 4
  %sum2 = alloca float, align 4
  %tmp = alloca float, align 4
  %meanROI = alloca float, align 4
  %varROI = alloca float, align 4
  %J_cuda = alloca float*, align 8
  %C_cuda = alloca float*, align 8
  %E_C = alloca float*, align 8
  %W_C = alloca float*, align 8
  %N_C = alloca float*, align 8
  %S_C = alloca float*, align 8
  %r1 = alloca i32, align 4
  %r2 = alloca i32, align 4
  %c1 = alloca i32, align 4
  %c2 = alloca i32, align 4
  %c = alloca float*, align 8
  %k = alloca i32, align 4
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %block_x = alloca i32, align 4
  %block_y = alloca i32, align 4
  %dimBlock = alloca %struct.dim3, align 4
  %dimGrid = alloca %struct.dim3, align 4
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp92 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp92.coerce = alloca { i64, i32 }, align 4
  %agg.tmp95 = alloca %struct.dim3, align 4
  %agg.tmp96 = alloca %struct.dim3, align 4
  %agg.tmp95.coerce = alloca { i64, i32 }, align 4
  %agg.tmp96.coerce = alloca { i64, i32 }, align 4
  %i110 = alloca i32, align 4
  %j114 = alloca i32, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  store i32 10, i32* %niter, align 4
  %0 = load i32, i32* %argc.addr, align 4
  %cmp = icmp eq i32 %0, 9
  br i1 %cmp, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  %1 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %1, i64 1
  %2 = load i8*, i8** %arrayidx, align 8
  %call = call i32 @atoi(i8* %2) #9
  store i32 %call, i32* %rows, align 4
  %3 = load i8**, i8*** %argv.addr, align 8
  %arrayidx1 = getelementptr inbounds i8*, i8** %3, i64 2
  %4 = load i8*, i8** %arrayidx1, align 8
  %call2 = call i32 @atoi(i8* %4) #9
  store i32 %call2, i32* %cols, align 4
  %5 = load i32, i32* %rows, align 4
  %rem = srem i32 %5, 16
  %cmp3 = icmp ne i32 %rem, 0
  br i1 %cmp3, label %if.then6, label %lor.lhs.false

lor.lhs.false:                                    ; preds = %if.then
  %6 = load i32, i32* %cols, align 4
  %rem4 = srem i32 %6, 16
  %cmp5 = icmp ne i32 %rem4, 0
  br i1 %cmp5, label %if.then6, label %if.end

if.then6:                                         ; preds = %lor.lhs.false, %if.then
  %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %call7 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([39 x i8], [39 x i8]* @.str.10, i64 0, i64 0))
  call void @exit(i32 1) #8
  unreachable

if.end:                                           ; preds = %lor.lhs.false
  %8 = load i8**, i8*** %argv.addr, align 8
  %arrayidx8 = getelementptr inbounds i8*, i8** %8, i64 3
  %9 = load i8*, i8** %arrayidx8, align 8
  %call9 = call i32 @atoi(i8* %9) #9
  store i32 %call9, i32* %r1, align 4
  %10 = load i8**, i8*** %argv.addr, align 8
  %arrayidx10 = getelementptr inbounds i8*, i8** %10, i64 4
  %11 = load i8*, i8** %arrayidx10, align 8
  %call11 = call i32 @atoi(i8* %11) #9
  store i32 %call11, i32* %r2, align 4
  %12 = load i8**, i8*** %argv.addr, align 8
  %arrayidx12 = getelementptr inbounds i8*, i8** %12, i64 5
  %13 = load i8*, i8** %arrayidx12, align 8
  %call13 = call i32 @atoi(i8* %13) #9
  store i32 %call13, i32* %c1, align 4
  %14 = load i8**, i8*** %argv.addr, align 8
  %arrayidx14 = getelementptr inbounds i8*, i8** %14, i64 6
  %15 = load i8*, i8** %arrayidx14, align 8
  %call15 = call i32 @atoi(i8* %15) #9
  store i32 %call15, i32* %c2, align 4
  %16 = load i8**, i8*** %argv.addr, align 8
  %arrayidx16 = getelementptr inbounds i8*, i8** %16, i64 7
  %17 = load i8*, i8** %arrayidx16, align 8
  %call17 = call double @atof(i8* %17) #9
  %conv = fptrunc double %call17 to float
  store float %conv, float* %lambda, align 4
  %18 = load i8**, i8*** %argv.addr, align 8
  %arrayidx18 = getelementptr inbounds i8*, i8** %18, i64 8
  %19 = load i8*, i8** %arrayidx18, align 8
  %call19 = call i32 @atoi(i8* %19) #9
  store i32 %call19, i32* %niter, align 4
  br label %if.end20

if.else:                                          ; preds = %entry
  %20 = load i32, i32* %argc.addr, align 4
  %21 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %20, i8** %21)
  br label %if.end20

if.end20:                                         ; preds = %if.else, %if.end
  %22 = load i32, i32* %cols, align 4
  %23 = load i32, i32* %rows, align 4
  %mul = mul nsw i32 %22, %23
  store i32 %mul, i32* %size_I, align 4
  %24 = load i32, i32* %r2, align 4
  %25 = load i32, i32* %r1, align 4
  %sub = sub i32 %24, %25
  %add = add i32 %sub, 1
  %26 = load i32, i32* %c2, align 4
  %27 = load i32, i32* %c1, align 4
  %sub21 = sub i32 %26, %27
  %add22 = add i32 %sub21, 1
  %mul23 = mul i32 %add, %add22
  store i32 %mul23, i32* %size_R, align 4
  %28 = load i32, i32* %size_I, align 4
  %conv24 = sext i32 %28 to i64
  %mul25 = mul i64 %conv24, 4
  %call26 = call noalias i8* @malloc(i64 %mul25) #10
  %29 = bitcast i8* %call26 to float*
  store float* %29, float** %I, align 8
  %30 = load i32, i32* %size_I, align 4
  %conv27 = sext i32 %30 to i64
  %mul28 = mul i64 %conv27, 4
  %call29 = call noalias i8* @malloc(i64 %mul28) #10
  %31 = bitcast i8* %call29 to float*
  store float* %31, float** %J, align 8
  %32 = load i32, i32* %size_I, align 4
  %conv30 = sext i32 %32 to i64
  %mul31 = mul i64 4, %conv30
  %call32 = call noalias i8* @malloc(i64 %mul31) #10
  %33 = bitcast i8* %call32 to float*
  store float* %33, float** %c, align 8
  %34 = bitcast float** %J_cuda to i8**
  %35 = load i32, i32* %size_I, align 4
  %conv33 = sext i32 %35 to i64
  %mul34 = mul i64 4, %conv33
  %call35 = call i32 @cudaMalloc(i8** %34, i64 %mul34)
  %36 = bitcast float** %C_cuda to i8**
  %37 = load i32, i32* %size_I, align 4
  %conv36 = sext i32 %37 to i64
  %mul37 = mul i64 4, %conv36
  %call38 = call i32 @cudaMalloc(i8** %36, i64 %mul37)
  %38 = bitcast float** %E_C to i8**
  %39 = load i32, i32* %size_I, align 4
  %conv39 = sext i32 %39 to i64
  %mul40 = mul i64 4, %conv39
  %call41 = call i32 @cudaMalloc(i8** %38, i64 %mul40)
  %40 = bitcast float** %W_C to i8**
  %41 = load i32, i32* %size_I, align 4
  %conv42 = sext i32 %41 to i64
  %mul43 = mul i64 4, %conv42
  %call44 = call i32 @cudaMalloc(i8** %40, i64 %mul43)
  %42 = bitcast float** %S_C to i8**
  %43 = load i32, i32* %size_I, align 4
  %conv45 = sext i32 %43 to i64
  %mul46 = mul i64 4, %conv45
  %call47 = call i32 @cudaMalloc(i8** %42, i64 %mul46)
  %44 = bitcast float** %N_C to i8**
  %45 = load i32, i32* %size_I, align 4
  %conv48 = sext i32 %45 to i64
  %mul49 = mul i64 4, %conv48
  %call50 = call i32 @cudaMalloc(i8** %44, i64 %mul49)
  %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.11, i64 0, i64 0))
  %46 = load float*, float** %I, align 8
  %47 = load i32, i32* %rows, align 4
  %48 = load i32, i32* %cols, align 4
  call void @_Z13random_matrixPfii(float* %46, i32 %47, i32 %48)
  store i32 0, i32* %k, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %if.end20
  %49 = load i32, i32* %k, align 4
  %50 = load i32, i32* %size_I, align 4
  %cmp52 = icmp slt i32 %49, %50
  br i1 %cmp52, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %51 = load float*, float** %I, align 8
  %52 = load i32, i32* %k, align 4
  %idxprom = sext i32 %52 to i64
  %arrayidx53 = getelementptr inbounds float, float* %51, i64 %idxprom
  %53 = load float, float* %arrayidx53, align 4
  %call54 = call float @_ZSt3expf(float %53)
  %54 = load float*, float** %J, align 8
  %55 = load i32, i32* %k, align 4
  %idxprom55 = sext i32 %55 to i64
  %arrayidx56 = getelementptr inbounds float, float* %54, i64 %idxprom55
  store float %call54, float* %arrayidx56, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %56 = load i32, i32* %k, align 4
  %inc = add nsw i32 %56, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %call57 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.12, i64 0, i64 0))
  store i32 0, i32* %iter, align 4
  br label %for.cond58

for.cond58:                                       ; preds = %for.inc105, %for.end
  %57 = load i32, i32* %iter, align 4
  %58 = load i32, i32* %niter, align 4
  %cmp59 = icmp slt i32 %57, %58
  br i1 %cmp59, label %for.body60, label %for.end107

for.body60:                                       ; preds = %for.cond58
  store float 0.000000e+00, float* %sum, align 4
  store float 0.000000e+00, float* %sum2, align 4
  %59 = load i32, i32* %r1, align 4
  store i32 %59, i32* %i, align 4
  br label %for.cond61

for.cond61:                                       ; preds = %for.inc77, %for.body60
  %60 = load i32, i32* %i, align 4
  %61 = load i32, i32* %r2, align 4
  %cmp62 = icmp ule i32 %60, %61
  br i1 %cmp62, label %for.body63, label %for.end79

for.body63:                                       ; preds = %for.cond61
  %62 = load i32, i32* %c1, align 4
  store i32 %62, i32* %j, align 4
  br label %for.cond64

for.cond64:                                       ; preds = %for.inc74, %for.body63
  %63 = load i32, i32* %j, align 4
  %64 = load i32, i32* %c2, align 4
  %cmp65 = icmp ule i32 %63, %64
  br i1 %cmp65, label %for.body66, label %for.end76

for.body66:                                       ; preds = %for.cond64
  %65 = load float*, float** %J, align 8
  %66 = load i32, i32* %i, align 4
  %67 = load i32, i32* %cols, align 4
  %mul67 = mul nsw i32 %66, %67
  %68 = load i32, i32* %j, align 4
  %add68 = add nsw i32 %mul67, %68
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds float, float* %65, i64 %idxprom69
  %69 = load float, float* %arrayidx70, align 4
  store float %69, float* %tmp, align 4
  %70 = load float, float* %tmp, align 4
  %71 = load float, float* %sum, align 4
  %add71 = fadd contract float %71, %70
  store float %add71, float* %sum, align 4
  %72 = load float, float* %tmp, align 4
  %73 = load float, float* %tmp, align 4
  %mul72 = fmul contract float %72, %73
  %74 = load float, float* %sum2, align 4
  %add73 = fadd contract float %74, %mul72
  store float %add73, float* %sum2, align 4
  br label %for.inc74

for.inc74:                                        ; preds = %for.body66
  %75 = load i32, i32* %j, align 4
  %inc75 = add nsw i32 %75, 1
  store i32 %inc75, i32* %j, align 4
  br label %for.cond64

for.end76:                                        ; preds = %for.cond64
  br label %for.inc77

for.inc77:                                        ; preds = %for.end76
  %76 = load i32, i32* %i, align 4
  %inc78 = add nsw i32 %76, 1
  store i32 %inc78, i32* %i, align 4
  br label %for.cond61

for.end79:                                        ; preds = %for.cond61
  %77 = load float, float* %sum, align 4
  %78 = load i32, i32* %size_R, align 4
  %conv80 = sitofp i32 %78 to float
  %div = fdiv float %77, %conv80
  store float %div, float* %meanROI, align 4
  %79 = load float, float* %sum2, align 4
  %80 = load i32, i32* %size_R, align 4
  %conv81 = sitofp i32 %80 to float
  %div82 = fdiv float %79, %conv81
  %81 = load float, float* %meanROI, align 4
  %82 = load float, float* %meanROI, align 4
  %mul83 = fmul contract float %81, %82
  %sub84 = fsub contract float %div82, %mul83
  store float %sub84, float* %varROI, align 4
  %83 = load float, float* %varROI, align 4
  %84 = load float, float* %meanROI, align 4
  %85 = load float, float* %meanROI, align 4
  %mul85 = fmul contract float %84, %85
  %div86 = fdiv float %83, %mul85
  store float %div86, float* %q0sqr, align 4
  %86 = load i32, i32* %cols, align 4
  %div87 = sdiv i32 %86, 16
  store i32 %div87, i32* %block_x, align 4
  %87 = load i32, i32* %rows, align 4
  %div88 = sdiv i32 %87, 16
  store i32 %div88, i32* %block_y, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1)
  %88 = load i32, i32* %block_x, align 4
  %89 = load i32, i32* %block_y, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %88, i32 %89, i32 1)
  %90 = load float*, float** %J_cuda, align 8
  %91 = bitcast float* %90 to i8*
  %92 = load float*, float** %J, align 8
  %93 = bitcast float* %92 to i8*
  %94 = load i32, i32* %size_I, align 4
  %conv89 = sext i32 %94 to i64
  %mul90 = mul i64 4, %conv89
  %call91 = call i32 @cudaMemcpy(i8* %91, i8* %93, i64 %mul90, i32 1)
  %95 = bitcast %struct.dim3* %agg.tmp to i8*
  %96 = bitcast %struct.dim3* %dimGrid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %95, i8* align 4 %96, i64 12, i1 false)
  %97 = bitcast %struct.dim3* %agg.tmp92 to i8*
  %98 = bitcast %struct.dim3* %dimBlock to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %97, i8* align 4 %98, i64 12, i1 false)
  %99 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %100 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %99, i8* align 4 %100, i64 12, i1 false)
  %101 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %102 = load i64, i64* %101, align 4
  %103 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %104 = load i32, i32* %103, align 4
  %105 = bitcast { i64, i32 }* %agg.tmp92.coerce to i8*
  %106 = bitcast %struct.dim3* %agg.tmp92 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %105, i8* align 4 %106, i64 12, i1 false)
  %107 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp92.coerce, i32 0, i32 0
  %108 = load i64, i64* %107, align 4
  %109 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp92.coerce, i32 0, i32 1
  %110 = load i32, i32* %109, align 4
  %call93 = call i32 @__cudaPushCallConfiguration(i64 %102, i32 %104, i64 %108, i32 %110, i64 0, i8* null)
  %tobool = icmp ne i32 %call93, 0
  br i1 %tobool, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %for.end79
  %111 = load float*, float** %E_C, align 8
  %112 = load float*, float** %W_C, align 8
  %113 = load float*, float** %N_C, align 8
  %114 = load float*, float** %S_C, align 8
  %115 = load float*, float** %J_cuda, align 8
  %116 = load float*, float** %C_cuda, align 8
  %117 = load i32, i32* %cols, align 4
  %118 = load i32, i32* %rows, align 4
  %119 = load float, float* %q0sqr, align 4
  call void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %111, float* %112, float* %113, float* %114, float* %115, float* %116, i32 %117, i32 %118, float %119)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %for.end79
  %call94 = call i32 @cudaThreadSynchronize()
  %120 = bitcast %struct.dim3* %agg.tmp95 to i8*
  %121 = bitcast %struct.dim3* %dimGrid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %120, i8* align 4 %121, i64 12, i1 false)
  %122 = bitcast %struct.dim3* %agg.tmp96 to i8*
  %123 = bitcast %struct.dim3* %dimBlock to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %122, i8* align 4 %123, i64 12, i1 false)
  %124 = bitcast { i64, i32 }* %agg.tmp95.coerce to i8*
  %125 = bitcast %struct.dim3* %agg.tmp95 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %124, i8* align 4 %125, i64 12, i1 false)
  %126 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp95.coerce, i32 0, i32 0
  %127 = load i64, i64* %126, align 4
  %128 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp95.coerce, i32 0, i32 1
  %129 = load i32, i32* %128, align 4
  %130 = bitcast { i64, i32 }* %agg.tmp96.coerce to i8*
  %131 = bitcast %struct.dim3* %agg.tmp96 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %130, i8* align 4 %131, i64 12, i1 false)
  %132 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp96.coerce, i32 0, i32 0
  %133 = load i64, i64* %132, align 4
  %134 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp96.coerce, i32 0, i32 1
  %135 = load i32, i32* %134, align 4
  %call97 = call i32 @__cudaPushCallConfiguration(i64 %127, i32 %129, i64 %133, i32 %135, i64 0, i8* null)
  %tobool98 = icmp ne i32 %call97, 0
  br i1 %tobool98, label %kcall.end100, label %kcall.configok99

kcall.configok99:                                 ; preds = %kcall.end
  %136 = load float*, float** %E_C, align 8
  %137 = load float*, float** %W_C, align 8
  %138 = load float*, float** %N_C, align 8
  %139 = load float*, float** %S_C, align 8
  %140 = load float*, float** %J_cuda, align 8
  %141 = load float*, float** %C_cuda, align 8
  %142 = load i32, i32* %cols, align 4
  %143 = load i32, i32* %rows, align 4
  %144 = load float, float* %lambda, align 4
  %145 = load float, float* %q0sqr, align 4
  call void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %136, float* %137, float* %138, float* %139, float* %140, float* %141, i32 %142, i32 %143, float %144, float %145)
  br label %kcall.end100

kcall.end100:                                     ; preds = %kcall.configok99, %kcall.end
  %call101 = call i32 @cudaThreadSynchronize()
  %146 = load float*, float** %J, align 8
  %147 = bitcast float* %146 to i8*
  %148 = load float*, float** %J_cuda, align 8
  %149 = bitcast float* %148 to i8*
  %150 = load i32, i32* %size_I, align 4
  %conv102 = sext i32 %150 to i64
  %mul103 = mul i64 4, %conv102
  %call104 = call i32 @cudaMemcpy(i8* %147, i8* %149, i64 %mul103, i32 2)
  br label %for.inc105

for.inc105:                                       ; preds = %kcall.end100
  %151 = load i32, i32* %iter, align 4
  %inc106 = add nsw i32 %151, 1
  store i32 %inc106, i32* %iter, align 4
  br label %for.cond58

for.end107:                                       ; preds = %for.cond58
  %call108 = call i32 @cudaThreadSynchronize()
  %call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.13, i64 0, i64 0))
  store i32 0, i32* %i110, align 4
  br label %for.cond111

for.cond111:                                      ; preds = %for.inc128, %for.end107
  %152 = load i32, i32* %i110, align 4
  %cmp112 = icmp slt i32 %152, 20
  br i1 %cmp112, label %for.body113, label %for.end130

for.body113:                                      ; preds = %for.cond111
  store i32 0, i32* %j114, align 4
  br label %for.cond115

for.cond115:                                      ; preds = %for.inc124, %for.body113
  %153 = load i32, i32* %j114, align 4
  %cmp116 = icmp slt i32 %153, 20
  br i1 %cmp116, label %for.body117, label %for.end126

for.body117:                                      ; preds = %for.cond115
  %154 = load float*, float** %J, align 8
  %155 = load i32, i32* %i110, align 4
  %156 = load i32, i32* %cols, align 4
  %mul118 = mul nsw i32 %155, %156
  %157 = load i32, i32* %j114, align 4
  %add119 = add nsw i32 %mul118, %157
  %idxprom120 = sext i32 %add119 to i64
  %arrayidx121 = getelementptr inbounds float, float* %154, i64 %idxprom120
  %158 = load float, float* %arrayidx121, align 4
  %conv122 = fpext float %158 to double
  %call123 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.14, i64 0, i64 0), double %conv122)
  br label %for.inc124

for.inc124:                                       ; preds = %for.body117
  %159 = load i32, i32* %j114, align 4
  %inc125 = add nsw i32 %159, 1
  store i32 %inc125, i32* %j114, align 4
  br label %for.cond115

for.end126:                                       ; preds = %for.cond115
  %call127 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.15, i64 0, i64 0))
  br label %for.inc128

for.inc128:                                       ; preds = %for.end126
  %160 = load i32, i32* %i110, align 4
  %inc129 = add nsw i32 %160, 1
  store i32 %inc129, i32* %i110, align 4
  br label %for.cond111

for.end130:                                       ; preds = %for.cond111
  %call131 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.16, i64 0, i64 0))
  %161 = load float*, float** %I, align 8
  %162 = bitcast float* %161 to i8*
  call void @free(i8* %162) #10
  %163 = load float*, float** %J, align 8
  %164 = bitcast float* %163 to i8*
  call void @free(i8* %164) #10
  %165 = load float*, float** %C_cuda, align 8
  %166 = bitcast float* %165 to i8*
  %call132 = call i32 @cudaFree(i8* %166)
  %167 = load float*, float** %J_cuda, align 8
  %168 = bitcast float* %167 to i8*
  %call133 = call i32 @cudaFree(i8* %168)
  %169 = load float*, float** %E_C, align 8
  %170 = bitcast float* %169 to i8*
  %call134 = call i32 @cudaFree(i8* %170)
  %171 = load float*, float** %W_C, align 8
  %172 = bitcast float* %171 to i8*
  %call135 = call i32 @cudaFree(i8* %172)
  %173 = load float*, float** %N_C, align 8
  %174 = bitcast float* %173 to i8*
  %call136 = call i32 @cudaFree(i8* %174)
  %175 = load float*, float** %S_C, align 8
  %176 = bitcast float* %175 to i8*
  %call137 = call i32 @cudaFree(i8* %176)
  %177 = load float*, float** %c, align 8
  %178 = bitcast float* %177 to i8*
  call void @free(i8* %178) #10
  ret void
}

; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #5

; Function Attrs: nounwind readonly
declare dso_local double @atof(i8*) #5

; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #6

declare dso_local i32 @cudaMalloc(i8**, i64) #2

; random_matrix(float*, int, int): seeds the C PRNG with the constant 7, then
; writes rand() / 2^31 into I[r * cols + c] for every r in [0, rows) and
; c in [0, cols), visiting elements row by row.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z13random_matrixPfii(float* %I, i32 %rows, i32 %cols) #7 {
entry:
  ; Spill the arguments and reserve the two loop counters on the stack.
  %matrix.slot = alloca float*, align 8
  %nrows.slot = alloca i32, align 4
  %ncols.slot = alloca i32, align 4
  %row = alloca i32, align 4
  %col = alloca i32, align 4
  store float* %I, float** %matrix.slot, align 8
  store i32 %rows, i32* %nrows.slot, align 4
  store i32 %cols, i32* %ncols.slot, align 4
  ; Fixed seed, so every run produces the same sequence of rand() values.
  call void @srand(i32 7) #10
  store i32 0, i32* %row, align 4
  br label %row.check

row.check:                                        ; preds = %row.next, %entry
  %row.cur = load i32, i32* %row, align 4
  %row.limit = load i32, i32* %nrows.slot, align 4
  %row.in.range = icmp slt i32 %row.cur, %row.limit
  br i1 %row.in.range, label %row.body, label %done

row.body:                                         ; preds = %row.check
  store i32 0, i32* %col, align 4
  br label %col.check

col.check:                                        ; preds = %col.body, %row.body
  %col.cur = load i32, i32* %col, align 4
  %col.limit = load i32, i32* %ncols.slot, align 4
  %col.in.range = icmp slt i32 %col.cur, %col.limit
  br i1 %col.in.range, label %col.body, label %row.next

col.body:                                         ; preds = %col.check
  ; 0x41E0000000000000 is 2^31 (2147483648.0); presumably RAND_MAX rounded to
  ; float in the original source -- confirm against srad.cu.
  %sample = call i32 @rand() #10
  %sample.f = sitofp i32 %sample to float
  %scaled = fdiv float %sample.f, 0x41E0000000000000
  ; Flat index: row * cols + col, computed in i32 and then sign-extended.
  %base = load float*, float** %matrix.slot, align 8
  %r = load i32, i32* %row, align 4
  %stride = load i32, i32* %ncols.slot, align 4
  %row.off = mul nsw i32 %r, %stride
  %c = load i32, i32* %col, align 4
  %flat = add nsw i32 %row.off, %c
  %flat.wide = sext i32 %flat to i64
  %elem = getelementptr inbounds float, float* %base, i64 %flat.wide
  store float %scaled, float* %elem, align 4
  ; Advance to the next column.
  %col.prev = load i32, i32* %col, align 4
  %col.succ = add nsw i32 %col.prev, 1
  store i32 %col.succ, i32* %col, align 4
  br label %col.check

row.next:                                         ; preds = %col.check
  ; Inner loop finished: advance to the next row.
  %row.prev = load i32, i32* %row, align 4
  %row.succ = add nsw i32 %row.prev, 1
  store i32 %row.succ, i32* %row, align 4
  br label %row.check

done:                                             ; preds = %row.check
  ret void
}

; std::exp(float): forwards its argument to the C library expf and returns
; the result unchanged.
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local float @_ZSt3expf(float %__x) #7 comdat {
entry:
  ; The argument is spilled to a stack slot and reloaded before the call.
  %arg.slot = alloca float, align 4
  store float %__x, float* %arg.slot, align 4
  %arg = load float, float* %arg.slot, align 4
  %result = call float @expf(float %arg) #10
  ret float %result
}

; dim3::dim3(unsigned, unsigned, unsigned): stores %vx, %vy and %vz into
; fields 0, 1 and 2 of the %struct.dim3 object pointed to by %this.
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #7 comdat align 2 {
entry:
  ; Spill all four arguments to stack slots.
  %self.slot = alloca %struct.dim3*, align 8
  %vx.slot = alloca i32, align 4
  %vy.slot = alloca i32, align 4
  %vz.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %self.slot, align 8
  store i32 %vx, i32* %vx.slot, align 4
  store i32 %vy, i32* %vy.slot, align 4
  store i32 %vz, i32* %vz.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %self.slot, align 8
  ; Field 0 <- vx
  %x.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  %x.val = load i32, i32* %vx.slot, align 4
  store i32 %x.val, i32* %x.field, align 4
  ; Field 1 <- vy
  %y.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  %y.val = load i32, i32* %vy.slot, align 4
  store i32 %y.val, i32* %y.field, align 4
  ; Field 2 <- vz
  %z.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  %z.val = load i32, i32* %vz.slot, align 4
  store i32 %z.val, i32* %z.field, align 4
  ret void
}

declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #2

declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #2

declare dso_local i32 @cudaThreadSynchronize() #2

; Function Attrs: nounwind
declare dso_local void @free(i8*) #6

declare dso_local i32 @cudaFree(i8*) #2

; Function Attrs: nounwind
declare dso_local void @srand(i32) #6

; Function Attrs: nounwind
declare dso_local i32 @rand() #6

; Function Attrs: nounwind
declare dso_local float @expf(float) #6

; Registers the two kernel host stubs of this module with the CUDA runtime.
; %0 is passed through unchanged as the first argument of both
; __cudaRegisterFunction calls; the i32 results of both calls are ignored.
; NOTE(review): @0 and @1 are defined outside this chunk; presumably they hold
; the device-side kernel names -- confirm against the global definitions.
define internal void @__cuda_register_globals(i8** %0) {
entry:
  ; Stub for srad_cuda_1; @0 (31 bytes) is passed as both string arguments.
  %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif to i8*), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([31 x i8], [31 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ; Stub for srad_cuda_2; @1 (32 bytes) is passed as both string arguments.
  %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff to i8*), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; Module set-up routine for the embedded GPU binary. The i8* parameter %0 is
; never read. The call order below matters: the handle must exist before the
; kernels are registered, and registration is closed before the teardown hook
; is installed.
; NOTE(review): presumably invoked via @llvm.global_ctors, which is not
; visible in this chunk -- confirm.
define internal void @__cuda_module_ctor(i8* %0) {
entry:
  ; Hand @__cuda_fatbin_wrapper to the runtime and keep the returned handle.
  %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  ; Saved globally so that __cuda_module_dtor can load it later.
  store i8** %1, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %1)
  call void @__cudaRegisterFatBinaryEnd(i8** %1)
  ; Schedule __cuda_module_dtor with atexit; the returned status is ignored.
  %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

declare dso_local void @__cudaUnregisterFatBinary(i8**)

; Module teardown: loads the handle stored in @__cuda_gpubin_handle and passes
; it to __cudaUnregisterFatBinary. The i8* parameter %0 is never read.
define internal void @__cuda_module_dtor(i8* %0) {
entry:
  %gpubin.handle = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %gpubin.handle)
  ret void
}

declare dso_local i32 @atexit(void (i8*)*)

attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { noreturn nounwind }
attributes #9 = { nounwind readonly }
attributes #10 = { nounwind }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}