CuPBoP/examples/hotspot/hotspot-host-x86_64-unknown...

; ModuleID = 'hotspot-host-x86_64-unknown-linux-gnu.bc'
source_filename = "hotspot.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque

$_ZN4dim3C2Ejjj = comdat any

@t_chip = dso_local global float 0x3F40624DE0000000, align 4
@chip_height = dso_local global float 0x3F90624DE0000000, align 4
@chip_width = dso_local global float 0x3F90624DE0000000, align 4
@amb_temp = dso_local global float 8.000000e+01, align 4
@stderr = external dso_local global %struct._IO_FILE*, align 8
@.str = private unnamed_addr constant [11 x i8] c"error: %s\0A\00", align 1
@.str.1 = private unnamed_addr constant [2 x i8] c"w\00", align 1
@.str.2 = private unnamed_addr constant [25 x i8] c"The file was not opened\0A\00", align 1
@.str.3 = private unnamed_addr constant [7 x i8] c"%d\09%g\0A\00", align 1
@.str.4 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str.5 = private unnamed_addr constant [25 x i8] c"not enough lines in file\00", align 1
@.str.6 = private unnamed_addr constant [3 x i8] c"%f\00", align 1
@.str.7 = private unnamed_addr constant [20 x i8] c"invalid file format\00", align 1
@.str.8 = private unnamed_addr constant [100 x i8] c"Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> <temp_file> <power_file> <output_file>\0A\00", align 1
@.str.9 = private unnamed_addr constant [78 x i8] c"\09<grid_rows/grid_cols>  - number of rows/cols in the grid (positive integer)\0A\00", align 1
@.str.10 = private unnamed_addr constant [53 x i8] c"\09<pyramid_height> - pyramid heigh(positive integer)\0A\00", align 1
@.str.11 = private unnamed_addr constant [38 x i8] c"\09<sim_time>   - number of iterations\0A\00", align 1
@.str.12 = private unnamed_addr constant [89 x i8] c"\09<temp_file>  - name of the file containing the initial temperature values of each cell\0A\00", align 1
@.str.13 = private unnamed_addr constant [86 x i8] c"\09<power_file> - name of the file containing the dissipated power values of each cell\0A\00", align 1
@.str.14 = private unnamed_addr constant [42 x i8] c"\09<output_file> - name of the output file\0A\00", align 1
@.str.15 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1
@.str.16 = private unnamed_addr constant [26 x i8] c"unable to allocate memory\00", align 1
@.str.17 = private unnamed_addr constant [94 x i8] c"pyramidHeight: %d\0AgridSize: [%d, %d]\0Aborder:[%d, %d]\0AblockGrid:[%d, %d]\0AtargetBlock:[%d, %d]\0A\00", align 1
@.str.18 = private unnamed_addr constant [43 x i8] c"Start computing the transient temperature\0A\00", align 1
@.str.19 = private unnamed_addr constant [19 x i8] c"Ending simulation\0A\00", align 1
@0 = private unnamed_addr constant [36 x i8] c"_Z14calculate_tempiPfS_S_iiiiffffff\00", align 1
@1 = private constant [35409 x i8] c"P\EDU\BA\01\00\10\00@\8A\00\00\00\00\00\00\02\00\01\01@\00\00\00\A8v\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\00v\00\00\00\00\00\00\80s\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z14calculate_tempiPfS_S_iiiiffffff\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00blockIdx\00threadIdx\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm20_rcp_rn_f32_slowpath\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda__196\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda__198\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t__200\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00V\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AD\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E7\00\00\00\01\00\09\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F0\00\00\00\01\00\09\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FA\00\00\00\22\00\07\00`]\00\00\00\00\00\00 
\04\00\00\00\00\00\00?\01\00\00\22\00\07\00\80a\00\00\00\00\00\00`\01\00\00\00\00\00\00\81\01\00\00\22\00\07\00\E0b\00\00\00\00\00\00`\08\00\00\00\00\00\00z\02\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@k\00\00\00\00\00\00\04/\08\00\0A\00\00\00\17\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00\00\00\00\00\04\11\08\00\07\00\00\00\00\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\C0\00\00\00\04\11\08\00\0A\00\00\00\C0\00\00\00\010\00\00\01*\00\00\04\0A\08\00\09\00\00\00@\01H\00\03\19H\00\04\17\0C\00\00\00\00\00\0D\00D\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0C\00@\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0B\00<\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0A\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\09\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\08\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\00,\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\00(\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0\11\00\03\1B\FF\00\04\1D\08\00h\09\00\00\D8\09\00\00\04\1C\04\00X]\00\00\04\1E\04\00\B0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([35409 x i8], [35409 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
@__cuda_gpubin_handle = internal global i8** null, align 8
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]

; fatal(char *s): formats the message %s with "error: %s\n" and writes it to
; stderr through fprintf. Control returns to the caller afterwards; nothing in
; this function terminates the process.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5fatalPc(i8* %s) #0 {
entry:
  ; Spill the incoming message pointer to a stack slot.
  %msg.slot = alloca i8*, align 8
  store i8* %s, i8** %msg.slot, align 8
  ; Fetch the current stderr stream, then reload the message pointer.
  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %1 = load i8*, i8** %msg.slot, align 8
  ; The fprintf result is not inspected.
  %printed = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i8* %1)
  ret void
}

declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1

; writeoutput(float *vect, int grid_rows, int grid_cols, char *file)
;
; Opens %file in "w" mode and writes one line per element of %vect, visiting
; elements in row-major order (index i * grid_cols + j). Each line is produced
; with the format "%d\t%g\n", where the integer is a running counter starting
; at 0 and the value is the element widened to double.
;
; If fopen fails, "The file was not opened\n" is printed and the function
; returns immediately. Without that early return the null stream would be
; handed to fputs and fclose.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11writeoutputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 {
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %index = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  %str = alloca [256 x i8], align 16
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  ; Running output line counter.
  store i32 0, i32* %index, align 4
  %0 = load i8*, i8** %file.addr, align 8
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  ; Open failed: report it and stop here so the null stream is never used.
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0))
  ret void

if.end:                                           ; preds = %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc10, %if.end
  ; Outer loop: i in [0, grid_rows).
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %cmp2 = icmp slt i32 %1, %2
  br i1 %cmp2, label %for.body, label %for.end12

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond3

for.cond3:                                        ; preds = %for.inc, %for.body
  ; Inner loop: j in [0, grid_cols).
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %cmp4 = icmp slt i32 %3, %4
  br i1 %cmp4, label %for.body5, label %for.end

for.body5:                                        ; preds = %for.cond3
  ; Format "<index>\t<vect[i * grid_cols + j]>\n" into the 256-byte buffer.
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %5 = load i32, i32* %index, align 4
  %6 = load float*, float** %vect.addr, align 8
  %7 = load i32, i32* %i, align 4
  %8 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %7, %8
  %9 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %9
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds float, float* %6, i64 %idxprom
  %10 = load float, float* %arrayidx, align 4
  %conv = fpext float %10 to double
  %call6 = call i32 (i8*, i8*, ...) @sprintf(i8* %arraydecay, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.3, i64 0, i64 0), i32 %5, double %conv) #8
  ; Emit the formatted line and advance the line counter.
  %arraydecay7 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %11 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call8 = call i32 @fputs(i8* %arraydecay7, %struct._IO_FILE* %11)
  %12 = load i32, i32* %index, align 4
  %inc = add nsw i32 %12, 1
  store i32 %inc, i32* %index, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body5
  %13 = load i32, i32* %j, align 4
  %inc9 = add nsw i32 %13, 1
  store i32 %inc9, i32* %j, align 4
  br label %for.cond3

for.end:                                          ; preds = %for.cond3
  br label %for.inc10

for.inc10:                                        ; preds = %for.end
  %14 = load i32, i32* %i, align 4
  %inc11 = add nsw i32 %14, 1
  store i32 %inc11, i32* %i, align 4
  br label %for.cond

for.end12:                                        ; preds = %for.cond
  ; Only reached with a successfully opened stream.
  %15 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call13 = call i32 @fclose(%struct._IO_FILE* %15)
  ret void
}

declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #1

declare dso_local i32 @printf(i8*, ...) #1

; Function Attrs: nounwind
declare dso_local i32 @sprintf(i8*, i8*, ...) #2

declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #1

declare dso_local i32 @fclose(%struct._IO_FILE*) #1

; readinput(float *vect, int grid_rows, int grid_cols, char *file)
;
; Opens %file in "r" mode and, for every (i, j) with i in [0, grid_rows) and
; j in [0, grid_cols), reads one line with fgets (at most 256 bytes), parses a
; float from it with sscanf("%f") and stores it at vect[i * grid_cols + j].
;
; Diagnostics:
;   - fopen failure prints "The file was not opened\n" and the function returns
;     immediately. Without that early return the null stream would be handed
;     to fgets, feof and fclose.
;   - feof() being set after a read reports "not enough lines in file" through
;     fatal(); a sscanf result other than 1 reports "invalid file format".
;     The fatal() defined in this module only prints, so the loop continues
;     after either report.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z9readinputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 {
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  %str = alloca [256 x i8], align 16
  %val = alloca float, align 4
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  %0 = load i8*, i8** %file.addr, align 8
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  ; Open failed: report it and stop here so the null stream is never used.
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0))
  ret void

if.end:                                           ; preds = %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc16, %if.end
  ; Outer loop: continue while i <= grid_rows - 1.
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %sub = sub nsw i32 %2, 1
  %cmp2 = icmp sle i32 %1, %sub
  br i1 %cmp2, label %for.body, label %for.end18

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond3

for.cond3:                                        ; preds = %for.inc, %for.body
  ; Inner loop: continue while j <= grid_cols - 1.
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %sub4 = sub nsw i32 %4, 1
  %cmp5 = icmp sle i32 %3, %sub4
  br i1 %cmp5, label %for.body6, label %for.end

for.body6:                                        ; preds = %for.cond3
  ; Read the next line; the fgets result itself is not inspected, end-of-file
  ; is detected through feof() instead.
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %5 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call7 = call i8* @fgets(i8* %arraydecay, i32 256, %struct._IO_FILE* %5)
  %6 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call8 = call i32 @feof(%struct._IO_FILE* %6) #8
  %tobool = icmp ne i32 %call8, 0
  br i1 %tobool, label %if.then9, label %if.end10

if.then9:                                         ; preds = %for.body6
  call void @_Z5fatalPc(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.5, i64 0, i64 0))
  br label %if.end10

if.end10:                                         ; preds = %if.then9, %for.body6
  ; Parse a single float out of the line buffer.
  %arraydecay11 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %call12 = call i32 (i8*, i8*, ...) @sscanf(i8* %arraydecay11, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), float* %val) #8
  %cmp13 = icmp ne i32 %call12, 1
  br i1 %cmp13, label %if.then14, label %if.end15

if.then14:                                        ; preds = %if.end10
  call void @_Z5fatalPc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.7, i64 0, i64 0))
  br label %if.end15

if.end15:                                         ; preds = %if.then14, %if.end10
  ; vect[i * grid_cols + j] = val
  %7 = load float, float* %val, align 4
  %8 = load float*, float** %vect.addr, align 8
  %9 = load i32, i32* %i, align 4
  %10 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %9, %10
  %11 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %11
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds float, float* %8, i64 %idxprom
  store float %7, float* %arrayidx, align 4
  br label %for.inc

for.inc:                                          ; preds = %if.end15
  %12 = load i32, i32* %j, align 4
  %inc = add nsw i32 %12, 1
  store i32 %inc, i32* %j, align 4
  br label %for.cond3

for.end:                                          ; preds = %for.cond3
  br label %for.inc16

for.inc16:                                        ; preds = %for.end
  %13 = load i32, i32* %i, align 4
  %inc17 = add nsw i32 %13, 1
  store i32 %inc17, i32* %i, align 4
  br label %for.cond

for.end18:                                        ; preds = %for.cond
  ; Only reached with a successfully opened stream.
  %14 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call19 = call i32 @fclose(%struct._IO_FILE* %14)
  ret void
}

declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #1

; Function Attrs: nounwind
declare dso_local i32 @feof(%struct._IO_FILE*) #2

; Function Attrs: nounwind
declare dso_local i32 @sscanf(i8*, i8*, ...) #2

; Host-side launch stub for the calculate_temp kernel.
;
; The stub copies each of its 14 arguments into a stack slot, builds an array
; of 14 pointers to those slots, retrieves the launch configuration with
; __cudaPopCallConfiguration, and calls cudaLaunchKernel using this function's
; own address as the kernel handle. The value returned by cudaLaunchKernel is
; not inspected.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
  ; One stack slot per kernel argument.
  %iteration.addr = alloca i32, align 4
  %power.addr = alloca float*, align 8
  %temp_src.addr = alloca float*, align 8
  %temp_dst.addr = alloca float*, align 8
  %grid_cols.addr = alloca i32, align 4
  %grid_rows.addr = alloca i32, align 4
  %border_cols.addr = alloca i32, align 4
  %border_rows.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %step.addr = alloca float, align 4
  %time_elapsed.addr = alloca float, align 4
  ; Out-parameters filled in by __cudaPopCallConfiguration.
  %grid_dim = alloca %struct.dim3, align 8
  %block_dim = alloca %struct.dim3, align 8
  %shmem_size = alloca i64, align 8
  %stream = alloca i8*, align 8
  ; Scratch copies used to pass each 12-byte dim3 as an { i64, i32 } pair.
  %grid_dim.coerce = alloca { i64, i32 }, align 8
  %block_dim.coerce = alloca { i64, i32 }, align 8
  store i32 %iteration, i32* %iteration.addr, align 4
  store float* %power, float** %power.addr, align 8
  store float* %temp_src, float** %temp_src.addr, align 8
  store float* %temp_dst, float** %temp_dst.addr, align 8
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %border_cols, i32* %border_cols.addr, align 4
  store i32 %border_rows, i32* %border_rows.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %step, float* %step.addr, align 4
  store float %time_elapsed, float* %time_elapsed.addr, align 4
  ; kernel_args[k] points at the stack slot of the k-th argument, in
  ; declaration order (0 = iteration ... 13 = time_elapsed).
  %kernel_args = alloca i8*, i64 14, align 16
  %0 = bitcast i32* %iteration.addr to i8*
  %1 = getelementptr i8*, i8** %kernel_args, i32 0
  store i8* %0, i8** %1
  %2 = bitcast float** %power.addr to i8*
  %3 = getelementptr i8*, i8** %kernel_args, i32 1
  store i8* %2, i8** %3
  %4 = bitcast float** %temp_src.addr to i8*
  %5 = getelementptr i8*, i8** %kernel_args, i32 2
  store i8* %4, i8** %5
  %6 = bitcast float** %temp_dst.addr to i8*
  %7 = getelementptr i8*, i8** %kernel_args, i32 3
  store i8* %6, i8** %7
  %8 = bitcast i32* %grid_cols.addr to i8*
  %9 = getelementptr i8*, i8** %kernel_args, i32 4
  store i8* %8, i8** %9
  %10 = bitcast i32* %grid_rows.addr to i8*
  %11 = getelementptr i8*, i8** %kernel_args, i32 5
  store i8* %10, i8** %11
  %12 = bitcast i32* %border_cols.addr to i8*
  %13 = getelementptr i8*, i8** %kernel_args, i32 6
  store i8* %12, i8** %13
  %14 = bitcast i32* %border_rows.addr to i8*
  %15 = getelementptr i8*, i8** %kernel_args, i32 7
  store i8* %14, i8** %15
  %16 = bitcast float* %Cap.addr to i8*
  %17 = getelementptr i8*, i8** %kernel_args, i32 8
  store i8* %16, i8** %17
  %18 = bitcast float* %Rx.addr to i8*
  %19 = getelementptr i8*, i8** %kernel_args, i32 9
  store i8* %18, i8** %19
  %20 = bitcast float* %Ry.addr to i8*
  %21 = getelementptr i8*, i8** %kernel_args, i32 10
  store i8* %20, i8** %21
  %22 = bitcast float* %Rz.addr to i8*
  %23 = getelementptr i8*, i8** %kernel_args, i32 11
  store i8* %22, i8** %23
  %24 = bitcast float* %step.addr to i8*
  %25 = getelementptr i8*, i8** %kernel_args, i32 12
  store i8* %24, i8** %25
  %26 = bitcast float* %time_elapsed.addr to i8*
  %27 = getelementptr i8*, i8** %kernel_args, i32 13
  store i8* %26, i8** %27
  ; Retrieve grid/block dimensions, shared-memory size and stream; the i32
  ; result (%28) is not checked.
  %28 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
  %29 = load i64, i64* %shmem_size, align 8
  %30 = load i8*, i8** %stream, align 8
  ; Copy the 12 bytes of grid_dim into the { i64, i32 } scratch and load both
  ; parts.
  %31 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
  %32 = bitcast %struct.dim3* %grid_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false)
  %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
  %34 = load i64, i64* %33, align 8
  %35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
  %36 = load i32, i32* %35, align 8
  ; Same for block_dim.
  %37 = bitcast { i64, i32 }* %block_dim.coerce to i8*
  %38 = bitcast %struct.dim3* %block_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 12, i1 false)
  %39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
  %40 = load i64, i64* %39, align 8
  %41 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
  %42 = load i32, i32* %41, align 8
  %43 = bitcast i8* %30 to %struct.CUstream_st*
  ; Launch: the first argument is this stub's own address cast to i8*.
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i64 %34, i32 %36, i64 %40, i32 %42, i8** %kernel_args, i64 %29, %struct.CUstream_st* %43)
  br label %setup.end

setup.end:                                        ; preds = %entry
  ret void
}

declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)

declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3

; compute_tran_temp(MatrixPower, MatrixTemp, col, row, total_iterations,
;                   num_iterations, blockCols, blockRows, borderCols,
;                   borderRows)
;
; Derives the per-cell constants (Cap, Rx, Ry, Rz, step) from the globals
; @t_chip, @chip_height and @chip_width and the grid size, then repeatedly
; launches the calculate_temp host stub. Each pass swaps the src/dst indices
; into MatrixTemp, pushes a launch configuration of
; dimGrid = (blockCols, blockRows, 1) and dimBlock = (16, 16, 1), and calls
; cudaDeviceSynchronize. The float counter t starts at 0 and is advanced by
; num_iterations until it is no longer less than total_iterations.
;
; Returns the final value of dst (the MatrixTemp index written by the last
; pass, or 0 if the loop body never ran).
; Function Attrs: noinline optnone uwtable
define dso_local i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %MatrixPower, float** %MatrixTemp, i32 %col, i32 %row, i32 %total_iterations, i32 %num_iterations, i32 %blockCols, i32 %blockRows, i32 %borderCols, i32 %borderRows) #0 {
entry:
  %MatrixPower.addr = alloca float*, align 8
  %MatrixTemp.addr = alloca float**, align 8
  %col.addr = alloca i32, align 4
  %row.addr = alloca i32, align 4
  %total_iterations.addr = alloca i32, align 4
  %num_iterations.addr = alloca i32, align 4
  %blockCols.addr = alloca i32, align 4
  %blockRows.addr = alloca i32, align 4
  %borderCols.addr = alloca i32, align 4
  %borderRows.addr = alloca i32, align 4
  %dimBlock = alloca %struct.dim3, align 4
  %dimGrid = alloca %struct.dim3, align 4
  %grid_height = alloca float, align 4
  %grid_width = alloca float, align 4
  %Cap = alloca float, align 4
  %Rx = alloca float, align 4
  %Ry = alloca float, align 4
  %Rz = alloca float, align 4
  %max_slope = alloca float, align 4
  %step = alloca float, align 4
  %t = alloca float, align 4
  %time_elapsed = alloca float, align 4
  %src = alloca i32, align 4
  %dst = alloca i32, align 4
  %temp = alloca i32, align 4
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp35 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp35.coerce = alloca { i64, i32 }, align 4
  store float* %MatrixPower, float** %MatrixPower.addr, align 8
  store float** %MatrixTemp, float*** %MatrixTemp.addr, align 8
  store i32 %col, i32* %col.addr, align 4
  store i32 %row, i32* %row.addr, align 4
  store i32 %total_iterations, i32* %total_iterations.addr, align 4
  store i32 %num_iterations, i32* %num_iterations.addr, align 4
  store i32 %blockCols, i32* %blockCols.addr, align 4
  store i32 %blockRows, i32* %blockRows.addr, align 4
  store i32 %borderCols, i32* %borderCols.addr, align 4
  store i32 %borderRows, i32* %borderRows.addr, align 4
  ; dimBlock = dim3(16, 16, 1); dimGrid = dim3(blockCols, blockRows, 1)
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1)
  %0 = load i32, i32* %blockCols.addr, align 4
  %1 = load i32, i32* %blockRows.addr, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %0, i32 %1, i32 1)
  ; grid_height = chip_height / row
  %2 = load float, float* @chip_height, align 4
  %3 = load i32, i32* %row.addr, align 4
  %conv = sitofp i32 %3 to float
  %div = fdiv float %2, %conv
  store float %div, float* %grid_height, align 4
  ; grid_width = chip_width / col
  %4 = load float, float* @chip_width, align 4
  %5 = load i32, i32* %col.addr, align 4
  %conv1 = sitofp i32 %5 to float
  %div2 = fdiv float %4, %conv1
  store float %div2, float* %grid_width, align 4
  ; Cap = 8.75e5 * t_chip * grid_width * grid_height (evaluated in double,
  ; truncated to float)
  %6 = load float, float* @t_chip, align 4
  %conv3 = fpext float %6 to double
  %mul = fmul contract double 8.750000e+05, %conv3
  %7 = load float, float* %grid_width, align 4
  %conv4 = fpext float %7 to double
  %mul5 = fmul contract double %mul, %conv4
  %8 = load float, float* %grid_height, align 4
  %conv6 = fpext float %8 to double
  %mul7 = fmul contract double %mul5, %conv6
  %conv8 = fptrunc double %mul7 to float
  store float %conv8, float* %Cap, align 4
  ; Rx = grid_width / (200 * t_chip * grid_height)
  %9 = load float, float* %grid_width, align 4
  %conv9 = fpext float %9 to double
  %10 = load float, float* @t_chip, align 4
  %conv10 = fpext float %10 to double
  %mul11 = fmul contract double 2.000000e+02, %conv10
  %11 = load float, float* %grid_height, align 4
  %conv12 = fpext float %11 to double
  %mul13 = fmul contract double %mul11, %conv12
  %div14 = fdiv double %conv9, %mul13
  %conv15 = fptrunc double %div14 to float
  store float %conv15, float* %Rx, align 4
  ; Ry = grid_height / (200 * t_chip * grid_width)
  %12 = load float, float* %grid_height, align 4
  %conv16 = fpext float %12 to double
  %13 = load float, float* @t_chip, align 4
  %conv17 = fpext float %13 to double
  %mul18 = fmul contract double 2.000000e+02, %conv17
  %14 = load float, float* %grid_width, align 4
  %conv19 = fpext float %14 to double
  %mul20 = fmul contract double %mul18, %conv19
  %div21 = fdiv double %conv16, %mul20
  %conv22 = fptrunc double %div21 to float
  store float %conv22, float* %Ry, align 4
  ; Rz = t_chip / (100 * grid_height * grid_width) (evaluated in float)
  %15 = load float, float* @t_chip, align 4
  %16 = load float, float* %grid_height, align 4
  %mul23 = fmul contract float 1.000000e+02, %16
  %17 = load float, float* %grid_width, align 4
  %mul24 = fmul contract float %mul23, %17
  %div25 = fdiv float %15, %mul24
  store float %div25, float* %Rz, align 4
  ; max_slope = 3.0e6 / (0.5 * t_chip * 1.75e6)
  %18 = load float, float* @t_chip, align 4
  %conv26 = fpext float %18 to double
  %mul27 = fmul contract double 5.000000e-01, %conv26
  %mul28 = fmul contract double %mul27, 1.750000e+06
  %div29 = fdiv double 3.000000e+06, %mul28
  %conv30 = fptrunc double %div29 to float
  store float %conv30, float* %max_slope, align 4
  ; step = 1.0e-3 / max_slope
  %19 = load float, float* %max_slope, align 4
  %conv31 = fpext float %19 to double
  %div32 = fdiv double 1.000000e-03, %conv31
  %conv33 = fptrunc double %div32 to float
  store float %conv33, float* %step, align 4
  ; time_elapsed is a float constant (hex literal, approximately 0.001).
  store float 0x3F50624DE0000000, float* %time_elapsed, align 4
  ; Start with src = 1, dst = 0; the first loop pass swaps them.
  store i32 1, i32* %src, align 4
  store i32 0, i32* %dst, align 4
  store float 0.000000e+00, float* %t, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while t < (float)total_iterations.
  %20 = load float, float* %t, align 4
  %21 = load i32, i32* %total_iterations.addr, align 4
  %conv34 = sitofp i32 %21 to float
  %cmp = fcmp olt float %20, %conv34
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Swap src and dst through temp.
  %22 = load i32, i32* %src, align 4
  store i32 %22, i32* %temp, align 4
  %23 = load i32, i32* %dst, align 4
  store i32 %23, i32* %src, align 4
  %24 = load i32, i32* %temp, align 4
  store i32 %24, i32* %dst, align 4
  ; Copy dimGrid and dimBlock into temporaries, then repack each 12-byte dim3
  ; as an { i64, i32 } pair for the call below.
  %25 = bitcast %struct.dim3* %agg.tmp to i8*
  %26 = bitcast %struct.dim3* %dimGrid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %25, i8* align 4 %26, i64 12, i1 false)
  %27 = bitcast %struct.dim3* %agg.tmp35 to i8*
  %28 = bitcast %struct.dim3* %dimBlock to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %27, i8* align 4 %28, i64 12, i1 false)
  %29 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %30 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %29, i8* align 4 %30, i64 12, i1 false)
  %31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %32 = load i64, i64* %31, align 4
  %33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %34 = load i32, i32* %33, align 4
  %35 = bitcast { i64, i32 }* %agg.tmp35.coerce to i8*
  %36 = bitcast %struct.dim3* %agg.tmp35 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %35, i8* align 4 %36, i64 12, i1 false)
  %37 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 0
  %38 = load i64, i64* %37, align 4
  %39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 1
  %40 = load i32, i32* %39, align 4
  ; Push the launch configuration (shared-memory size 0, null stream). A
  ; non-zero result skips the kernel call for this pass.
  %call = call i32 @__cudaPushCallConfiguration(i64 %32, i32 %34, i64 %38, i32 %40, i64 0, i8* null)
  %tobool = icmp ne i32 %call, 0
  br i1 %tobool, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %for.body
  ; Select the smaller of num_iterations and (total_iterations - t), compared
  ; as floats.
  %41 = load i32, i32* %num_iterations.addr, align 4
  %conv36 = sitofp i32 %41 to float
  %42 = load i32, i32* %total_iterations.addr, align 4
  %conv37 = sitofp i32 %42 to float
  %43 = load float, float* %t, align 4
  %sub = fsub contract float %conv37, %43
  %cmp38 = fcmp ole float %conv36, %sub
  br i1 %cmp38, label %cond.true, label %cond.false

cond.true:                                        ; preds = %kcall.configok
  %44 = load i32, i32* %num_iterations.addr, align 4
  %conv39 = sitofp i32 %44 to float
  br label %cond.end

cond.false:                                       ; preds = %kcall.configok
  %45 = load i32, i32* %total_iterations.addr, align 4
  %conv40 = sitofp i32 %45 to float
  %46 = load float, float* %t, align 4
  %sub41 = fsub contract float %conv40, %46
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  ; Convert the selected iteration count to i32 and gather the remaining
  ; arguments: MatrixPower, MatrixTemp[src], MatrixTemp[dst], col, row,
  ; borderCols, borderRows, Cap, Rx, Ry, Rz, step, time_elapsed.
  %cond = phi float [ %conv39, %cond.true ], [ %sub41, %cond.false ]
  %conv42 = fptosi float %cond to i32
  %47 = load float*, float** %MatrixPower.addr, align 8
  %48 = load float**, float*** %MatrixTemp.addr, align 8
  %49 = load i32, i32* %src, align 4
  %idxprom = sext i32 %49 to i64
  %arrayidx = getelementptr inbounds float*, float** %48, i64 %idxprom
  %50 = load float*, float** %arrayidx, align 8
  %51 = load float**, float*** %MatrixTemp.addr, align 8
  %52 = load i32, i32* %dst, align 4
  %idxprom43 = sext i32 %52 to i64
  %arrayidx44 = getelementptr inbounds float*, float** %51, i64 %idxprom43
  %53 = load float*, float** %arrayidx44, align 8
  %54 = load i32, i32* %col.addr, align 4
  %55 = load i32, i32* %row.addr, align 4
  %56 = load i32, i32* %borderCols.addr, align 4
  %57 = load i32, i32* %borderRows.addr, align 4
  %58 = load float, float* %Cap, align 4
  %59 = load float, float* %Rx, align 4
  %60 = load float, float* %Ry, align 4
  %61 = load float, float* %Rz, align 4
  %62 = load float, float* %step, align 4
  %63 = load float, float* %time_elapsed, align 4
  call void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %conv42, float* %47, float* %50, float* %53, i32 %54, i32 %55, i32 %56, i32 %57, float %58, float %59, float %60, float %61, float %62, float %63)
  br label %kcall.end

kcall.end:                                        ; preds = %cond.end, %for.body
  ; Called on every pass, whether or not the kernel stub ran; the result is
  ; not inspected.
  %call45 = call i32 @cudaDeviceSynchronize()
  br label %for.inc

for.inc:                                          ; preds = %kcall.end
  ; t += (float)num_iterations
  %64 = load i32, i32* %num_iterations.addr, align 4
  %conv46 = sitofp i32 %64 to float
  %65 = load float, float* %t, align 4
  %add = fadd contract float %65, %conv46
  store float %add, float* %t, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %66 = load i32, i32* %dst, align 4
  ret i32 %66
}

; Function Attrs: noinline nounwind optnone uwtable
; Constructor body for %struct.dim3 (three i32 fields). Writes %vx, %vy and
; %vz into fields 0, 1 and 2 of the object %this points to and returns
; nothing. Emitted linkonce_odr in its own comdat, so duplicate copies from
; other modules can be merged by the linker.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #4 comdat align 2 {
entry:
  ; Stack homes for the incoming arguments (unoptimised code shape).
  %self.slot = alloca %struct.dim3*, align 8
  %vx.slot = alloca i32, align 4
  %vy.slot = alloca i32, align 4
  %vz.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %self.slot, align 8
  store i32 %vx, i32* %vx.slot, align 4
  store i32 %vy, i32* %vy.slot, align 4
  store i32 %vz, i32* %vz.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %self.slot, align 8

  ; field 0 <- vx
  %vx.val = load i32, i32* %vx.slot, align 4
  %field.x = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  store i32 %vx.val, i32* %field.x, align 4

  ; field 1 <- vy
  %vy.val = load i32, i32* %vy.slot, align 4
  %field.y = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  store i32 %vy.val, i32* %field.y, align 4

  ; field 2 <- vz
  %vz.val = load i32, i32* %vz.slot, align 4
  %field.z = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  store i32 %vz.val, i32* %field.z, align 4
  ret void
}

; External symbol: declared here, defined outside this module.
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #1

; External symbol; called with no arguments after the kernel-launch step of
; the iteration loop earlier in this module. Its i32 result is not inspected
; there.
declare dso_local i32 @cudaDeviceSynchronize() #1

; Function Attrs: noinline optnone uwtable
; Prints the command-line help to stderr and terminates the process.
;   %argc - stored to a stack slot but otherwise unused.
;   %argv - only element 0 is read; it is passed as the argument of the
;           first format string (@.str.8).
; Seven fprintf calls emit @.str.8 through @.str.14 in order, then exit(1)
; is called; control never returns to the caller.
define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8

  ; Line 1: usage synopsis, parameterised with argv[0].
  %err.synopsis = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %args = load i8**, i8*** %argv.slot, align 8
  %prog.ptr = getelementptr inbounds i8*, i8** %args, i64 0
  %prog = load i8*, i8** %prog.ptr, align 8
  %w.synopsis = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.synopsis, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @.str.8, i64 0, i64 0), i8* %prog)

  ; Lines 2-7: one fixed help string each; stderr is reloaded before every call.
  %err.l2 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l2 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l2, i8* getelementptr inbounds ([78 x i8], [78 x i8]* @.str.9, i64 0, i64 0))
  %err.l3 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l3 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l3, i8* getelementptr inbounds ([53 x i8], [53 x i8]* @.str.10, i64 0, i64 0))
  %err.l4 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l4 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l4, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.11, i64 0, i64 0))
  %err.l5 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l5 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l5, i8* getelementptr inbounds ([89 x i8], [89 x i8]* @.str.12, i64 0, i64 0))
  %err.l6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l6 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l6, i8* getelementptr inbounds ([86 x i8], [86 x i8]* @.str.13, i64 0, i64 0))
  %err.l7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %w.l7 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.l7, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.14, i64 0, i64 0))

  ; Terminate with status 1; nothing after this point executes.
  call void @exit(i32 1) #9
  unreachable
}

; Function Attrs: noreturn nounwind
; External symbol; called with status 1 at the end of @_Z5usageiPPc.
declare dso_local void @exit(i32) #5

; Function Attrs: noinline norecurse optnone uwtable
; Program entry point. Calls cudaSetDevice(0), prints the @.str.15 banner
; with the two integer arguments 16 and 16, forwards argc/argv unchanged to
; @_Z3runiPPc, and always returns 0. The results of cudaSetDevice and printf
; are not inspected.
define dso_local i32 @main(i32 %argc, i8** %argv) #6 {
entry:
  %result.slot = alloca i32, align 4
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 0, i32* %result.slot, align 4
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8

  %dev.status = call i32 @cudaSetDevice(i32 0)
  %banner.len = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.15, i64 0, i64 0), i32 16, i32 16)

  ; Hand the untouched command line to the driver routine.
  %argc.val = load i32, i32* %argc.slot, align 4
  %argv.val = load i8**, i8*** %argv.slot, align 8
  call void @_Z3runiPPc(i32 %argc.val, i8** %argv.val)
  ret i32 0
}

; External symbol; @main calls it once with argument 0 and ignores the result.
declare dso_local i32 @cudaSetDevice(i32) #1

; Function Attrs: noinline optnone uwtable
; Driver routine: parses the command line, allocates host and device buffers,
; loads the two input files, runs the simulation via
; @_Z17compute_tran_tempPfPS_iiiiiiii, copies the result back, writes it out
; and releases every buffer it allocated.
;
;   %argc - must equal 7, otherwise @_Z5usageiPPc is called (which exits).
;   %argv - argv[1] is parsed twice (grid_rows and grid_cols), argv[2] is
;           pyramid_height, argv[3] is total_iterations; each must parse to a
;           value > 0 or usage is called. argv[4], argv[5], argv[6] are kept
;           as the temperature-input, power-input and output file names.
;
; Review fix: the two malloc'ed host staging buffers (FilesavingTemp and
; FilesavingPower) were never released; matching free() calls are now issued
; just before returning, next to the existing free of MatrixOut.
;
; NOTE(review): the i32 results of cudaMalloc / cudaMemcpy / cudaFree are
; discarded throughout; failures on those calls go unnoticed here.
define dso_local void @_Z3runiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %size = alloca i32, align 4
  %grid_rows = alloca i32, align 4
  %grid_cols = alloca i32, align 4
  %FilesavingTemp = alloca float*, align 8
  %FilesavingPower = alloca float*, align 8
  %MatrixOut = alloca float*, align 8
  %tfile = alloca i8*, align 8
  %pfile = alloca i8*, align 8
  %ofile = alloca i8*, align 8
  %total_iterations = alloca i32, align 4
  %pyramid_height = alloca i32, align 4
  %borderCols = alloca i32, align 4
  %borderRows = alloca i32, align 4
  %smallBlockCol = alloca i32, align 4
  %smallBlockRow = alloca i32, align 4
  %blockCols = alloca i32, align 4
  %blockRows = alloca i32, align 4
  %MatrixTemp = alloca [2 x float*], align 16
  %MatrixPower = alloca float*, align 8
  %ret = alloca i32, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  ; Initial values; both are overwritten from argv below.
  store i32 60, i32* %total_iterations, align 4
  store i32 1, i32* %pyramid_height, align 4
  ; Exactly seven argv entries are required.
  %0 = load i32, i32* %argc.addr, align 4
  %cmp = icmp ne i32 %0, 7
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, i32* %argc.addr, align 4
  %2 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %1, i8** %2)
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ; grid_rows = atoi(argv[1]); must be > 0.
  %3 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %3, i64 1
  %4 = load i8*, i8** %arrayidx, align 8
  %call = call i32 @atoi(i8* %4) #10
  store i32 %call, i32* %grid_rows, align 4
  %cmp1 = icmp sle i32 %call, 0
  br i1 %cmp1, label %if.then13, label %lor.lhs.false

lor.lhs.false:                                    ; preds = %if.end
  ; grid_cols = atoi(argv[1]) as well (same argument as grid_rows); must be > 0.
  %5 = load i8**, i8*** %argv.addr, align 8
  %arrayidx2 = getelementptr inbounds i8*, i8** %5, i64 1
  %6 = load i8*, i8** %arrayidx2, align 8
  %call3 = call i32 @atoi(i8* %6) #10
  store i32 %call3, i32* %grid_cols, align 4
  %cmp4 = icmp sle i32 %call3, 0
  br i1 %cmp4, label %if.then13, label %lor.lhs.false5

lor.lhs.false5:                                   ; preds = %lor.lhs.false
  ; pyramid_height = atoi(argv[2]); must be > 0.
  %7 = load i8**, i8*** %argv.addr, align 8
  %arrayidx6 = getelementptr inbounds i8*, i8** %7, i64 2
  %8 = load i8*, i8** %arrayidx6, align 8
  %call7 = call i32 @atoi(i8* %8) #10
  store i32 %call7, i32* %pyramid_height, align 4
  %cmp8 = icmp sle i32 %call7, 0
  br i1 %cmp8, label %if.then13, label %lor.lhs.false9

lor.lhs.false9:                                   ; preds = %lor.lhs.false5
  ; total_iterations = atoi(argv[3]); must be > 0.
  %9 = load i8**, i8*** %argv.addr, align 8
  %arrayidx10 = getelementptr inbounds i8*, i8** %9, i64 3
  %10 = load i8*, i8** %arrayidx10, align 8
  %call11 = call i32 @atoi(i8* %10) #10
  store i32 %call11, i32* %total_iterations, align 4
  %cmp12 = icmp sle i32 %call11, 0
  br i1 %cmp12, label %if.then13, label %if.end14

if.then13:                                        ; preds = %lor.lhs.false9, %lor.lhs.false5, %lor.lhs.false, %if.end
  ; Any non-positive numeric argument ends up here.
  %11 = load i32, i32* %argc.addr, align 4
  %12 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %11, i8** %12)
  br label %if.end14

if.end14:                                         ; preds = %if.then13, %lor.lhs.false9
  ; File names: tfile = argv[4], pfile = argv[5], ofile = argv[6].
  %13 = load i8**, i8*** %argv.addr, align 8
  %arrayidx15 = getelementptr inbounds i8*, i8** %13, i64 4
  %14 = load i8*, i8** %arrayidx15, align 8
  store i8* %14, i8** %tfile, align 8
  %15 = load i8**, i8*** %argv.addr, align 8
  %arrayidx16 = getelementptr inbounds i8*, i8** %15, i64 5
  %16 = load i8*, i8** %arrayidx16, align 8
  store i8* %16, i8** %pfile, align 8
  %17 = load i8**, i8*** %argv.addr, align 8
  %arrayidx17 = getelementptr inbounds i8*, i8** %17, i64 6
  %18 = load i8*, i8** %arrayidx17, align 8
  store i8* %18, i8** %ofile, align 8
  ; size = grid_rows * grid_cols (element count of every buffer below).
  %19 = load i32, i32* %grid_rows, align 4
  %20 = load i32, i32* %grid_cols, align 4
  %mul = mul nsw i32 %19, %20
  store i32 %mul, i32* %size, align 4
  ; borderCols = borderRows = (pyramid_height * 2) / 2.
  %21 = load i32, i32* %pyramid_height, align 4
  %mul18 = mul nsw i32 %21, 2
  %div = sdiv i32 %mul18, 2
  store i32 %div, i32* %borderCols, align 4
  %22 = load i32, i32* %pyramid_height, align 4
  %mul19 = mul nsw i32 %22, 2
  %div20 = sdiv i32 %mul19, 2
  store i32 %div20, i32* %borderRows, align 4
  ; smallBlockCol = smallBlockRow = 16 - pyramid_height * 2.
  ; NOTE(review): pyramid_height is only checked to be > 0, so a value of 8
  ; makes these zero (division by zero just below) and larger values make
  ; them negative.
  %23 = load i32, i32* %pyramid_height, align 4
  %mul21 = mul nsw i32 %23, 2
  %sub = sub nsw i32 16, %mul21
  store i32 %sub, i32* %smallBlockCol, align 4
  %24 = load i32, i32* %pyramid_height, align 4
  %mul22 = mul nsw i32 %24, 2
  %sub23 = sub nsw i32 16, %mul22
  store i32 %sub23, i32* %smallBlockRow, align 4
  ; blockCols = grid_cols / smallBlockCol, plus 1 when there is a remainder.
  %25 = load i32, i32* %grid_cols, align 4
  %26 = load i32, i32* %smallBlockCol, align 4
  %div24 = sdiv i32 %25, %26
  %27 = load i32, i32* %grid_cols, align 4
  %28 = load i32, i32* %smallBlockCol, align 4
  %rem = srem i32 %27, %28
  %cmp25 = icmp eq i32 %rem, 0
  %29 = zext i1 %cmp25 to i64
  %cond = select i1 %cmp25, i32 0, i32 1
  %add = add nsw i32 %div24, %cond
  store i32 %add, i32* %blockCols, align 4
  ; blockRows = grid_rows / smallBlockRow, plus 1 when there is a remainder.
  %30 = load i32, i32* %grid_rows, align 4
  %31 = load i32, i32* %smallBlockRow, align 4
  %div26 = sdiv i32 %30, %31
  %32 = load i32, i32* %grid_rows, align 4
  %33 = load i32, i32* %smallBlockRow, align 4
  %rem27 = srem i32 %32, %33
  %cmp28 = icmp eq i32 %rem27, 0
  %34 = zext i1 %cmp28 to i64
  %cond29 = select i1 %cmp28, i32 0, i32 1
  %add30 = add nsw i32 %div26, %cond29
  store i32 %add30, i32* %blockRows, align 4
  ; Host buffers: two malloc(size * 4) staging areas and one calloc(size, 4)
  ; output area.
  %35 = load i32, i32* %size, align 4
  %conv = sext i32 %35 to i64
  %mul31 = mul i64 %conv, 4
  %call32 = call noalias i8* @malloc(i64 %mul31) #8
  %36 = bitcast i8* %call32 to float*
  store float* %36, float** %FilesavingTemp, align 8
  %37 = load i32, i32* %size, align 4
  %conv33 = sext i32 %37 to i64
  %mul34 = mul i64 %conv33, 4
  %call35 = call noalias i8* @malloc(i64 %mul34) #8
  %38 = bitcast i8* %call35 to float*
  store float* %38, float** %FilesavingPower, align 8
  %39 = load i32, i32* %size, align 4
  %conv36 = sext i32 %39 to i64
  %call37 = call noalias i8* @calloc(i64 %conv36, i64 4) #8
  %40 = bitcast i8* %call37 to float*
  store float* %40, float** %MatrixOut, align 8
  ; Report via @_Z5fatalPc if any of the three host allocations is null.
  %41 = load float*, float** %FilesavingPower, align 8
  %tobool = icmp ne float* %41, null
  br i1 %tobool, label %lor.lhs.false38, label %if.then42

lor.lhs.false38:                                  ; preds = %if.end14
  %42 = load float*, float** %FilesavingTemp, align 8
  %tobool39 = icmp ne float* %42, null
  br i1 %tobool39, label %lor.lhs.false40, label %if.then42

lor.lhs.false40:                                  ; preds = %lor.lhs.false38
  %43 = load float*, float** %MatrixOut, align 8
  %tobool41 = icmp ne float* %43, null
  br i1 %tobool41, label %if.end43, label %if.then42

if.then42:                                        ; preds = %lor.lhs.false40, %lor.lhs.false38, %if.end14
  call void @_Z5fatalPc(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.16, i64 0, i64 0))
  br label %if.end43

if.end43:                                         ; preds = %if.then42, %lor.lhs.false40
  ; Print the derived configuration (nine integers, format @.str.17).
  %44 = load i32, i32* %pyramid_height, align 4
  %45 = load i32, i32* %grid_cols, align 4
  %46 = load i32, i32* %grid_rows, align 4
  %47 = load i32, i32* %borderCols, align 4
  %48 = load i32, i32* %borderRows, align 4
  %49 = load i32, i32* %blockCols, align 4
  %50 = load i32, i32* %blockRows, align 4
  %51 = load i32, i32* %smallBlockCol, align 4
  %52 = load i32, i32* %smallBlockRow, align 4
  %call44 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([94 x i8], [94 x i8]* @.str.17, i64 0, i64 0), i32 %44, i32 %45, i32 %46, i32 %47, i32 %48, i32 %49, i32 %50, i32 %51, i32 %52)
  ; Fill the two host staging buffers from tfile and pfile.
  %53 = load float*, float** %FilesavingTemp, align 8
  %54 = load i32, i32* %grid_rows, align 4
  %55 = load i32, i32* %grid_cols, align 4
  %56 = load i8*, i8** %tfile, align 8
  call void @_Z9readinputPfiiPc(float* %53, i32 %54, i32 %55, i8* %56)
  %57 = load float*, float** %FilesavingPower, align 8
  %58 = load i32, i32* %grid_rows, align 4
  %59 = load i32, i32* %grid_cols, align 4
  %60 = load i8*, i8** %pfile, align 8
  call void @_Z9readinputPfiiPc(float* %57, i32 %58, i32 %59, i8* %60)
  ; cudaMalloc MatrixTemp[0] and MatrixTemp[1], 4 * size bytes each.
  %arrayidx45 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %61 = bitcast float** %arrayidx45 to i8**
  %62 = load i32, i32* %size, align 4
  %conv46 = sext i32 %62 to i64
  %mul47 = mul i64 4, %conv46
  %call48 = call i32 @cudaMalloc(i8** %61, i64 %mul47)
  %arrayidx49 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1
  %63 = bitcast float** %arrayidx49 to i8**
  %64 = load i32, i32* %size, align 4
  %conv50 = sext i32 %64 to i64
  %mul51 = mul i64 4, %conv50
  %call52 = call i32 @cudaMalloc(i8** %63, i64 %mul51)
  ; Copy FilesavingTemp into MatrixTemp[0]; copy kind 1 (host-to-device in
  ; the CUDA runtime's cudaMemcpyKind enum - confirm against the CUDA headers).
  %arrayidx53 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %65 = load float*, float** %arrayidx53, align 16
  %66 = bitcast float* %65 to i8*
  %67 = load float*, float** %FilesavingTemp, align 8
  %68 = bitcast float* %67 to i8*
  %69 = load i32, i32* %size, align 4
  %conv54 = sext i32 %69 to i64
  %mul55 = mul i64 4, %conv54
  %call56 = call i32 @cudaMemcpy(i8* %66, i8* %68, i64 %mul55, i32 1)
  ; cudaMalloc MatrixPower and copy FilesavingPower into it (same copy kind).
  %70 = bitcast float** %MatrixPower to i8**
  %71 = load i32, i32* %size, align 4
  %conv57 = sext i32 %71 to i64
  %mul58 = mul i64 4, %conv57
  %call59 = call i32 @cudaMalloc(i8** %70, i64 %mul58)
  %72 = load float*, float** %MatrixPower, align 8
  %73 = bitcast float* %72 to i8*
  %74 = load float*, float** %FilesavingPower, align 8
  %75 = bitcast float* %74 to i8*
  %76 = load i32, i32* %size, align 4
  %conv60 = sext i32 %76 to i64
  %mul61 = mul i64 4, %conv60
  %call62 = call i32 @cudaMemcpy(i8* %73, i8* %75, i64 %mul61, i32 1)
  %call63 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.18, i64 0, i64 0))
  ; Run the simulation; the returned i32 is kept in %ret and used below as
  ; the index into MatrixTemp that holds the final result.
  %77 = load float*, float** %MatrixPower, align 8
  %arraydecay = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %78 = load i32, i32* %grid_cols, align 4
  %79 = load i32, i32* %grid_rows, align 4
  %80 = load i32, i32* %total_iterations, align 4
  %81 = load i32, i32* %pyramid_height, align 4
  %82 = load i32, i32* %blockCols, align 4
  %83 = load i32, i32* %blockRows, align 4
  %84 = load i32, i32* %borderCols, align 4
  %85 = load i32, i32* %borderRows, align 4
  %call64 = call i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %77, float** %arraydecay, i32 %78, i32 %79, i32 %80, i32 %81, i32 %82, i32 %83, i32 %84, i32 %85)
  store i32 %call64, i32* %ret, align 4
  %call65 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.19, i64 0, i64 0))
  ; Copy MatrixTemp[ret] into MatrixOut; copy kind 2 (device-to-host in the
  ; CUDA runtime's cudaMemcpyKind enum - confirm against the CUDA headers).
  %86 = load float*, float** %MatrixOut, align 8
  %87 = bitcast float* %86 to i8*
  %88 = load i32, i32* %ret, align 4
  %idxprom = sext i32 %88 to i64
  %arrayidx66 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 %idxprom
  %89 = load float*, float** %arrayidx66, align 8
  %90 = bitcast float* %89 to i8*
  %91 = load i32, i32* %size, align 4
  %conv67 = sext i32 %91 to i64
  %mul68 = mul i64 4, %conv67
  %call69 = call i32 @cudaMemcpy(i8* %87, i8* %90, i64 %mul68, i32 2)
  ; Write the result grid to ofile.
  %92 = load float*, float** %MatrixOut, align 8
  %93 = load i32, i32* %grid_rows, align 4
  %94 = load i32, i32* %grid_cols, align 4
  %95 = load i8*, i8** %ofile, align 8
  call void @_Z11writeoutputPfiiPc(float* %92, i32 %93, i32 %94, i8* %95)
  ; Release the three cudaMalloc'ed buffers.
  %96 = load float*, float** %MatrixPower, align 8
  %97 = bitcast float* %96 to i8*
  %call70 = call i32 @cudaFree(i8* %97)
  %arrayidx71 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %98 = load float*, float** %arrayidx71, align 16
  %99 = bitcast float* %98 to i8*
  %call72 = call i32 @cudaFree(i8* %99)
  %arrayidx73 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1
  %100 = load float*, float** %arrayidx73, align 8
  %101 = bitcast float* %100 to i8*
  %call74 = call i32 @cudaFree(i8* %101)
  ; Release all three host buffers. free(NULL) is a no-op, so this is safe
  ; even on the path where an allocation failed and @_Z5fatalPc was called.
  %102 = load float*, float** %MatrixOut, align 8
  %103 = bitcast float* %102 to i8*
  call void @free(i8* %103) #8
  %104 = load float*, float** %FilesavingTemp, align 8
  %105 = bitcast float* %104 to i8*
  call void @free(i8* %105) #8
  %106 = load float*, float** %FilesavingPower, align 8
  %107 = bitcast float* %106 to i8*
  call void @free(i8* %107) #8
  ret void
}

; External C library and CUDA runtime symbols used by @_Z3runiPPc. All are
; declared here and defined outside this module.

; Function Attrs: nounwind readonly
; Used to parse argv[1], argv[2] and argv[3].
declare dso_local i32 @atoi(i8*) #7

; Function Attrs: nounwind
; Called with a byte count (element count * 4) for the host staging buffers.
declare dso_local noalias i8* @malloc(i64) #2

; Function Attrs: nounwind
; Called as calloc(element count, 4) for the host output buffer.
declare dso_local noalias i8* @calloc(i64, i64) #2

; Called with the address of a pointer slot and a byte count.
declare dso_local i32 @cudaMalloc(i8**, i64) #1

; Called as (destination, source, byte count, i32 copy kind); this module
; passes kinds 1 and 2.
declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1

; Called on each pointer previously filled in through @cudaMalloc.
declare dso_local i32 @cudaFree(i8*) #1

; Function Attrs: nounwind
declare dso_local void @free(i8*) #2

; Registers this module's single kernel host stub. Calls
; @__cudaRegisterFunction with the handle passed in %0, the address of
; @_Z14calculate_tempiPfS_S_iiiiffffff cast to i8*, the 36-byte string @0
; (passed for both name arguments), -1, and null for every remaining
; argument. The i32 result is ignored. Internal linkage; invoked only from
; @__cuda_module_ctor.
define internal void @__cuda_register_globals(i8** %0) {
entry:
  %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

; External registration entry points; declared here, defined outside this
; module.

; Called once from @__cuda_register_globals.
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

; Declared only; no call to it appears in the registration code of this module.
declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

; Called once from @__cuda_module_ctor; returns the handle stored in
; @__cuda_gpubin_handle.
declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; Module start-up hook for the embedded device binary. In order, it:
;   1. passes @__cuda_fatbin_wrapper (cast to i8*) to @__cudaRegisterFatBinary,
;   2. stores the returned handle in the global @__cuda_gpubin_handle,
;   3. registers the kernel stub through @__cuda_register_globals,
;   4. calls @__cudaRegisterFatBinaryEnd on the handle,
;   5. schedules @__cuda_module_dtor with atexit (result ignored).
; The i8* parameter %0 is not used.
define internal void @__cuda_module_ctor(i8* %0) {
entry:
  %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %1, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %1)
  call void @__cudaRegisterFatBinaryEnd(i8** %1)
  %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

; External symbol; called from @__cuda_module_ctor after the kernel stub has
; been registered.
declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

; External symbol; called from @__cuda_module_dtor with the stored handle.
declare dso_local void @__cudaUnregisterFatBinary(i8**)

; Module tear-down hook, scheduled via atexit by @__cuda_module_ctor. Loads
; the handle saved in @__cuda_gpubin_handle and passes it to
; @__cudaUnregisterFatBinary. The i8* parameter %0 is not used.
define internal void @__cuda_module_dtor(i8* %0) {
entry:
  %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %1)
  ret void
}

; External symbol; declared here as taking a void (i8*)* callback, which is
; the type of @__cuda_module_dtor that @__cuda_module_ctor passes to it.
declare dso_local i32 @atexit(void (i8*)*)

attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { argmemonly nounwind willreturn }
attributes #4 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { nounwind }
attributes #9 = { noreturn nounwind }
attributes #10 = { nounwind readonly }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}