CuPBoP/examples/hotspot/hotspot-host-x86_64-unknown...

1023 lines
140 KiB
LLVM
Raw Normal View History

2022-05-04 20:59:38 +08:00
; ModuleID = 'hotspot-host-x86_64-unknown-linux-gnu.bc'
source_filename = "hotspot.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Field layout used by this module for the C library stream object; values of
; this type are only ever handled through pointers returned by @fopen or loaded
; from @stderr.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Three 32-bit extents; the constructor @_ZN4dim3C2Ejjj writes them at field
; indices 0, 1 and 2 (x, y, z).
%struct.dim3 = type { i32, i32, i32 }
; Opaque stream type: only pointers to it are passed around in this module.
%struct.CUstream_st = type opaque
$_ZN4dim3C2Ejjj = comdat any
; Chip parameters read by @_Z17compute_tran_tempPfPS_iiiiiiii. The hexadecimal
; float literals decode to approximately 5.0e-4 (t_chip) and 1.6e-2
; (chip_height, chip_width).
; NOTE(review): units are not stated anywhere in this module - confirm against hotspot.cu.
@t_chip = dso_local global float 0x3F40624DE0000000, align 4
@chip_height = dso_local global float 0x3F90624DE0000000, align 4
@chip_width = dso_local global float 0x3F90624DE0000000, align 4
; 80.0; not referenced by any function visible in this part of the file.
@amb_temp = dso_local global float 8.000000e+01, align 4
@stderr = external dso_local global %struct._IO_FILE*, align 8
@.str = private unnamed_addr constant [11 x i8] c"error: %s\0A\00", align 1
@.str.1 = private unnamed_addr constant [2 x i8] c"w\00", align 1
@.str.2 = private unnamed_addr constant [25 x i8] c"The file was not opened\0A\00", align 1
@.str.3 = private unnamed_addr constant [7 x i8] c"%d\09%g\0A\00", align 1
@.str.4 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str.5 = private unnamed_addr constant [25 x i8] c"not enough lines in file\00", align 1
@.str.6 = private unnamed_addr constant [3 x i8] c"%f\00", align 1
@.str.7 = private unnamed_addr constant [20 x i8] c"invalid file format\00", align 1
@.str.8 = private unnamed_addr constant [100 x i8] c"Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> <temp_file> <power_file> <output_file>\0A\00", align 1
@.str.9 = private unnamed_addr constant [78 x i8] c"\09<grid_rows/grid_cols> - number of rows/cols in the grid (positive integer)\0A\00", align 1
@.str.10 = private unnamed_addr constant [53 x i8] c"\09<pyramid_height> - pyramid heigh(positive integer)\0A\00", align 1
@.str.11 = private unnamed_addr constant [38 x i8] c"\09<sim_time> - number of iterations\0A\00", align 1
@.str.12 = private unnamed_addr constant [89 x i8] c"\09<temp_file> - name of the file containing the initial temperature values of each cell\0A\00", align 1
@.str.13 = private unnamed_addr constant [86 x i8] c"\09<power_file> - name of the file containing the dissipated power values of each cell\0A\00", align 1
@.str.14 = private unnamed_addr constant [42 x i8] c"\09<output_file> - name of the output file\0A\00", align 1
@.str.15 = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1
@.str.16 = private unnamed_addr constant [26 x i8] c"unable to allocate memory\00", align 1
@.str.17 = private unnamed_addr constant [94 x i8] c"pyramidHeight: %d\0AgridSize: [%d, %d]\0Aborder:[%d, %d]\0AblockGrid:[%d, %d]\0AtargetBlock:[%d, %d]\0A\00", align 1
@.str.18 = private unnamed_addr constant [43 x i8] c"Start computing the transient temperature\0A\00", align 1
@.str.19 = private unnamed_addr constant [19 x i8] c"Ending simulation\0A\00", align 1
@0 = private unnamed_addr constant [36 x i8] c"_Z14calculate_tempiPfS_S_iiiiffffff\00", align 1
@1 = private constant [35409 x i8] c"P\EDU\BA\01\00\10\00@\8A\00\00\00\00\00\00\02\00\01\01@\00\00\00\A8v\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\00v\00\00\00\00\00\00\80s\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z14calculate_tempiPfS_S_iiiiffffff\00.text._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.info._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.shared._Z14calculate_tempiPfS_S_iiiiffffff\00.nv.global\00blockIdx\00threadIdx\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm20_rcp_rn_f32_slowpath\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32\00$_Z14calculate_tempiPfS_S_iiiiffffff$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda__196\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda__198\00$___ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t__200\00.nv.constant0._Z14calculate_tempiPfS_S_iiiiffffff\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00V\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AD\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E7\00\00\00\01\00\09\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F0\00\00\00\01\00\09\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FA\00\00\00\22\00\07\00`]\00\00\00\00\00\00 
\04\00\00\00\00\00\00?\01\00\00\22\00\07\00\80a\00\00\00\00\00\00`\01\00\00\00\00\00\00\81\01\00\00\22\00\07\00\E0b\00\00\00\00\00\00`\08\00\00\00\00\00\00z\02\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@k\00\00\00\00\00\00\04/\08\00\0A\00\00\00\17\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00\00\00\00\00\04\11\08\00\07\00\00\00\00\00\00\00\04#\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\C0\00\00\00\04\11\08\00\0A\00\00\00\C0\00\00\00\010\00\00\01*\00\00\04\0A\08\00\09\00\00\00@\01H\00\03\19H\00\04\17\0C\00\00\00\00\00\0D\00D\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0C\00@\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0B\00<\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0A\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\09\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\08\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\00,\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\00(\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0\11\00\03\1B\FF\00\04\1D\08\00h\09\00\00\D8\09\00\00\04\1C\04\00X]\00\00\04\1E\04\00\B0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
; Descriptor placed in section ".nvFatBinSegment": { i32 1180844977 (0x466243B1),
; i32 1, pointer to the first byte of the embedded image @1, null }.
; NOTE(review): the first two fields are presumably the fat-binary magic and
; version expected by the CUDA runtime - confirm against the runtime headers.
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([35409 x i8], [35409 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
; Starts out null; the code that assigns it is not visible in this part of the file.
@__cuda_gpubin_handle = internal global i8** null, align 8
; Registers @__cuda_module_ctor (defined outside this view) as a module
; constructor with priority 65535.
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5fatalPc(i8* %s) #0 {
; fatal(char *s): writes "error: <s>\n" to stderr and returns to the caller.
; It does not terminate the process.
entry:
  ; Keep the message pointer in a stack slot and read it back before use.
  %msg.slot = alloca i8*, align 8
  store i8* %s, i8** %msg.slot, align 8
  %err.stream = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %msg = load i8*, i8** %msg.slot, align 8
  ; fprintf(stderr, "error: %s\n", s); the character count is discarded.
  %written = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.stream, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i8* %msg)
  ret void
}
declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11writeoutputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 {
; writeoutput(float *vect, int grid_rows, int grid_cols, char *file)
;
; Opens `file` for writing and emits one line per grid cell in row-major order,
; formatted as "<running index>\t<value>\n", then closes the file.
;
; If the file cannot be opened, "The file was not opened\n" is printed and the
; function returns immediately. Previously control fell through to the write
; loop and handed the null stream to fputs/fclose.
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %index = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  ; Scratch buffer for one formatted output line.
  %str = alloca [256 x i8], align 16
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  ; index = 0: running cell counter printed in the first column.
  store i32 0, i32* %index, align 4
  %0 = load i8*, i8** %file.addr, align 8
  ; fp = fopen(file, "w")
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0))
  ; Nothing can be written without a stream, and no stream exists to close.
  ret void

if.end:                                           ; preds = %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

; Outer loop: i in [0, grid_rows).
for.cond:                                         ; preds = %for.inc10, %if.end
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %cmp2 = icmp slt i32 %1, %2
  br i1 %cmp2, label %for.body, label %for.end12

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond3

; Inner loop: j in [0, grid_cols).
for.cond3:                                        ; preds = %for.inc, %for.body
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %cmp4 = icmp slt i32 %3, %4
  br i1 %cmp4, label %for.body5, label %for.end

for.body5:                                        ; preds = %for.cond3
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %5 = load i32, i32* %index, align 4
  %6 = load float*, float** %vect.addr, align 8
  ; Element address: vect[i * grid_cols + j].
  %7 = load i32, i32* %i, align 4
  %8 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %7, %8
  %9 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %9
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds float, float* %6, i64 %idxprom
  %10 = load float, float* %arrayidx, align 4
  ; The float is widened to double because it is passed through varargs.
  %conv = fpext float %10 to double
  ; sprintf(str, "%d\t%g\n", index, value)
  %call6 = call i32 (i8*, i8*, ...) @sprintf(i8* %arraydecay, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.3, i64 0, i64 0), i32 %5, double %conv) #8
  %arraydecay7 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %11 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call8 = call i32 @fputs(i8* %arraydecay7, %struct._IO_FILE* %11)
  ; index++
  %12 = load i32, i32* %index, align 4
  %inc = add nsw i32 %12, 1
  store i32 %inc, i32* %index, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body5
  %13 = load i32, i32* %j, align 4
  %inc9 = add nsw i32 %13, 1
  store i32 %inc9, i32* %j, align 4
  br label %for.cond3

for.end:                                          ; preds = %for.cond3
  br label %for.inc10

for.inc10:                                        ; preds = %for.end
  %14 = load i32, i32* %i, align 4
  %inc11 = add nsw i32 %14, 1
  store i32 %inc11, i32* %i, align 4
  br label %for.cond

for.end12:                                        ; preds = %for.cond
  %15 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call13 = call i32 @fclose(%struct._IO_FILE* %15)
  ret void
}
declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #1
declare dso_local i32 @printf(i8*, ...) #1
; Function Attrs: nounwind
declare dso_local i32 @sprintf(i8*, i8*, ...) #2
declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #1
declare dso_local i32 @fclose(%struct._IO_FILE*) #1
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z9readinputPfiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i8* %file) #0 {
; readinput(float *vect, int grid_rows, int grid_cols, char *file)
;
; Opens `file` for reading and fills vect[i * grid_cols + j] with one float per
; input line, for i in [0, grid_rows) and j in [0, grid_cols), then closes the
; file.
;
; If the file cannot be opened, "The file was not opened\n" is printed and the
; process exits with status 1. Previously control fell through and the null
; stream was handed to fgets/feof/fclose. Exiting, rather than returning, avoids
; giving the caller a grid that was never filled.
;
; Short files and unparsable lines are reported through @_Z5fatalPc, which only
; prints and returns, so the loop keeps going after those diagnostics (this
; behavior is unchanged).
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  ; Buffer for one input line.
  %str = alloca [256 x i8], align 16
  ; Destination for the value parsed by sscanf.
  %val = alloca float, align 4
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  %0 = load i8*, i8** %file.addr, align 8
  ; fp = fopen(file, "r")
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.2, i64 0, i64 0))
  ; No input can be read without a stream: stop here instead of using it.
  call void @exit(i32 1)
  unreachable

if.end:                                           ; preds = %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

; Outer loop: i <= grid_rows - 1.
for.cond:                                         ; preds = %for.inc16, %if.end
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %sub = sub nsw i32 %2, 1
  %cmp2 = icmp sle i32 %1, %sub
  br i1 %cmp2, label %for.body, label %for.end18

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond3

; Inner loop: j <= grid_cols - 1.
for.cond3:                                        ; preds = %for.inc, %for.body
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %sub4 = sub nsw i32 %4, 1
  %cmp5 = icmp sle i32 %3, %sub4
  br i1 %cmp5, label %for.body6, label %for.end

for.body6:                                        ; preds = %for.cond3
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %5 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  ; fgets(str, 256, fp); the returned pointer is not inspected, end-of-file is
  ; detected through feof below.
  %call7 = call i8* @fgets(i8* %arraydecay, i32 256, %struct._IO_FILE* %5)
  %6 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call8 = call i32 @feof(%struct._IO_FILE* %6) #8
  %tobool = icmp ne i32 %call8, 0
  br i1 %tobool, label %if.then9, label %if.end10

if.then9:                                         ; preds = %for.body6
  ; fatal("not enough lines in file")
  call void @_Z5fatalPc(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.5, i64 0, i64 0))
  br label %if.end10

if.end10:                                         ; preds = %if.then9, %for.body6
  %arraydecay11 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  ; sscanf(str, "%f", &val) must convert exactly one item.
  %call12 = call i32 (i8*, i8*, ...) @sscanf(i8* %arraydecay11, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), float* %val) #8
  %cmp13 = icmp ne i32 %call12, 1
  br i1 %cmp13, label %if.then14, label %if.end15

if.then14:                                        ; preds = %if.end10
  ; fatal("invalid file format")
  call void @_Z5fatalPc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.7, i64 0, i64 0))
  br label %if.end15

if.end15:                                         ; preds = %if.then14, %if.end10
  ; vect[i * grid_cols + j] = val
  %7 = load float, float* %val, align 4
  %8 = load float*, float** %vect.addr, align 8
  %9 = load i32, i32* %i, align 4
  %10 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %9, %10
  %11 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %11
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds float, float* %8, i64 %idxprom
  store float %7, float* %arrayidx, align 4
  br label %for.inc

for.inc:                                          ; preds = %if.end15
  %12 = load i32, i32* %j, align 4
  %inc = add nsw i32 %12, 1
  store i32 %inc, i32* %j, align 4
  br label %for.cond3

for.end:                                          ; preds = %for.cond3
  br label %for.inc16

for.inc16:                                        ; preds = %for.end
  %13 = load i32, i32* %i, align 4
  %inc17 = add nsw i32 %13, 1
  store i32 %inc17, i32* %i, align 4
  br label %for.cond

for.end18:                                        ; preds = %for.cond
  %14 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call19 = call i32 @fclose(%struct._IO_FILE* %14)
  ret void
}
declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #1
; Function Attrs: nounwind
declare dso_local i32 @feof(%struct._IO_FILE*) #2
; Function Attrs: nounwind
declare dso_local i32 @sscanf(i8*, i8*, ...) #2
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the calculate_temp kernel. It does no computation
; itself: it spills its 14 arguments to stack slots, builds an array of pointers
; to those slots, pops the launch configuration previously pushed by the caller,
; and forwards everything to @cudaLaunchKernel.
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
; One stack slot per kernel argument.
%iteration.addr = alloca i32, align 4
%power.addr = alloca float*, align 8
%temp_src.addr = alloca float*, align 8
%temp_dst.addr = alloca float*, align 8
%grid_cols.addr = alloca i32, align 4
%grid_rows.addr = alloca i32, align 4
%border_cols.addr = alloca i32, align 4
%border_rows.addr = alloca i32, align 4
%Cap.addr = alloca float, align 4
%Rx.addr = alloca float, align 4
%Ry.addr = alloca float, align 4
%Rz.addr = alloca float, align 4
%step.addr = alloca float, align 4
%time_elapsed.addr = alloca float, align 4
; Outputs of @__cudaPopCallConfiguration.
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
; Scratch copies used to pass each 12-byte dim3 as an { i64, i32 } pair.
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store i32 %iteration, i32* %iteration.addr, align 4
store float* %power, float** %power.addr, align 8
store float* %temp_src, float** %temp_src.addr, align 8
store float* %temp_dst, float** %temp_dst.addr, align 8
store i32 %grid_cols, i32* %grid_cols.addr, align 4
store i32 %grid_rows, i32* %grid_rows.addr, align 4
store i32 %border_cols, i32* %border_cols.addr, align 4
store i32 %border_rows, i32* %border_rows.addr, align 4
store float %Cap, float* %Cap.addr, align 4
store float %Rx, float* %Rx.addr, align 4
store float %Ry, float* %Ry.addr, align 4
store float %Rz, float* %Rz.addr, align 4
store float %step, float* %step.addr, align 4
store float %time_elapsed, float* %time_elapsed.addr, align 4
; kernel_args[k] = address of the slot holding argument k, in declaration order.
%kernel_args = alloca i8*, i64 14, align 16
%0 = bitcast i32* %iteration.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast float** %power.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast float** %temp_src.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
%6 = bitcast float** %temp_dst.addr to i8*
%7 = getelementptr i8*, i8** %kernel_args, i32 3
store i8* %6, i8** %7
%8 = bitcast i32* %grid_cols.addr to i8*
%9 = getelementptr i8*, i8** %kernel_args, i32 4
store i8* %8, i8** %9
%10 = bitcast i32* %grid_rows.addr to i8*
%11 = getelementptr i8*, i8** %kernel_args, i32 5
store i8* %10, i8** %11
%12 = bitcast i32* %border_cols.addr to i8*
%13 = getelementptr i8*, i8** %kernel_args, i32 6
store i8* %12, i8** %13
%14 = bitcast i32* %border_rows.addr to i8*
%15 = getelementptr i8*, i8** %kernel_args, i32 7
store i8* %14, i8** %15
%16 = bitcast float* %Cap.addr to i8*
%17 = getelementptr i8*, i8** %kernel_args, i32 8
store i8* %16, i8** %17
%18 = bitcast float* %Rx.addr to i8*
%19 = getelementptr i8*, i8** %kernel_args, i32 9
store i8* %18, i8** %19
%20 = bitcast float* %Ry.addr to i8*
%21 = getelementptr i8*, i8** %kernel_args, i32 10
store i8* %20, i8** %21
%22 = bitcast float* %Rz.addr to i8*
%23 = getelementptr i8*, i8** %kernel_args, i32 11
store i8* %22, i8** %23
%24 = bitcast float* %step.addr to i8*
%25 = getelementptr i8*, i8** %kernel_args, i32 12
store i8* %24, i8** %25
%26 = bitcast float* %time_elapsed.addr to i8*
%27 = getelementptr i8*, i8** %kernel_args, i32 13
store i8* %26, i8** %27
; Retrieve grid/block dimensions, shared-memory size and stream; the i32 result
; (%28) is not checked.
%28 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%29 = load i64, i64* %shmem_size, align 8
%30 = load i8*, i8** %stream, align 8
; Split grid_dim into an i64 (first 8 bytes) and an i32 (last 4 bytes).
%31 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%32 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %31, i8* align 8 %32, i64 12, i1 false)
%33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%34 = load i64, i64* %33, align 8
%35 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%36 = load i32, i32* %35, align 8
; Same split for block_dim.
%37 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%38 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 12, i1 false)
%39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%40 = load i64, i64* %39, align 8
%41 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%42 = load i32, i32* %41, align 8
%43 = bitcast i8* %30 to %struct.CUstream_st*
; The first argument is this stub's own address, used as the kernel handle.
; The launch result (%call) is discarded.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i64 %34, i32 %36, i64 %40, i32 %42, i8** %kernel_args, i64 %29, %struct.CUstream_st* %43)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
; Function Attrs: noinline optnone uwtable
; Drives the simulation loop. It derives the per-cell constants from the chip
; globals and the grid size, then repeatedly launches the calculate_temp stub
; while alternating between MatrixTemp[0] and MatrixTemp[1] as source and
; destination buffers. Returns the index (0 or 1) that was the destination of
; the last loop iteration, or the initial value 0 if the loop body never ran.
define dso_local i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %MatrixPower, float** %MatrixTemp, i32 %col, i32 %row, i32 %total_iterations, i32 %num_iterations, i32 %blockCols, i32 %blockRows, i32 %borderCols, i32 %borderRows) #0 {
entry:
%MatrixPower.addr = alloca float*, align 8
%MatrixTemp.addr = alloca float**, align 8
%col.addr = alloca i32, align 4
%row.addr = alloca i32, align 4
%total_iterations.addr = alloca i32, align 4
%num_iterations.addr = alloca i32, align 4
%blockCols.addr = alloca i32, align 4
%blockRows.addr = alloca i32, align 4
%borderCols.addr = alloca i32, align 4
%borderRows.addr = alloca i32, align 4
%dimBlock = alloca %struct.dim3, align 4
%dimGrid = alloca %struct.dim3, align 4
%grid_height = alloca float, align 4
%grid_width = alloca float, align 4
%Cap = alloca float, align 4
%Rx = alloca float, align 4
%Ry = alloca float, align 4
%Rz = alloca float, align 4
%max_slope = alloca float, align 4
%step = alloca float, align 4
%t = alloca float, align 4
%time_elapsed = alloca float, align 4
%src = alloca i32, align 4
%dst = alloca i32, align 4
%temp = alloca i32, align 4
%agg.tmp = alloca %struct.dim3, align 4
%agg.tmp35 = alloca %struct.dim3, align 4
%agg.tmp.coerce = alloca { i64, i32 }, align 4
%agg.tmp35.coerce = alloca { i64, i32 }, align 4
store float* %MatrixPower, float** %MatrixPower.addr, align 8
store float** %MatrixTemp, float*** %MatrixTemp.addr, align 8
store i32 %col, i32* %col.addr, align 4
store i32 %row, i32* %row.addr, align 4
store i32 %total_iterations, i32* %total_iterations.addr, align 4
store i32 %num_iterations, i32* %num_iterations.addr, align 4
store i32 %blockCols, i32* %blockCols.addr, align 4
store i32 %blockRows, i32* %blockRows.addr, align 4
store i32 %borderCols, i32* %borderCols.addr, align 4
store i32 %borderRows, i32* %borderRows.addr, align 4
; dimBlock = dim3(16, 16, 1); dimGrid = dim3(blockCols, blockRows, 1).
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1)
%0 = load i32, i32* %blockCols.addr, align 4
%1 = load i32, i32* %blockRows.addr, align 4
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %0, i32 %1, i32 1)
; grid_height = chip_height / row
%2 = load float, float* @chip_height, align 4
%3 = load i32, i32* %row.addr, align 4
%conv = sitofp i32 %3 to float
%div = fdiv float %2, %conv
store float %div, float* %grid_height, align 4
; grid_width = chip_width / col
%4 = load float, float* @chip_width, align 4
%5 = load i32, i32* %col.addr, align 4
%conv1 = sitofp i32 %5 to float
%div2 = fdiv float %4, %conv1
store float %div2, float* %grid_width, align 4
; Cap = 8.75e5 * t_chip * grid_width * grid_height, evaluated in double and
; truncated to float.
%6 = load float, float* @t_chip, align 4
%conv3 = fpext float %6 to double
%mul = fmul contract double 8.750000e+05, %conv3
%7 = load float, float* %grid_width, align 4
%conv4 = fpext float %7 to double
%mul5 = fmul contract double %mul, %conv4
%8 = load float, float* %grid_height, align 4
%conv6 = fpext float %8 to double
%mul7 = fmul contract double %mul5, %conv6
%conv8 = fptrunc double %mul7 to float
store float %conv8, float* %Cap, align 4
; Rx = grid_width / (200 * t_chip * grid_height), evaluated in double.
%9 = load float, float* %grid_width, align 4
%conv9 = fpext float %9 to double
%10 = load float, float* @t_chip, align 4
%conv10 = fpext float %10 to double
%mul11 = fmul contract double 2.000000e+02, %conv10
%11 = load float, float* %grid_height, align 4
%conv12 = fpext float %11 to double
%mul13 = fmul contract double %mul11, %conv12
%div14 = fdiv double %conv9, %mul13
%conv15 = fptrunc double %div14 to float
store float %conv15, float* %Rx, align 4
; Ry = grid_height / (200 * t_chip * grid_width), evaluated in double.
%12 = load float, float* %grid_height, align 4
%conv16 = fpext float %12 to double
%13 = load float, float* @t_chip, align 4
%conv17 = fpext float %13 to double
%mul18 = fmul contract double 2.000000e+02, %conv17
%14 = load float, float* %grid_width, align 4
%conv19 = fpext float %14 to double
%mul20 = fmul contract double %mul18, %conv19
%div21 = fdiv double %conv16, %mul20
%conv22 = fptrunc double %div21 to float
store float %conv22, float* %Ry, align 4
; Rz = t_chip / (100 * grid_height * grid_width), evaluated in float.
%15 = load float, float* @t_chip, align 4
%16 = load float, float* %grid_height, align 4
%mul23 = fmul contract float 1.000000e+02, %16
%17 = load float, float* %grid_width, align 4
%mul24 = fmul contract float %mul23, %17
%div25 = fdiv float %15, %mul24
store float %div25, float* %Rz, align 4
; max_slope = 3.0e6 / (0.5 * t_chip * 1.75e6), evaluated in double.
%18 = load float, float* @t_chip, align 4
%conv26 = fpext float %18 to double
%mul27 = fmul contract double 5.000000e-01, %conv26
%mul28 = fmul contract double %mul27, 1.750000e+06
%div29 = fdiv double 3.000000e+06, %mul28
%conv30 = fptrunc double %div29 to float
store float %conv30, float* %max_slope, align 4
; step = 0.001 / max_slope
%19 = load float, float* %max_slope, align 4
%conv31 = fpext float %19 to double
%div32 = fdiv double 1.000000e-03, %conv31
%conv33 = fptrunc double %div32 to float
store float %conv33, float* %step, align 4
; time_elapsed is a fixed constant (the hex literal decodes to roughly 1.0e-3);
; it is never updated inside the loop.
store float 0x3F50624DE0000000, float* %time_elapsed, align 4
; src = 1, dst = 0; they are swapped at the top of every iteration, so the
; first launch reads MatrixTemp[0] and writes MatrixTemp[1].
store i32 1, i32* %src, align 4
store i32 0, i32* %dst, align 4
; t is a float loop counter starting at 0 and advanced by num_iterations.
store float 0.000000e+00, float* %t, align 4
br label %for.cond
; Loop while t < (float)total_iterations.
for.cond: ; preds = %for.inc, %entry
%20 = load float, float* %t, align 4
%21 = load i32, i32* %total_iterations.addr, align 4
%conv34 = sitofp i32 %21 to float
%cmp = fcmp olt float %20, %conv34
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
; Swap src and dst through temp.
%22 = load i32, i32* %src, align 4
store i32 %22, i32* %temp, align 4
%23 = load i32, i32* %dst, align 4
store i32 %23, i32* %src, align 4
%24 = load i32, i32* %temp, align 4
store i32 %24, i32* %dst, align 4
; Copy dimGrid and dimBlock into temporaries, then split each 12-byte dim3
; into an { i64, i32 } pair for the call below.
%25 = bitcast %struct.dim3* %agg.tmp to i8*
%26 = bitcast %struct.dim3* %dimGrid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %25, i8* align 4 %26, i64 12, i1 false)
%27 = bitcast %struct.dim3* %agg.tmp35 to i8*
%28 = bitcast %struct.dim3* %dimBlock to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %27, i8* align 4 %28, i64 12, i1 false)
%29 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
%30 = bitcast %struct.dim3* %agg.tmp to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %29, i8* align 4 %30, i64 12, i1 false)
%31 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
%32 = load i64, i64* %31, align 4
%33 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
%34 = load i32, i32* %33, align 4
%35 = bitcast { i64, i32 }* %agg.tmp35.coerce to i8*
%36 = bitcast %struct.dim3* %agg.tmp35 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %35, i8* align 4 %36, i64 12, i1 false)
%37 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 0
%38 = load i64, i64* %37, align 4
%39 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp35.coerce, i32 0, i32 1
%40 = load i32, i32* %39, align 4
; Push the launch configuration (grid, block, 0 bytes of shared memory, null
; stream). A non-zero result skips the launch for this iteration.
%call = call i32 @__cudaPushCallConfiguration(i64 %32, i32 %34, i64 %38, i32 %40, i64 0, i8* null)
%tobool = icmp ne i32 %call, 0
br i1 %tobool, label %kcall.end, label %kcall.configok
kcall.configok: ; preds = %for.body
; First kernel argument: num_iterations if it is <= total_iterations - t,
; otherwise the remainder total_iterations - t (compared in float).
%41 = load i32, i32* %num_iterations.addr, align 4
%conv36 = sitofp i32 %41 to float
%42 = load i32, i32* %total_iterations.addr, align 4
%conv37 = sitofp i32 %42 to float
%43 = load float, float* %t, align 4
%sub = fsub contract float %conv37, %43
%cmp38 = fcmp ole float %conv36, %sub
br i1 %cmp38, label %cond.true, label %cond.false
cond.true: ; preds = %kcall.configok
%44 = load i32, i32* %num_iterations.addr, align 4
%conv39 = sitofp i32 %44 to float
br label %cond.end
cond.false: ; preds = %kcall.configok
%45 = load i32, i32* %total_iterations.addr, align 4
%conv40 = sitofp i32 %45 to float
%46 = load float, float* %t, align 4
%sub41 = fsub contract float %conv40, %46
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi float [ %conv39, %cond.true ], [ %sub41, %cond.false ]
%conv42 = fptosi float %cond to i32
; Gather the remaining arguments: power matrix, MatrixTemp[src] as input,
; MatrixTemp[dst] as output, grid size, border sizes and the constants above.
%47 = load float*, float** %MatrixPower.addr, align 8
%48 = load float**, float*** %MatrixTemp.addr, align 8
%49 = load i32, i32* %src, align 4
%idxprom = sext i32 %49 to i64
%arrayidx = getelementptr inbounds float*, float** %48, i64 %idxprom
%50 = load float*, float** %arrayidx, align 8
%51 = load float**, float*** %MatrixTemp.addr, align 8
%52 = load i32, i32* %dst, align 4
%idxprom43 = sext i32 %52 to i64
%arrayidx44 = getelementptr inbounds float*, float** %51, i64 %idxprom43
%53 = load float*, float** %arrayidx44, align 8
%54 = load i32, i32* %col.addr, align 4
%55 = load i32, i32* %row.addr, align 4
%56 = load i32, i32* %borderCols.addr, align 4
%57 = load i32, i32* %borderRows.addr, align 4
%58 = load float, float* %Cap, align 4
%59 = load float, float* %Rx, align 4
%60 = load float, float* %Ry, align 4
%61 = load float, float* %Rz, align 4
%62 = load float, float* %step, align 4
%63 = load float, float* %time_elapsed, align 4
call void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %conv42, float* %47, float* %50, float* %53, i32 %54, i32 %55, i32 %56, i32 %57, float %58, float %59, float %60, float %61, float %62, float %63)
br label %kcall.end
kcall.end: ; preds = %cond.end, %for.body
; Called on every iteration, whether or not the launch happened; the result
; is discarded.
%call45 = call i32 @cudaDeviceSynchronize()
br label %for.inc
for.inc: ; preds = %kcall.end
; t += (float)num_iterations
%64 = load i32, i32* %num_iterations.addr, align 4
%conv46 = sitofp i32 %64 to float
%65 = load float, float* %t, align 4
%add = fadd contract float %65, %conv46
store float %add, float* %t, align 4
br label %for.cond
for.end: ; preds = %for.cond
; Return dst: the index of the buffer written by the last iteration.
%66 = load i32, i32* %dst, align 4
ret i32 %66
}
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #4 comdat align 2 {
; dim3::dim3(unsigned vx, unsigned vy, unsigned vz): stores the three extents
; into fields 0, 1 and 2 of the object addressed by %this.
entry:
  ; Stack homes for the object pointer and the three incoming values.
  %self.slot = alloca %struct.dim3*, align 8
  %x.slot = alloca i32, align 4
  %y.slot = alloca i32, align 4
  %z.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %self.slot, align 8
  store i32 %vx, i32* %x.slot, align 4
  store i32 %vy, i32* %y.slot, align 4
  store i32 %vz, i32* %z.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %self.slot, align 8

  ; this->x = vx
  %x.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  %x.value = load i32, i32* %x.slot, align 4
  store i32 %x.value, i32* %x.field, align 4

  ; this->y = vy
  %y.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  %y.value = load i32, i32* %y.slot, align 4
  store i32 %y.value, i32* %y.field, align 4

  ; this->z = vz
  %z.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  %z.value = load i32, i32* %z.slot, align 4
  store i32 %z.value, i32* %z.field, align 4
  ret void
}
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #1
declare dso_local i32 @cudaDeviceSynchronize() #1
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #0 {
; usage(int argc, char **argv): prints the command-line synopsis and a
; description of each argument to stderr, then exits with status 1.
; Control never returns to the caller.
entry:
  ; argc is spilled but never read afterwards.
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8

  ; Synopsis line, with argv[0] substituted for the program name.
  %err.synopsis = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %args = load i8**, i8*** %argv.slot, align 8
  %prog.ptr = getelementptr inbounds i8*, i8** %args, i64 0
  %prog = load i8*, i8** %prog.ptr, align 8
  %out.synopsis = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.synopsis, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @.str.8, i64 0, i64 0), i8* %prog)

  ; One line per argument; stderr is reloaded before each call.
  %err.grid = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.grid = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.grid, i8* getelementptr inbounds ([78 x i8], [78 x i8]* @.str.9, i64 0, i64 0))
  %err.pyramid = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.pyramid = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.pyramid, i8* getelementptr inbounds ([53 x i8], [53 x i8]* @.str.10, i64 0, i64 0))
  %err.simtime = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.simtime = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.simtime, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str.11, i64 0, i64 0))
  %err.tempfile = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.tempfile = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.tempfile, i8* getelementptr inbounds ([89 x i8], [89 x i8]* @.str.12, i64 0, i64 0))
  %err.powerfile = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.powerfile = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.powerfile, i8* getelementptr inbounds ([86 x i8], [86 x i8]* @.str.13, i64 0, i64 0))
  %err.outfile = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %out.outfile = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.outfile, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.14, i64 0, i64 0))

  ; exit(1) does not return.
  call void @exit(i32 1) #9
  unreachable
}
; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #5
; Program entry point: selects device 0, prints the @.str.15 banner with the
; two constants 16 and 16, forwards argc/argv to run() (_Z3runiPPc) and
; returns 0. The result of cudaSetDevice is not inspected.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #6 {
entry:
  %result.slot = alloca i32, align 4
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 0, i32* %result.slot, align 4
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  %device.status = call i32 @cudaSetDevice(i32 0)
  %banner.len = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.15, i64 0, i64 0), i32 16, i32 16)
  ; Hand the untouched command line to run().
  %argc.val = load i32, i32* %argc.slot, align 4
  %argv.val = load i8**, i8*** %argv.slot, align 8
  call void @_Z3runiPPc(i32 %argc.val, i8** %argv.val)
  ret i32 0
}
declare dso_local i32 @cudaSetDevice(i32) #1
; run(int argc, char **argv) -- Itanium-mangled _Z3runiPPc.
;
; Drives one simulation from the command line:
;   argv[1]  parsed with atoi twice, once into grid_rows and once into grid_cols
;   argv[2]  pyramid_height
;   argv[3]  total_iterations
;   argv[4]  tfile  (read into FilesavingTemp by readinput)
;   argv[5]  pfile  (read into FilesavingPower by readinput)
;   argv[6]  ofile  (written from MatrixOut by writeoutput)
; usage() is called when argc != 7, when any of the numeric arguments is <= 0,
; or when the derived tile size 16 - 2*pyramid_height is not positive (that
; value is used as a divisor below, so 0 would trap and a negative value would
; yield negative block counts).
;
; After validation the function allocates three host buffers of
; grid_rows*grid_cols floats, reads both input files, allocates three device
; buffers with cudaMalloc, copies the inputs over (cudaMemcpy kind 1), calls
; compute_tran_temp, copies MatrixTemp[ret] back (cudaMemcpy kind 2), writes
; the output file and releases every device and host buffer.
; Return codes of the cuda* calls are not inspected.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z3runiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %size = alloca i32, align 4
  %grid_rows = alloca i32, align 4
  %grid_cols = alloca i32, align 4
  %FilesavingTemp = alloca float*, align 8
  %FilesavingPower = alloca float*, align 8
  %MatrixOut = alloca float*, align 8
  %tfile = alloca i8*, align 8
  %pfile = alloca i8*, align 8
  %ofile = alloca i8*, align 8
  %total_iterations = alloca i32, align 4
  %pyramid_height = alloca i32, align 4
  %borderCols = alloca i32, align 4
  %borderRows = alloca i32, align 4
  %smallBlockCol = alloca i32, align 4
  %smallBlockRow = alloca i32, align 4
  %blockCols = alloca i32, align 4
  %blockRows = alloca i32, align 4
  %MatrixTemp = alloca [2 x float*], align 16
  %MatrixPower = alloca float*, align 8
  %ret = alloca i32, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  ; Defaults, overwritten by the parsed arguments below.
  store i32 60, i32* %total_iterations, align 4
  store i32 1, i32* %pyramid_height, align 4
  ; Exactly six arguments plus the program name are required.
  %0 = load i32, i32* %argc.addr, align 4
  %cmp = icmp ne i32 %0, 7
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, i32* %argc.addr, align 4
  %2 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %1, i8** %2)
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ; grid_rows = atoi(argv[1]); must be > 0.
  %3 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %3, i64 1
  %4 = load i8*, i8** %arrayidx, align 8
  %call = call i32 @atoi(i8* %4) #10
  store i32 %call, i32* %grid_rows, align 4
  %cmp1 = icmp sle i32 %call, 0
  br i1 %cmp1, label %if.then13, label %lor.lhs.false

lor.lhs.false:                                    ; preds = %if.end
  ; grid_cols = atoi(argv[1]) as well (same argument index as grid_rows).
  %5 = load i8**, i8*** %argv.addr, align 8
  %arrayidx2 = getelementptr inbounds i8*, i8** %5, i64 1
  %6 = load i8*, i8** %arrayidx2, align 8
  %call3 = call i32 @atoi(i8* %6) #10
  store i32 %call3, i32* %grid_cols, align 4
  %cmp4 = icmp sle i32 %call3, 0
  br i1 %cmp4, label %if.then13, label %lor.lhs.false5

lor.lhs.false5:                                   ; preds = %lor.lhs.false
  ; pyramid_height = atoi(argv[2]); must be > 0.
  %7 = load i8**, i8*** %argv.addr, align 8
  %arrayidx6 = getelementptr inbounds i8*, i8** %7, i64 2
  %8 = load i8*, i8** %arrayidx6, align 8
  %call7 = call i32 @atoi(i8* %8) #10
  store i32 %call7, i32* %pyramid_height, align 4
  %cmp8 = icmp sle i32 %call7, 0
  br i1 %cmp8, label %if.then13, label %lor.lhs.false9

lor.lhs.false9:                                   ; preds = %lor.lhs.false5
  ; total_iterations = atoi(argv[3]); must be > 0.
  %9 = load i8**, i8*** %argv.addr, align 8
  %arrayidx10 = getelementptr inbounds i8*, i8** %9, i64 3
  %10 = load i8*, i8** %arrayidx10, align 8
  %call11 = call i32 @atoi(i8* %10) #10
  store i32 %call11, i32* %total_iterations, align 4
  %cmp12 = icmp sle i32 %call11, 0
  br i1 %cmp12, label %if.then13, label %if.end14

if.then13:                                        ; preds = %lor.lhs.false9, %lor.lhs.false5, %lor.lhs.false, %if.end
  %11 = load i32, i32* %argc.addr, align 4
  %12 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %11, i8** %12)
  br label %if.end14

if.end14:                                         ; preds = %if.then13, %lor.lhs.false9
  ; File name arguments: argv[4], argv[5], argv[6].
  %13 = load i8**, i8*** %argv.addr, align 8
  %arrayidx15 = getelementptr inbounds i8*, i8** %13, i64 4
  %14 = load i8*, i8** %arrayidx15, align 8
  store i8* %14, i8** %tfile, align 8
  %15 = load i8**, i8*** %argv.addr, align 8
  %arrayidx16 = getelementptr inbounds i8*, i8** %15, i64 5
  %16 = load i8*, i8** %arrayidx16, align 8
  store i8* %16, i8** %pfile, align 8
  %17 = load i8**, i8*** %argv.addr, align 8
  %arrayidx17 = getelementptr inbounds i8*, i8** %17, i64 6
  %18 = load i8*, i8** %arrayidx17, align 8
  store i8* %18, i8** %ofile, align 8
  ; size = grid_rows * grid_cols (element count of every buffer).
  %19 = load i32, i32* %grid_rows, align 4
  %20 = load i32, i32* %grid_cols, align 4
  %mul = mul nsw i32 %19, %20
  store i32 %mul, i32* %size, align 4
  ; borderCols = borderRows = pyramid_height * 2 / 2.
  %21 = load i32, i32* %pyramid_height, align 4
  %mul18 = mul nsw i32 %21, 2
  %div = sdiv i32 %mul18, 2
  store i32 %div, i32* %borderCols, align 4
  %22 = load i32, i32* %pyramid_height, align 4
  %mul19 = mul nsw i32 %22, 2
  %div20 = sdiv i32 %mul19, 2
  store i32 %div20, i32* %borderRows, align 4
  ; smallBlockCol = smallBlockRow = 16 - pyramid_height * 2.
  ; NOTE(review): 16 is presumably the kernel's block edge length -- confirm.
  %23 = load i32, i32* %pyramid_height, align 4
  %mul21 = mul nsw i32 %23, 2
  %sub = sub nsw i32 16, %mul21
  store i32 %sub, i32* %smallBlockCol, align 4
  %24 = load i32, i32* %pyramid_height, align 4
  %mul22 = mul nsw i32 %24, 2
  %sub23 = sub nsw i32 16, %mul22
  store i32 %sub23, i32* %smallBlockRow, align 4
  ; Both tile edges are divisors in the block-count computation that follows;
  ; reject a pyramid height that leaves them zero or negative.
  %tile.col.pos = icmp sgt i32 %sub, 0
  %tile.row.pos = icmp sgt i32 %sub23, 0
  %tile.valid = and i1 %tile.col.pos, %tile.row.pos
  br i1 %tile.valid, label %tile.good, label %tile.bad

tile.bad:                                         ; preds = %if.end14
  ; usage() prints the help text and calls exit(1).
  %argc.reload = load i32, i32* %argc.addr, align 4
  %argv.reload = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %argc.reload, i8** %argv.reload)
  br label %tile.good

tile.good:                                        ; preds = %tile.bad, %if.end14
  ; blockCols = grid_cols / smallBlockCol, plus 1 when there is a remainder.
  %25 = load i32, i32* %grid_cols, align 4
  %26 = load i32, i32* %smallBlockCol, align 4
  %div24 = sdiv i32 %25, %26
  %27 = load i32, i32* %grid_cols, align 4
  %28 = load i32, i32* %smallBlockCol, align 4
  %rem = srem i32 %27, %28
  %cmp25 = icmp eq i32 %rem, 0
  %29 = zext i1 %cmp25 to i64
  %cond = select i1 %cmp25, i32 0, i32 1
  %add = add nsw i32 %div24, %cond
  store i32 %add, i32* %blockCols, align 4
  ; blockRows = grid_rows / smallBlockRow, plus 1 when there is a remainder.
  %30 = load i32, i32* %grid_rows, align 4
  %31 = load i32, i32* %smallBlockRow, align 4
  %div26 = sdiv i32 %30, %31
  %32 = load i32, i32* %grid_rows, align 4
  %33 = load i32, i32* %smallBlockRow, align 4
  %rem27 = srem i32 %32, %33
  %cmp28 = icmp eq i32 %rem27, 0
  %34 = zext i1 %cmp28 to i64
  %cond29 = select i1 %cmp28, i32 0, i32 1
  %add30 = add nsw i32 %div26, %cond29
  store i32 %add30, i32* %blockRows, align 4
  ; Host buffers: two malloc(size * 4) inputs and one calloc(size, 4) output.
  %35 = load i32, i32* %size, align 4
  %conv = sext i32 %35 to i64
  %mul31 = mul i64 %conv, 4
  %call32 = call noalias i8* @malloc(i64 %mul31) #8
  %36 = bitcast i8* %call32 to float*
  store float* %36, float** %FilesavingTemp, align 8
  %37 = load i32, i32* %size, align 4
  %conv33 = sext i32 %37 to i64
  %mul34 = mul i64 %conv33, 4
  %call35 = call noalias i8* @malloc(i64 %mul34) #8
  %38 = bitcast i8* %call35 to float*
  store float* %38, float** %FilesavingPower, align 8
  %39 = load i32, i32* %size, align 4
  %conv36 = sext i32 %39 to i64
  %call37 = call noalias i8* @calloc(i64 %conv36, i64 4) #8
  %40 = bitcast i8* %call37 to float*
  store float* %40, float** %MatrixOut, align 8
  ; Report via fatal() if any of the three host allocations returned null.
  %41 = load float*, float** %FilesavingPower, align 8
  %tobool = icmp ne float* %41, null
  br i1 %tobool, label %lor.lhs.false38, label %if.then42

lor.lhs.false38:                                  ; preds = %tile.good
  %42 = load float*, float** %FilesavingTemp, align 8
  %tobool39 = icmp ne float* %42, null
  br i1 %tobool39, label %lor.lhs.false40, label %if.then42

lor.lhs.false40:                                  ; preds = %lor.lhs.false38
  %43 = load float*, float** %MatrixOut, align 8
  %tobool41 = icmp ne float* %43, null
  br i1 %tobool41, label %if.end43, label %if.then42

if.then42:                                        ; preds = %lor.lhs.false40, %lor.lhs.false38, %tile.good
  call void @_Z5fatalPc(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.16, i64 0, i64 0))
  br label %if.end43

if.end43:                                         ; preds = %if.then42, %lor.lhs.false40
  ; Print the derived configuration using the @.str.17 format.
  %44 = load i32, i32* %pyramid_height, align 4
  %45 = load i32, i32* %grid_cols, align 4
  %46 = load i32, i32* %grid_rows, align 4
  %47 = load i32, i32* %borderCols, align 4
  %48 = load i32, i32* %borderRows, align 4
  %49 = load i32, i32* %blockCols, align 4
  %50 = load i32, i32* %blockRows, align 4
  %51 = load i32, i32* %smallBlockCol, align 4
  %52 = load i32, i32* %smallBlockRow, align 4
  %call44 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([94 x i8], [94 x i8]* @.str.17, i64 0, i64 0), i32 %44, i32 %45, i32 %46, i32 %47, i32 %48, i32 %49, i32 %50, i32 %51, i32 %52)
  ; Read both input files into the host buffers.
  %53 = load float*, float** %FilesavingTemp, align 8
  %54 = load i32, i32* %grid_rows, align 4
  %55 = load i32, i32* %grid_cols, align 4
  %56 = load i8*, i8** %tfile, align 8
  call void @_Z9readinputPfiiPc(float* %53, i32 %54, i32 %55, i8* %56)
  %57 = load float*, float** %FilesavingPower, align 8
  %58 = load i32, i32* %grid_rows, align 4
  %59 = load i32, i32* %grid_cols, align 4
  %60 = load i8*, i8** %pfile, align 8
  call void @_Z9readinputPfiiPc(float* %57, i32 %58, i32 %59, i8* %60)
  ; Device buffers MatrixTemp[0] and MatrixTemp[1], 4 * size bytes each.
  %arrayidx45 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %61 = bitcast float** %arrayidx45 to i8**
  %62 = load i32, i32* %size, align 4
  %conv46 = sext i32 %62 to i64
  %mul47 = mul i64 4, %conv46
  %call48 = call i32 @cudaMalloc(i8** %61, i64 %mul47)
  %arrayidx49 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1
  %63 = bitcast float** %arrayidx49 to i8**
  %64 = load i32, i32* %size, align 4
  %conv50 = sext i32 %64 to i64
  %mul51 = mul i64 4, %conv50
  %call52 = call i32 @cudaMalloc(i8** %63, i64 %mul51)
  ; MatrixTemp[0] <- FilesavingTemp (copy kind 1).
  %arrayidx53 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %65 = load float*, float** %arrayidx53, align 16
  %66 = bitcast float* %65 to i8*
  %67 = load float*, float** %FilesavingTemp, align 8
  %68 = bitcast float* %67 to i8*
  %69 = load i32, i32* %size, align 4
  %conv54 = sext i32 %69 to i64
  %mul55 = mul i64 4, %conv54
  %call56 = call i32 @cudaMemcpy(i8* %66, i8* %68, i64 %mul55, i32 1)
  ; MatrixPower: allocate, then <- FilesavingPower (copy kind 1).
  %70 = bitcast float** %MatrixPower to i8**
  %71 = load i32, i32* %size, align 4
  %conv57 = sext i32 %71 to i64
  %mul58 = mul i64 4, %conv57
  %call59 = call i32 @cudaMalloc(i8** %70, i64 %mul58)
  %72 = load float*, float** %MatrixPower, align 8
  %73 = bitcast float* %72 to i8*
  %74 = load float*, float** %FilesavingPower, align 8
  %75 = bitcast float* %74 to i8*
  %76 = load i32, i32* %size, align 4
  %conv60 = sext i32 %76 to i64
  %mul61 = mul i64 4, %conv60
  %call62 = call i32 @cudaMemcpy(i8* %73, i8* %75, i64 %mul61, i32 1)
  %call63 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.18, i64 0, i64 0))
  ; Run the simulation; the i32 result is stored in %ret and later used as
  ; the index into MatrixTemp for the copy back to the host.
  %77 = load float*, float** %MatrixPower, align 8
  %arraydecay = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %78 = load i32, i32* %grid_cols, align 4
  %79 = load i32, i32* %grid_rows, align 4
  %80 = load i32, i32* %total_iterations, align 4
  %81 = load i32, i32* %pyramid_height, align 4
  %82 = load i32, i32* %blockCols, align 4
  %83 = load i32, i32* %blockRows, align 4
  %84 = load i32, i32* %borderCols, align 4
  %85 = load i32, i32* %borderRows, align 4
  %call64 = call i32 @_Z17compute_tran_tempPfPS_iiiiiiii(float* %77, float** %arraydecay, i32 %78, i32 %79, i32 %80, i32 %81, i32 %82, i32 %83, i32 %84, i32 %85)
  store i32 %call64, i32* %ret, align 4
  %call65 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.19, i64 0, i64 0))
  ; MatrixOut <- MatrixTemp[ret] (copy kind 2).
  %86 = load float*, float** %MatrixOut, align 8
  %87 = bitcast float* %86 to i8*
  %88 = load i32, i32* %ret, align 4
  %idxprom = sext i32 %88 to i64
  %arrayidx66 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 %idxprom
  %89 = load float*, float** %arrayidx66, align 8
  %90 = bitcast float* %89 to i8*
  %91 = load i32, i32* %size, align 4
  %conv67 = sext i32 %91 to i64
  %mul68 = mul i64 4, %conv67
  %call69 = call i32 @cudaMemcpy(i8* %87, i8* %90, i64 %mul68, i32 2)
  %92 = load float*, float** %MatrixOut, align 8
  %93 = load i32, i32* %grid_rows, align 4
  %94 = load i32, i32* %grid_cols, align 4
  %95 = load i8*, i8** %ofile, align 8
  call void @_Z11writeoutputPfiiPc(float* %92, i32 %93, i32 %94, i8* %95)
  ; Release the three device buffers.
  %96 = load float*, float** %MatrixPower, align 8
  %97 = bitcast float* %96 to i8*
  %call70 = call i32 @cudaFree(i8* %97)
  %arrayidx71 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 0
  %98 = load float*, float** %arrayidx71, align 16
  %99 = bitcast float* %98 to i8*
  %call72 = call i32 @cudaFree(i8* %99)
  %arrayidx73 = getelementptr inbounds [2 x float*], [2 x float*]* %MatrixTemp, i64 0, i64 1
  %100 = load float*, float** %arrayidx73, align 8
  %101 = bitcast float* %100 to i8*
  %call74 = call i32 @cudaFree(i8* %101)
  ; Release the three host buffers. The two input staging buffers are last
  ; read by the cudaMemcpy calls above; free(null) is a no-op, so the path
  ; that went through fatal() is covered too.
  %102 = load float*, float** %MatrixOut, align 8
  %103 = bitcast float* %102 to i8*
  call void @free(i8* %103) #8
  %host.temp = load float*, float** %FilesavingTemp, align 8
  %host.temp.raw = bitcast float* %host.temp to i8*
  call void @free(i8* %host.temp.raw) #8
  %host.power = load float*, float** %FilesavingPower, align 8
  %host.power.raw = bitcast float* %host.power to i8*
  call void @free(i8* %host.power.raw) #8
  ret void
}
; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #7
; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #2
; Function Attrs: nounwind
declare dso_local noalias i8* @calloc(i64, i64) #2
declare dso_local i32 @cudaMalloc(i8**, i64) #1
declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1
declare dso_local i32 @cudaFree(i8*) #1
; Function Attrs: nounwind
declare dso_local void @free(i8*) #2
; Registers the host stub @_Z14calculate_tempiPfS_S_iiiiffffff with the
; runtime through __cudaRegisterFunction, using the string @0 for both name
; arguments and -1 / null for the remaining ones. The first argument is the
; handle passed in by the caller; the returned status is discarded.
define internal void @__cuda_register_globals(i8** %fatbin.handle) {
entry:
  %reg.status = call i32 @__cudaRegisterFunction(i8** %fatbin.handle, i8* bitcast (void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff to i8*), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([36 x i8], [36 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)
declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)
declare dso_local i8** @__cudaRegisterFatBinary(i8*)
; Module constructor: registers @__cuda_fatbin_wrapper, saves the returned
; handle in @__cuda_gpubin_handle, registers the module's kernels, signals the
; end of registration, and schedules @__cuda_module_dtor through atexit.
; The incoming pointer argument is not used.
define internal void @__cuda_module_ctor(i8* %unused) {
entry:
  %handle = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  ; Keep the handle so the destructor can unregister it later.
  store i8** %handle, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %handle)
  call void @__cudaRegisterFatBinaryEnd(i8** %handle)
  %atexit.status = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}
declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)
declare dso_local void @__cudaUnregisterFatBinary(i8**)
; Module destructor: reloads the handle saved in @__cuda_gpubin_handle and
; passes it to __cudaUnregisterFatBinary. The incoming pointer argument is
; not used.
define internal void @__cuda_module_dtor(i8* %unused) {
entry:
  %handle = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %handle)
  ret void
}
declare dso_local i32 @atexit(void (i8*)*)
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { argmemonly nounwind willreturn }
attributes #4 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { nounwind }
attributes #9 = { noreturn nounwind }
attributes #10 = { nounwind readonly }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}