CuPBoP/examples/hotspot3D/3D-host-x86_64-unknown-linu...

; ModuleID = '3D-host-x86_64-unknown-linux-gnu.bc'
; Host-side module for the translation unit named in source_filename below.
source_filename = "3D.cu"
; Leading "e" in the data layout selects little-endian; the triple selects x86-64 Linux.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Named struct types referenced by the functions in this module.
; C stdio FILE record and its marker record (presumably the glibc layout — confirm against the
; toolchain headers); the functions reviewed here only handle it through pointers.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Two i64 fields; @_Z8get_timev reads index 0 as tv_sec and index 1 as tv_usec.
%struct.timeval = type { i64, i64 }
; Only passed as a null pointer (to gettimeofday) in the functions reviewed here.
%struct.timezone = type { i32, i32 }
; Three i32 fields; the @_ZN4dim3C2Ejjj constructor stores x, y, z at indices 0, 1, 2.
%struct.dim3 = type { i32, i32, i32 }
; Opaque stream type; only used as a pointer (last parameter of cudaLaunchKernel).
%struct.CUstream_st = type opaque

; COMDAT groups with selection kind "any": if several objects carry the same group,
; the linker may keep whichever single copy it likes.
; Group for the linkonce_odr dim3 constructor defined in this module.
$_ZN4dim3C2Ejjj = comdat any

; Group named after std::sqrt(float); its function body is not in the portion reviewed here.
$_ZSt4sqrtf = comdat any

; Format string for the elapsed-time printf in @_Z12hotspot_opt1PfS_S_iiifffffi.
@.str = private unnamed_addr constant [16 x i8] c"Time: %.3f (s)\0A\00", align 1
; Mutable float globals. The hex literals are float values written in LLVM's double-hex form:
; 0x3F40624DE0000000 is approximately 0.0005 and 0x3F90624DE0000000 approximately 0.016.
; Units are not stated anywhere in this module.
@t_chip = dso_local global float 0x3F40624DE0000000, align 4
@chip_height = dso_local global float 0x3F90624DE0000000, align 4
@chip_width = dso_local global float 0x3F90624DE0000000, align 4
@amb_temp = dso_local global float 8.000000e+01, align 4
; C library stderr stream; external, resolved at link time.
@stderr = external dso_local global %struct._IO_FILE*, align 8
; Format string used by @_Z5fatalPKc.
@.str.1 = private unnamed_addr constant [11 x i8] c"Error: %s\0A\00", align 1
; @.str.2 .. @.str.7: fopen mode, error messages and sscanf format used by @_Z9readinputPfiiiPc.
@.str.2 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str.3 = private unnamed_addr constant [24 x i8] c"The file was not opened\00", align 1
@.str.4 = private unnamed_addr constant [20 x i8] c"Error reading file\0A\00", align 1
@.str.5 = private unnamed_addr constant [25 x i8] c"not enough lines in file\00", align 1
@.str.6 = private unnamed_addr constant [3 x i8] c"%f\00", align 1
@.str.7 = private unnamed_addr constant [20 x i8] c"invalid file format\00", align 1
; @.str.8 and @.str.9: fopen mode and failure message used by @_Z11writeoutputPfiiiPc.
@.str.8 = private unnamed_addr constant [2 x i8] c"w\00", align 1
@.str.9 = private unnamed_addr constant [25 x i8] c"The file was not opened\0A\00", align 1
; Remaining strings are referenced outside the portion reviewed here
; (their text reads as an output-line format, usage/help lines and an accuracy report).
@.str.10 = private unnamed_addr constant [7 x i8] c"%d\09%g\0A\00", align 1
@.str.11 = private unnamed_addr constant [81 x i8] c"Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> <outputFile>\0A\00", align 1
@.str.12 = private unnamed_addr constant [68 x i8] c"\09<rows/cols>  - number of rows/cols in the grid (positive integer)\0A\00", align 1
@.str.13 = private unnamed_addr constant [62 x i8] c"\09<layers>  - number of layers in the grid (positive integer)\0A\00", align 1
@.str.14 = private unnamed_addr constant [37 x i8] c"\09<iteration> - number of iterations\0A\00", align 1
@.str.15 = private unnamed_addr constant [83 x i8] c"\09<powerFile>  - name of the file containing the initial power values of each cell\0A\00", align 1
@.str.16 = private unnamed_addr constant [88 x i8] c"\09<tempFile>  - name of the file containing the initial temperature values of each cell\0A\00", align 1
@.str.17 = private unnamed_addr constant [28 x i8] c"\09<outputFile - output file\0A\00", align 1
@.str.18 = private unnamed_addr constant [14 x i8] c"Accuracy: %e\0A\00", align 1
; Mangled kernel name as a C string; presumably consumed by the kernel-registration code
; outside the portion reviewed here — confirm.
@0 = private unnamed_addr constant [33 x i8] c"_Z11hotspotOpt1PfS_S_fiiifffffff\00", align 1
@1 = private constant [27433 x i8] c"P\EDU\BA\01\00\10\00\18k\00\00\00\00\00\00\02\00\01\01@\00\00\00([\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\80Z\00\00\00\00\00\00@X\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\09\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.info._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.shared._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.global\00.nv.constant0._Z11hotspotOpt1PfS_S_fiiifffffff\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z11hotspotOpt1PfS_S_fiiifffffff\00.text._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.info._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.shared._Z11hotspotOpt1PfS_S_fiiifffffff\00.nv.global\00blockDim\00blockIdx\00threadIdx\00.nv.constant0._Z11hotspotOpt1PfS_S_fiiifffffff\00_param\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00S\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D0\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DB\00\00\00\01\00\08\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\E4\00\00\00\01\00\08\00\02\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\ED\00\00\00\01\00\08\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\F7\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\07\00\00\00\00\00\00\00\00\00@R\00\00\00\00\00\00\04/\08\00\07\00\00\00\17\00\00\00\04#\08\00\07\00\00\00\00\00\00\00\04\12\08\00\07\00\00\00x\00\00\00\04\11\08\00\07\00\00\00x\00\00\00\010\00\00\01*\00\00\04\0A\08\00\06\00\00\00@\01D\00\03\19D\00\04\17\0C\00\00\00\00\00\0D\00@\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0C\00<\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0B\008\00\00\F0\11\00\04\17\0C\00\00\00\00\00\0A\004\00\00\F0\11\00\04\17\0C\00\00\00\00\00\09
\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\08\00,\00\00\F0\11\00\04\17\0C\00\00\00\00\00\07\00(\00\00\F0\11\00\04\17\0C\00\00\00\00\00\06\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00\1C\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\08\00X\09\00\00\08\0A\00\00\04\1C\04\00\18R\00\00\04\1E\04\00 \00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EF\1F\E0\FD\03<d\00\01\00\87\00\80\07\98L\01\01\87\F8\FF\FF\0F\1C\00\00w\03\00\00\C8\F0\EF\1F\E0\FD\03\BC\7F\00\07\01\07\00\80\03l[\0F\00\80\00\00\00@\E2\C0\00\10\00\00\00\A0\E3\EF\1F\E0!\03\BC\7F\00\00\01\F7\0F\00\00\10\\\0
0\0A\07\00\00\00\E0\\\02\00\07\00\80\07\98\\\EF\1F\E0\FD\03\BC\7F\00\03\00\F7\0F\80\07\98\\\00\
; Fat-binary wrapper record placed in section ".nvFatBinSegment":
; { magic i32 1180844977 (0x466243B1), i32 1, pointer to the first byte of the @1 blob, null }.
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([27433 x i8], [27433 x i8]* @1, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
; Module-local handle slot, initially null; it is written outside the portion reviewed here
; (presumably by @__cuda_module_ctor — confirm).
@__cuda_gpubin_handle = internal global i8** null, align 8
; Registers @__cuda_module_ctor as a static constructor with priority 65535.
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]

; Function Attrs: noinline nounwind optnone uwtable
; get_time(): calls gettimeofday with a null timezone and returns
; tv_sec * 1000000 + tv_usec as an i64. The gettimeofday status is not inspected.
define dso_local i64 @_Z8get_timev() #0 {
entry:
  %now = alloca %struct.timeval, align 8
  %status = call i32 @gettimeofday(%struct.timeval* %now, %struct.timezone* null) #8
  ; Field addresses: index 0 is tv_sec, index 1 is tv_usec.
  %sec.ptr = getelementptr inbounds %struct.timeval, %struct.timeval* %now, i32 0, i32 0
  %usec.ptr = getelementptr inbounds %struct.timeval, %struct.timeval* %now, i32 0, i32 1
  %sec = load i64, i64* %sec.ptr, align 8
  %sec.scaled = mul nsw i64 %sec, 1000000
  %usec = load i64, i64* %usec.ptr, align 8
  %stamp = add nsw i64 %sec.scaled, %usec
  ret i64 %stamp
}

; Function Attrs: nounwind
; External declaration (resolved at link time); @_Z8get_timev calls it with a null timezone pointer.
declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #1

; Function Attrs: noinline optnone uwtable
; Host-side launch stub carrying the kernel's mangled name.
; It spills the 14 arguments to stack slots, builds an array of 14 pointers to those slots,
; pops the pending launch configuration (grid, block, shared-memory size, stream) and
; forwards everything to cudaLaunchKernel, using this function's own address as the handle.
; The status codes of both runtime calls are ignored.
define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #2 {
entry:
  ; One stack slot per incoming argument.
  %p.slot = alloca float*, align 8
  %tIn.slot = alloca float*, align 8
  %tOut.slot = alloca float*, align 8
  %sdc.slot = alloca float, align 4
  %nx.slot = alloca i32, align 4
  %ny.slot = alloca i32, align 4
  %nz.slot = alloca i32, align 4
  %ce.slot = alloca float, align 4
  %cw.slot = alloca float, align 4
  %cn.slot = alloca float, align 4
  %cs.slot = alloca float, align 4
  %ct.slot = alloca float, align 4
  %cb.slot = alloca float, align 4
  %cc.slot = alloca float, align 4
  ; Launch-configuration outputs plus their { i64, i32 } coerced copies.
  %grid = alloca %struct.dim3, align 8
  %block = alloca %struct.dim3, align 8
  %shmem.slot = alloca i64, align 8
  %stream.slot = alloca i8*, align 8
  %grid.packed = alloca { i64, i32 }, align 8
  %block.packed = alloca { i64, i32 }, align 8
  store float* %p, float** %p.slot, align 8
  store float* %tIn, float** %tIn.slot, align 8
  store float* %tOut, float** %tOut.slot, align 8
  store float %sdc, float* %sdc.slot, align 4
  store i32 %nx, i32* %nx.slot, align 4
  store i32 %ny, i32* %ny.slot, align 4
  store i32 %nz, i32* %nz.slot, align 4
  store float %ce, float* %ce.slot, align 4
  store float %cw, float* %cw.slot, align 4
  store float %cn, float* %cn.slot, align 4
  store float %cs, float* %cs.slot, align 4
  store float %ct, float* %ct.slot, align 4
  store float %cb, float* %cb.slot, align 4
  store float %cc, float* %cc.slot, align 4
  ; argv[i] receives the address of the i-th argument slot, in parameter order.
  %argv = alloca i8*, i64 14, align 16
  %argv.0 = getelementptr i8*, i8** %argv, i32 0
  %p.raw = bitcast float** %p.slot to i8*
  store i8* %p.raw, i8** %argv.0
  %argv.1 = getelementptr i8*, i8** %argv, i32 1
  %tIn.raw = bitcast float** %tIn.slot to i8*
  store i8* %tIn.raw, i8** %argv.1
  %argv.2 = getelementptr i8*, i8** %argv, i32 2
  %tOut.raw = bitcast float** %tOut.slot to i8*
  store i8* %tOut.raw, i8** %argv.2
  %argv.3 = getelementptr i8*, i8** %argv, i32 3
  %sdc.raw = bitcast float* %sdc.slot to i8*
  store i8* %sdc.raw, i8** %argv.3
  %argv.4 = getelementptr i8*, i8** %argv, i32 4
  %nx.raw = bitcast i32* %nx.slot to i8*
  store i8* %nx.raw, i8** %argv.4
  %argv.5 = getelementptr i8*, i8** %argv, i32 5
  %ny.raw = bitcast i32* %ny.slot to i8*
  store i8* %ny.raw, i8** %argv.5
  %argv.6 = getelementptr i8*, i8** %argv, i32 6
  %nz.raw = bitcast i32* %nz.slot to i8*
  store i8* %nz.raw, i8** %argv.6
  %argv.7 = getelementptr i8*, i8** %argv, i32 7
  %ce.raw = bitcast float* %ce.slot to i8*
  store i8* %ce.raw, i8** %argv.7
  %argv.8 = getelementptr i8*, i8** %argv, i32 8
  %cw.raw = bitcast float* %cw.slot to i8*
  store i8* %cw.raw, i8** %argv.8
  %argv.9 = getelementptr i8*, i8** %argv, i32 9
  %cn.raw = bitcast float* %cn.slot to i8*
  store i8* %cn.raw, i8** %argv.9
  %argv.10 = getelementptr i8*, i8** %argv, i32 10
  %cs.raw = bitcast float* %cs.slot to i8*
  store i8* %cs.raw, i8** %argv.10
  %argv.11 = getelementptr i8*, i8** %argv, i32 11
  %ct.raw = bitcast float* %ct.slot to i8*
  store i8* %ct.raw, i8** %argv.11
  %argv.12 = getelementptr i8*, i8** %argv, i32 12
  %cb.raw = bitcast float* %cb.slot to i8*
  store i8* %cb.raw, i8** %argv.12
  %argv.13 = getelementptr i8*, i8** %argv, i32 13
  %cc.raw = bitcast float* %cc.slot to i8*
  store i8* %cc.raw, i8** %argv.13
  ; Fetch the configuration into the four output slots.
  %pop.status = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid, %struct.dim3* %block, i64* %shmem.slot, i8** %stream.slot)
  %shmem.bytes = load i64, i64* %shmem.slot, align 8
  %stream.raw = load i8*, i8** %stream.slot, align 8
  ; Coerce each 12-byte dim3 to { i64, i32 }: the i64 carries x and y, the i32 carries z.
  %grid.packed.raw = bitcast { i64, i32 }* %grid.packed to i8*
  %grid.raw = bitcast %struct.dim3* %grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %grid.packed.raw, i8* align 8 %grid.raw, i64 12, i1 false)
  %grid.xy.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 0
  %grid.xy = load i64, i64* %grid.xy.ptr, align 8
  %grid.z.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 1
  %grid.z = load i32, i32* %grid.z.ptr, align 8
  %block.packed.raw = bitcast { i64, i32 }* %block.packed to i8*
  %block.raw = bitcast %struct.dim3* %block to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %block.packed.raw, i8* align 8 %block.raw, i64 12, i1 false)
  %block.xy.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 0
  %block.xy = load i64, i64* %block.xy.ptr, align 8
  %block.z.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 1
  %block.z = load i32, i32* %block.z.ptr, align 8
  %stream = bitcast i8* %stream.raw to %struct.CUstream_st*
  %launch.status = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff to i8*), i64 %grid.xy, i32 %grid.z, i64 %block.xy, i32 %block.z, i8** %argv, i64 %shmem.bytes, %struct.CUstream_st* %stream)
  br label %launch.done

launch.done:                                      ; preds = %entry
  ret void
}

; Declared only. The launch stub passes pointers to its grid dim3, block dim3,
; shared-memory-size slot and stream slot, then reads those slots back.
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)

; Declared only. As called by the launch stub: kernel handle, grid dims as (i64, i32),
; block dims as (i64, i32), array of argument pointers, shared-memory size, stream.
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)

; Function Attrs: argmemonly nounwind willreturn
; LLVM memcpy intrinsic; the functions reviewed here use it for 12-byte dim3 copies.
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3

; Function Attrs: noinline optnone uwtable
; hotspot_opt1(float* p, float* tIn, float* tOut, int nx, int ny, int nz,
;              float Cap, float Rx, float Ry, float Rz, float dt, int numiter)
; Host driver: derives the stencil coefficients, allocates three buffers of
; 4*nx*ny*nz bytes through cudaMalloc, copies tIn and p into two of them, launches the
; hotspotOpt1 stub numiter times (swapping the tIn_d/tOut_d pointers after each launch),
; prints the elapsed time, copies one buffer back into tOut and frees all three.
; None of the CUDA runtime status codes are inspected.
define dso_local void @_Z12hotspot_opt1PfS_S_iiifffffi(float* %p, float* %tIn, float* %tOut, i32 %nx, i32 %ny, i32 %nz, float %Cap, float %Rx, float %Ry, float %Rz, float %dt, i32 %numiter) #2 {
entry:
  %p.addr = alloca float*, align 8
  %tIn.addr = alloca float*, align 8
  %tOut.addr = alloca float*, align 8
  %nx.addr = alloca i32, align 4
  %ny.addr = alloca i32, align 4
  %nz.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %dt.addr = alloca float, align 4
  %numiter.addr = alloca i32, align 4
  %ce = alloca float, align 4
  %cw = alloca float, align 4
  %cn = alloca float, align 4
  %cs = alloca float, align 4
  %ct = alloca float, align 4
  %cb = alloca float, align 4
  %cc = alloca float, align 4
  %stepDivCap = alloca float, align 4
  %s = alloca i64, align 8
  %tIn_d = alloca float*, align 8
  %tOut_d = alloca float*, align 8
  %p_d = alloca float*, align 8
  %block_dim = alloca %struct.dim3, align 4
  %grid_dim = alloca %struct.dim3, align 4
  %start = alloca i64, align 8
  %i = alloca i32, align 4
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp23 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp23.coerce = alloca { i64, i32 }, align 4
  %t = alloca float*, align 8
  %stop = alloca i64, align 8
  %time = alloca float, align 4
  store float* %p, float** %p.addr, align 8
  store float* %tIn, float** %tIn.addr, align 8
  store float* %tOut, float** %tOut.addr, align 8
  store i32 %nx, i32* %nx.addr, align 4
  store i32 %ny, i32* %ny.addr, align 4
  store i32 %nz, i32* %nz.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %dt, float* %dt.addr, align 4
  store i32 %numiter, i32* %numiter.addr, align 4
  ; stepDivCap = dt / Cap
  %0 = load float, float* %dt.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %stepDivCap, align 4
  ; ce = cw = stepDivCap / Rx
  %2 = load float, float* %stepDivCap, align 4
  %3 = load float, float* %Rx.addr, align 4
  %div1 = fdiv float %2, %3
  store float %div1, float* %cw, align 4
  store float %div1, float* %ce, align 4
  ; cn = cs = stepDivCap / Ry
  %4 = load float, float* %stepDivCap, align 4
  %5 = load float, float* %Ry.addr, align 4
  %div2 = fdiv float %4, %5
  store float %div2, float* %cs, align 4
  store float %div2, float* %cn, align 4
  ; ct = cb = stepDivCap / Rz
  %6 = load float, float* %stepDivCap, align 4
  %7 = load float, float* %Rz.addr, align 4
  %div3 = fdiv float %6, %7
  store float %div3, float* %cb, align 4
  store float %div3, float* %ct, align 4
  ; cc = 1.0 - (2.0*ce + 2.0*cn + 3.0*ct), evaluated in double and truncated to float.
  %8 = load float, float* %ce, align 4
  %conv = fpext float %8 to double
  %mul = fmul contract double 2.000000e+00, %conv
  %9 = load float, float* %cn, align 4
  %conv4 = fpext float %9 to double
  %mul5 = fmul contract double 2.000000e+00, %conv4
  %add = fadd contract double %mul, %mul5
  %10 = load float, float* %ct, align 4
  %conv6 = fpext float %10 to double
  %mul7 = fmul contract double 3.000000e+00, %conv6
  %add8 = fadd contract double %add, %mul7
  %sub = fsub contract double 1.000000e+00, %add8
  %conv9 = fptrunc double %sub to float
  store float %conv9, float* %cc, align 4
  ; s = 4 * nx * ny * nz in 64-bit arithmetic (byte size of each buffer; 4 is presumably sizeof(float)).
  %11 = load i32, i32* %nx.addr, align 4
  %conv10 = sext i32 %11 to i64
  %mul11 = mul i64 4, %conv10
  %12 = load i32, i32* %ny.addr, align 4
  %conv12 = sext i32 %12 to i64
  %mul13 = mul i64 %mul11, %conv12
  %13 = load i32, i32* %nz.addr, align 4
  %conv14 = sext i32 %13 to i64
  %mul15 = mul i64 %mul13, %conv14
  store i64 %mul15, i64* %s, align 8
  ; Allocate p_d, tIn_d and tOut_d, s bytes each. The returned status codes are discarded.
  %14 = bitcast float** %p_d to i8**
  %15 = load i64, i64* %s, align 8
  %call = call i32 @cudaMalloc(i8** %14, i64 %15)
  %16 = bitcast float** %tIn_d to i8**
  %17 = load i64, i64* %s, align 8
  %call16 = call i32 @cudaMalloc(i8** %16, i64 %17)
  %18 = bitcast float** %tOut_d to i8**
  %19 = load i64, i64* %s, align 8
  %call17 = call i32 @cudaMalloc(i8** %18, i64 %19)
  ; cudaMemcpy(tIn_d, tIn, s, 1) then cudaMemcpy(p_d, p, s, 1).
  ; Kind 1 is cudaMemcpyHostToDevice in the CUDA runtime enum — confirm against the headers in use.
  %20 = load float*, float** %tIn_d, align 8
  %21 = bitcast float* %20 to i8*
  %22 = load float*, float** %tIn.addr, align 8
  %23 = bitcast float* %22 to i8*
  %24 = load i64, i64* %s, align 8
  %call18 = call i32 @cudaMemcpy(i8* %21, i8* %23, i64 %24, i32 1)
  %25 = load float*, float** %p_d, align 8
  %26 = bitcast float* %25 to i8*
  %27 = load float*, float** %p.addr, align 8
  %28 = bitcast float* %27 to i8*
  %29 = load i64, i64* %s, align 8
  %call19 = call i32 @cudaMemcpy(i8* %26, i8* %28, i64 %29, i32 1)
  ; block_dim = dim3(64, 4, 1); grid_dim = dim3(nx / 64, ny / 4, 1).
  ; NOTE(review): both divisions are truncating sdiv, so when nx is not a multiple of 64 or
  ; ny not a multiple of 4 the grid does not span the remainder. Whether the kernel copes
  ; with that is not visible from this module.
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %block_dim, i32 64, i32 4, i32 1)
  %30 = load i32, i32* %nx.addr, align 4
  %div20 = sdiv i32 %30, 64
  %31 = load i32, i32* %ny.addr, align 4
  %div21 = sdiv i32 %31, 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid_dim, i32 %div20, i32 %div21, i32 1)
  ; start = get_time(); i = 0
  %call22 = call i64 @_Z8get_timev()
  store i64 %call22, i64* %start, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < numiter (signed compare).
  %32 = load i32, i32* %i, align 4
  %33 = load i32, i32* %numiter.addr, align 4
  %cmp = icmp slt i32 %32, %33
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Copy grid_dim and block_dim into by-value temporaries, then coerce each to { i64, i32 }.
  %34 = bitcast %struct.dim3* %agg.tmp to i8*
  %35 = bitcast %struct.dim3* %grid_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %34, i8* align 4 %35, i64 12, i1 false)
  %36 = bitcast %struct.dim3* %agg.tmp23 to i8*
  %37 = bitcast %struct.dim3* %block_dim to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %36, i8* align 4 %37, i64 12, i1 false)
  %38 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %39 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %38, i8* align 4 %39, i64 12, i1 false)
  %40 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %41 = load i64, i64* %40, align 4
  %42 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %43 = load i32, i32* %42, align 4
  %44 = bitcast { i64, i32 }* %agg.tmp23.coerce to i8*
  %45 = bitcast %struct.dim3* %agg.tmp23 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %44, i8* align 4 %45, i64 12, i1 false)
  %46 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp23.coerce, i32 0, i32 0
  %47 = load i64, i64* %46, align 4
  %48 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp23.coerce, i32 0, i32 1
  %49 = load i32, i32* %48, align 4
  ; Push (grid, block, 0, null) as the launch configuration; a non-zero result skips the launch.
  %call24 = call i32 @__cudaPushCallConfiguration(i64 %41, i32 %43, i64 %47, i32 %49, i64 0, i8* null)
  %tobool = icmp ne i32 %call24, 0
  br i1 %tobool, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %for.body
  ; Call the launch stub with (p_d, tIn_d, tOut_d, stepDivCap, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc).
  %50 = load float*, float** %p_d, align 8
  %51 = load float*, float** %tIn_d, align 8
  %52 = load float*, float** %tOut_d, align 8
  %53 = load float, float* %stepDivCap, align 4
  %54 = load i32, i32* %nx.addr, align 4
  %55 = load i32, i32* %ny.addr, align 4
  %56 = load i32, i32* %nz.addr, align 4
  %57 = load float, float* %ce, align 4
  %58 = load float, float* %cw, align 4
  %59 = load float, float* %cn, align 4
  %60 = load float, float* %cs, align 4
  %61 = load float, float* %ct, align 4
  %62 = load float, float* %cb, align 4
  %63 = load float, float* %cc, align 4
  call void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %50, float* %51, float* %52, float %53, i32 %54, i32 %55, i32 %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %for.body
  ; Swap tIn_d and tOut_d through the temporary t. This also runs when the launch was skipped.
  %64 = load float*, float** %tIn_d, align 8
  store float* %64, float** %t, align 8
  %65 = load float*, float** %tOut_d, align 8
  store float* %65, float** %tIn_d, align 8
  %66 = load float*, float** %t, align 8
  store float* %66, float** %tOut_d, align 8
  br label %for.inc

for.inc:                                          ; preds = %kcall.end
  %67 = load i32, i32* %i, align 4
  %inc = add nsw i32 %67, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ; Synchronize, then time = float((stop - start) / 1e6) and print it with "Time: %.3f (s)".
  %call25 = call i32 @cudaDeviceSynchronize()
  %call26 = call i64 @_Z8get_timev()
  store i64 %call26, i64* %stop, align 8
  %68 = load i64, i64* %stop, align 8
  %69 = load i64, i64* %start, align 8
  %sub27 = sub nsw i64 %68, %69
  %conv28 = sitofp i64 %sub27 to double
  %div29 = fdiv double %conv28, 1.000000e+06
  %conv30 = fptrunc double %div29 to float
  store float %conv30, float* %time, align 4
  %70 = load float, float* %time, align 4
  %conv31 = fpext float %70 to double
  %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str, i64 0, i64 0), double %conv31)
  ; cudaMemcpy(tOut, tOut_d, s, 2); kind 2 is cudaMemcpyDeviceToHost in the CUDA runtime enum — confirm.
  ; NOTE(review): because of the swap at the end of every iteration, tOut_d here is the
  ; buffer that was passed as the tIn argument of the last launch, not the one passed as
  ; tOut. Verify that this is the intended source.
  %71 = load float*, float** %tOut.addr, align 8
  %72 = bitcast float* %71 to i8*
  %73 = load float*, float** %tOut_d, align 8
  %74 = bitcast float* %73 to i8*
  %75 = load i64, i64* %s, align 8
  %call33 = call i32 @cudaMemcpy(i8* %72, i8* %74, i64 %75, i32 2)
  ; Release p_d, tIn_d and tOut_d.
  %76 = load float*, float** %p_d, align 8
  %77 = bitcast float* %76 to i8*
  %call34 = call i32 @cudaFree(i8* %77)
  %78 = load float*, float** %tIn_d, align 8
  %79 = bitcast float* %78 to i8*
  %call35 = call i32 @cudaFree(i8* %79)
  %80 = load float*, float** %tOut_d, align 8
  %81 = bitcast float* %80 to i8*
  %call36 = call i32 @cudaFree(i8* %81)
  ret void
}

; Declared only. hotspot_opt1 calls it as (address of the pointer slot to fill, size in bytes).
declare dso_local i32 @cudaMalloc(i8**, i64) #4

; Declared only. hotspot_opt1 calls it as (destination, source, byte count, copy-kind constant).
declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #4

; Function Attrs: noinline nounwind optnone uwtable
; dim3 constructor: writes vx, vy, vz into fields 0, 1, 2 of the object addressed by this.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #0 comdat align 2 {
entry:
  %self.slot = alloca %struct.dim3*, align 8
  %vx.slot = alloca i32, align 4
  %vy.slot = alloca i32, align 4
  %vz.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %self.slot, align 8
  store i32 %vx, i32* %vx.slot, align 4
  store i32 %vy, i32* %vy.slot, align 4
  store i32 %vz, i32* %vz.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %self.slot, align 8
  ; Field addresses are computed up front; the load/store pairs follow in x, y, z order.
  %x.ptr = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  %y.ptr = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  %z.ptr = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  %vx.val = load i32, i32* %vx.slot, align 4
  store i32 %vx.val, i32* %x.ptr, align 4
  %vy.val = load i32, i32* %vy.slot, align 4
  store i32 %vy.val, i32* %y.ptr, align 4
  %vz.val = load i32, i32* %vz.slot, align 4
  store i32 %vz.val, i32* %z.ptr, align 4
  ret void
}

; Declared only. hotspot_opt1 passes grid dims as (i64, i32), block dims as (i64, i32),
; then i64 0 and a null pointer, and skips the kernel launch when the result is non-zero.
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4

; Declared only. Called once by hotspot_opt1 after its launch loop, before the stop timestamp.
declare dso_local i32 @cudaDeviceSynchronize() #4

; C library printf (variadic); external.
declare dso_local i32 @printf(i8*, ...) #4

; Declared only. hotspot_opt1 calls it on each of the three pointers filled in by cudaMalloc.
declare dso_local i32 @cudaFree(i8*) #4

; Function Attrs: noinline optnone uwtable
; fatal(const char* s): prints "Error: <s>\n" to stderr through fprintf and then returns
; to the caller. It does not terminate the process.
define dso_local void @_Z5fatalPKc(i8* %s) #2 {
entry:
  %msg.slot = alloca i8*, align 8
  store i8* %s, i8** %msg.slot, align 8
  %err.stream = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %msg = load i8*, i8** %msg.slot, align 8
  %printed = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.stream, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.1, i64 0, i64 0), i8* %msg)
  ret void
}

; C library fprintf (variadic); external. Used by @_Z5fatalPKc to write to stderr.
declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #4

; Function Attrs: noinline optnone uwtable
; readinput(float* vect, int grid_rows, int grid_cols, int layers, char* file)
; Opens `file` for reading and, for every (i, j, k) with i in [0, grid_rows),
; j in [0, grid_cols), k in [0, layers) — k varying fastest — reads one line, parses one
; float from it with "%f" and stores it at vect[i*grid_cols + j + k*grid_rows*grid_cols].
; NOTE(review): every error path calls @_Z5fatalPKc, which in this module only prints and
; returns. Execution therefore continues after each reported error; in particular a failed
; fopen leaves fp null and that null stream is then handed to fgets, feof and fclose.
define dso_local void @_Z9readinputPfiiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i32 %layers, i8* %file) #2 {
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %layers.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %k = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  %str = alloca [256 x i8], align 16
  %val = alloca float, align 4
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %layers, i32* %layers.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  ; fp = fopen(file, "r")
  %0 = load i8*, i8** %file.addr, align 8
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  ; Report the open failure, then fall through into the read loops (see NOTE above).
  call void @_Z5fatalPKc(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.3, i64 0, i64 0))
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc28, %if.end
  ; Outer loop: continue while i <= grid_rows - 1.
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %sub = sub nsw i32 %2, 1
  %cmp1 = icmp sle i32 %1, %sub
  br i1 %cmp1, label %for.body, label %for.end30

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond2

for.cond2:                                        ; preds = %for.inc25, %for.body
  ; Middle loop: continue while j <= grid_cols - 1.
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %sub3 = sub nsw i32 %4, 1
  %cmp4 = icmp sle i32 %3, %sub3
  br i1 %cmp4, label %for.body5, label %for.end27

for.body5:                                        ; preds = %for.cond2
  store i32 0, i32* %k, align 4
  br label %for.cond6

for.cond6:                                        ; preds = %for.inc, %for.body5
  ; Inner loop: continue while k <= layers - 1.
  %5 = load i32, i32* %k, align 4
  %6 = load i32, i32* %layers.addr, align 4
  %sub7 = sub nsw i32 %6, 1
  %cmp8 = icmp sle i32 %5, %sub7
  br i1 %cmp8, label %for.body9, label %for.end

for.body9:                                        ; preds = %for.cond6
  ; Read one line (at most 255 characters plus the terminator) into str.
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %7 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call10 = call i8* @fgets(i8* %arraydecay, i32 256, %struct._IO_FILE* %7)
  %cmp11 = icmp eq i8* %call10, null
  br i1 %cmp11, label %if.then12, label %if.end13

if.then12:                                        ; preds = %for.body9
  call void @_Z5fatalPKc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.4, i64 0, i64 0))
  br label %if.end13

if.end13:                                         ; preds = %if.then12, %for.body9
  ; A set end-of-file indicator after the read is reported as "not enough lines in file".
  %8 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call14 = call i32 @feof(%struct._IO_FILE* %8) #8
  %tobool = icmp ne i32 %call14, 0
  br i1 %tobool, label %if.then15, label %if.end16

if.then15:                                        ; preds = %if.end13
  call void @_Z5fatalPKc(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.5, i64 0, i64 0))
  br label %if.end16

if.end16:                                         ; preds = %if.then15, %if.end13
  ; Parse one float from the line into val; any result other than 1 is reported as a format error.
  %arraydecay17 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %call18 = call i32 (i8*, i8*, ...) @sscanf(i8* %arraydecay17, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0), float* %val) #8
  %cmp19 = icmp ne i32 %call18, 1
  br i1 %cmp19, label %if.then20, label %if.end21

if.then20:                                        ; preds = %if.end16
  call void @_Z5fatalPKc(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str.7, i64 0, i64 0))
  br label %if.end21

if.end21:                                         ; preds = %if.then20, %if.end16
  ; vect[i*grid_cols + j + k*grid_rows*grid_cols] = val (index computed in 32-bit, then sign-extended).
  %9 = load float, float* %val, align 4
  %10 = load float*, float** %vect.addr, align 8
  %11 = load i32, i32* %i, align 4
  %12 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %11, %12
  %13 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %13
  %14 = load i32, i32* %k, align 4
  %15 = load i32, i32* %grid_rows.addr, align 4
  %mul22 = mul nsw i32 %14, %15
  %16 = load i32, i32* %grid_cols.addr, align 4
  %mul23 = mul nsw i32 %mul22, %16
  %add24 = add nsw i32 %add, %mul23
  %idxprom = sext i32 %add24 to i64
  %arrayidx = getelementptr inbounds float, float* %10, i64 %idxprom
  store float %9, float* %arrayidx, align 4
  br label %for.inc

for.inc:                                          ; preds = %if.end21
  %17 = load i32, i32* %k, align 4
  %inc = add nsw i32 %17, 1
  store i32 %inc, i32* %k, align 4
  br label %for.cond6

for.end:                                          ; preds = %for.cond6
  br label %for.inc25

for.inc25:                                        ; preds = %for.end
  %18 = load i32, i32* %j, align 4
  %inc26 = add nsw i32 %18, 1
  store i32 %inc26, i32* %j, align 4
  br label %for.cond2

for.end27:                                        ; preds = %for.cond2
  br label %for.inc28

for.inc28:                                        ; preds = %for.end27
  %19 = load i32, i32* %i, align 4
  %inc29 = add nsw i32 %19, 1
  store i32 %inc29, i32* %i, align 4
  br label %for.cond

for.end30:                                        ; preds = %for.cond
  ; Close the stream; the fclose result is discarded.
  %20 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call31 = call i32 @fclose(%struct._IO_FILE* %20)
  ret void
}

; C library stdio routines used by the file readers/writers in this module; all external.
; fopen(path, mode): the callers compare the returned stream against null.
declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #4

; fgets(buffer, size, stream): readinput treats a null return as a read error.
declare dso_local i8* @fgets(i8*, i32, %struct._IO_FILE*) #4

; Function Attrs: nounwind
; feof(stream): readinput treats a non-zero result as "not enough lines in file".
declare dso_local i32 @feof(%struct._IO_FILE*) #1

; Function Attrs: nounwind
; sscanf(text, format, ...): readinput expects exactly one converted item.
declare dso_local i32 @sscanf(i8*, i8*, ...) #1

; fclose(stream): readinput discards the result.
declare dso_local i32 @fclose(%struct._IO_FILE*) #4

; writeoutput(float* vect, i32 grid_rows, i32 grid_cols, i32 layers, i8* file)
;
; Opens `file` with mode @.str.8 ("w") and writes one text line per element
; using format @.str.10 ("%d\t%g\n"): a running counter (%index, starting at 0)
; followed by the element value widened to double.
;
; Iteration order is i in [0, grid_rows), j in [0, grid_cols), k in [0, layers),
; with k innermost. The element read on each step is
;     vect[i * grid_cols + j + k * grid_rows * grid_cols].
;
; If fopen returns null, the message @.str.9 is printed and the function returns
; immediately. Previously control fell through to the loops, so fputs and fclose
; were called with a null stream.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z11writeoutputPfiiiPc(float* %vect, i32 %grid_rows, i32 %grid_cols, i32 %layers, i8* %file) #2 {
entry:
  %vect.addr = alloca float*, align 8
  %grid_rows.addr = alloca i32, align 4
  %grid_cols.addr = alloca i32, align 4
  %layers.addr = alloca i32, align 4
  %file.addr = alloca i8*, align 8
  %i = alloca i32, align 4
  %j = alloca i32, align 4
  %k = alloca i32, align 4
  %index = alloca i32, align 4
  %fp = alloca %struct._IO_FILE*, align 8
  ; 256-byte scratch buffer that sprintf formats each output line into.
  %str = alloca [256 x i8], align 16
  store float* %vect, float** %vect.addr, align 8
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %layers, i32* %layers.addr, align 4
  store i8* %file, i8** %file.addr, align 8
  store i32 0, i32* %index, align 4
  %0 = load i8*, i8** %file.addr, align 8
  %call = call %struct._IO_FILE* @fopen(i8* %0, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.8, i64 0, i64 0))
  store %struct._IO_FILE* %call, %struct._IO_FILE** %fp, align 8
  %cmp = icmp eq %struct._IO_FILE* %call, null
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.9, i64 0, i64 0))
  ; Nothing can be written without a stream: bail out instead of handing a
  ; null FILE* to fputs/fclose below.
  ret void

if.end:                                           ; preds = %entry
  store i32 0, i32* %i, align 4
  br label %for.cond

; Outer loop: i < grid_rows.
for.cond:                                         ; preds = %for.inc19, %if.end
  %1 = load i32, i32* %i, align 4
  %2 = load i32, i32* %grid_rows.addr, align 4
  %cmp2 = icmp slt i32 %1, %2
  br i1 %cmp2, label %for.body, label %for.end21

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %j, align 4
  br label %for.cond3

; Middle loop: j < grid_cols.
for.cond3:                                        ; preds = %for.inc16, %for.body
  %3 = load i32, i32* %j, align 4
  %4 = load i32, i32* %grid_cols.addr, align 4
  %cmp4 = icmp slt i32 %3, %4
  br i1 %cmp4, label %for.body5, label %for.end18

for.body5:                                        ; preds = %for.cond3
  store i32 0, i32* %k, align 4
  br label %for.cond6

; Inner loop: k < layers.
for.cond6:                                        ; preds = %for.inc, %for.body5
  %5 = load i32, i32* %k, align 4
  %6 = load i32, i32* %layers.addr, align 4
  %cmp7 = icmp slt i32 %5, %6
  br i1 %cmp7, label %for.body8, label %for.end

for.body8:                                        ; preds = %for.cond6
  %arraydecay = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %7 = load i32, i32* %index, align 4
  %8 = load float*, float** %vect.addr, align 8
  ; Flat offset = i * grid_cols + j + k * grid_rows * grid_cols.
  %9 = load i32, i32* %i, align 4
  %10 = load i32, i32* %grid_cols.addr, align 4
  %mul = mul nsw i32 %9, %10
  %11 = load i32, i32* %j, align 4
  %add = add nsw i32 %mul, %11
  %12 = load i32, i32* %k, align 4
  %13 = load i32, i32* %grid_rows.addr, align 4
  %mul9 = mul nsw i32 %12, %13
  %14 = load i32, i32* %grid_cols.addr, align 4
  %mul10 = mul nsw i32 %mul9, %14
  %add11 = add nsw i32 %add, %mul10
  %idxprom = sext i32 %add11 to i64
  %arrayidx = getelementptr inbounds float, float* %8, i64 %idxprom
  %15 = load float, float* %arrayidx, align 4
  ; Variadic call: the float is widened to double before being passed.
  %conv = fpext float %15 to double
  %call12 = call i32 (i8*, i8*, ...) @sprintf(i8* %arraydecay, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.10, i64 0, i64 0), i32 %7, double %conv) #8
  %arraydecay13 = getelementptr inbounds [256 x i8], [256 x i8]* %str, i64 0, i64 0
  %16 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call14 = call i32 @fputs(i8* %arraydecay13, %struct._IO_FILE* %16)
  ; Advance the running line counter printed in the first column.
  %17 = load i32, i32* %index, align 4
  %inc = add nsw i32 %17, 1
  store i32 %inc, i32* %index, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body8
  %18 = load i32, i32* %k, align 4
  %inc15 = add nsw i32 %18, 1
  store i32 %inc15, i32* %k, align 4
  br label %for.cond6

for.end:                                          ; preds = %for.cond6
  br label %for.inc16

for.inc16:                                        ; preds = %for.end
  %19 = load i32, i32* %j, align 4
  %inc17 = add nsw i32 %19, 1
  store i32 %inc17, i32* %j, align 4
  br label %for.cond3

for.end18:                                        ; preds = %for.cond3
  br label %for.inc19

for.inc19:                                        ; preds = %for.end18
  %20 = load i32, i32* %i, align 4
  %inc20 = add nsw i32 %20, 1
  store i32 %inc20, i32* %i, align 4
  br label %for.cond

for.end21:                                        ; preds = %for.cond
  %21 = load %struct._IO_FILE*, %struct._IO_FILE** %fp, align 8
  %call22 = call i32 @fclose(%struct._IO_FILE* %21)
  ret void
}

; Function Attrs: nounwind
declare dso_local i32 @sprintf(i8*, i8*, ...) #1

declare dso_local i32 @fputs(i8*, %struct._IO_FILE*) #4

; computeTempCPU(float* pIn, float* tIn, float* tOut, i32 nx, i32 ny, i32 nz,
;                float Cap, float Rx, float Ry, float Rz, float dt, i32 numiter)
;
; Host-side sweep over an nx * ny * nz grid stored as a flat float array with
; linear index c = x + y*nx + z*nx*ny. For every cell it reads the cell itself
; and its six axis neighbours from tIn (a neighbour index collapses to c on a
; grid boundary) and stores to tOut[c]:
;
;   tIn[c]*cc + tIn[n]*cn + tIn[s]*cs + tIn[e]*ce + tIn[w]*cw
;     + tIn[t]*ct + tIn[b]*cb + (dt/Cap)*pIn[c] + ct*amb_temp
;
; Coefficients computed once in the entry block:
;   stepDivCap = dt / Cap
;   ce = cw = stepDivCap / Rx
;   cn = cs = stepDivCap / Ry
;   ct = cb = stepDivCap / Rz
;   cc = 1.0 - (2.0*ce + 2.0*cn + 3.0*ct)     (evaluated in double, then truncated)
;
; After each full sweep the function's local copies of the tIn/tOut pointers are
; swapped and the iteration counter is incremented. The loop is do/while shaped:
; the body runs once before `i < numiter` is first tested.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @_Z14computeTempCPUPfS_S_iiifffffi(float* %pIn, float* %tIn, float* %tOut, i32 %nx, i32 %ny, i32 %nz, float %Cap, float %Rx, float %Ry, float %Rz, float %dt, i32 %numiter) #0 {
entry:
  ; Stack slots for the incoming arguments.
  %pIn.addr = alloca float*, align 8
  %tIn.addr = alloca float*, align 8
  %tOut.addr = alloca float*, align 8
  %nx.addr = alloca i32, align 4
  %ny.addr = alloca i32, align 4
  %nz.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %dt.addr = alloca float, align 4
  %numiter.addr = alloca i32, align 4
  ; Per-direction coefficients (east, west, north, south, top, bottom, centre).
  %ce = alloca float, align 4
  %cw = alloca float, align 4
  %cn = alloca float, align 4
  %cs = alloca float, align 4
  %ct = alloca float, align 4
  %cb = alloca float, align 4
  %cc = alloca float, align 4
  %stepDivCap = alloca float, align 4
  ; Linear indices of the current cell (c) and its six neighbours.
  %c = alloca i32, align 4
  %w = alloca i32, align 4
  %e = alloca i32, align 4
  %n = alloca i32, align 4
  %s = alloca i32, align 4
  %b = alloca i32, align 4
  %t = alloca i32, align 4
  ; Loop counters: x/y/z walk the grid, i counts completed sweeps.
  %x = alloca i32, align 4
  %y = alloca i32, align 4
  %z = alloca i32, align 4
  %i = alloca i32, align 4
  ; Scratch slot used for the tIn/tOut pointer swap.
  %temp = alloca float*, align 8
  store float* %pIn, float** %pIn.addr, align 8
  store float* %tIn, float** %tIn.addr, align 8
  store float* %tOut, float** %tOut.addr, align 8
  store i32 %nx, i32* %nx.addr, align 4
  store i32 %ny, i32* %ny.addr, align 4
  store i32 %nz, i32* %nz.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %dt, float* %dt.addr, align 4
  store i32 %numiter, i32* %numiter.addr, align 4
  ; stepDivCap = dt / Cap
  %0 = load float, float* %dt.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %stepDivCap, align 4
  ; ce = cw = stepDivCap / Rx
  %2 = load float, float* %stepDivCap, align 4
  %3 = load float, float* %Rx.addr, align 4
  %div1 = fdiv float %2, %3
  store float %div1, float* %cw, align 4
  store float %div1, float* %ce, align 4
  ; cn = cs = stepDivCap / Ry
  %4 = load float, float* %stepDivCap, align 4
  %5 = load float, float* %Ry.addr, align 4
  %div2 = fdiv float %4, %5
  store float %div2, float* %cs, align 4
  store float %div2, float* %cn, align 4
  ; ct = cb = stepDivCap / Rz
  %6 = load float, float* %stepDivCap, align 4
  %7 = load float, float* %Rz.addr, align 4
  %div3 = fdiv float %6, %7
  store float %div3, float* %cb, align 4
  store float %div3, float* %ct, align 4
  ; cc = (float)(1.0 - (2.0*ce + 2.0*cn + 3.0*ct)), computed in double precision.
  %8 = load float, float* %ce, align 4
  %conv = fpext float %8 to double
  %mul = fmul contract double 2.000000e+00, %conv
  %9 = load float, float* %cn, align 4
  %conv4 = fpext float %9 to double
  %mul5 = fmul contract double 2.000000e+00, %conv4
  %add = fadd contract double %mul, %mul5
  %10 = load float, float* %ct, align 4
  %conv6 = fpext float %10 to double
  %mul7 = fmul contract double 3.000000e+00, %conv6
  %add8 = fadd contract double %add, %mul7
  %sub = fsub contract double 1.000000e+00, %add8
  %conv9 = fptrunc double %sub to float
  store float %conv9, float* %cc, align 4
  store i32 0, i32* %i, align 4
  br label %do.body

; Start of one full sweep over the grid (do/while body).
do.body:                                          ; preds = %do.cond, %entry
  store i32 0, i32* %z, align 4
  br label %for.cond

; Loop over z in [0, nz).
for.cond:                                         ; preds = %for.inc95, %do.body
  %11 = load i32, i32* %z, align 4
  %12 = load i32, i32* %nz.addr, align 4
  %cmp = icmp slt i32 %11, %12
  br i1 %cmp, label %for.body, label %for.end97

for.body:                                         ; preds = %for.cond
  store i32 0, i32* %y, align 4
  br label %for.cond10

; Loop over y in [0, ny).
for.cond10:                                       ; preds = %for.inc92, %for.body
  %13 = load i32, i32* %y, align 4
  %14 = load i32, i32* %ny.addr, align 4
  %cmp11 = icmp slt i32 %13, %14
  br i1 %cmp11, label %for.body12, label %for.end94

for.body12:                                       ; preds = %for.cond10
  store i32 0, i32* %x, align 4
  br label %for.cond13

; Loop over x in [0, nx).
for.cond13:                                       ; preds = %for.inc, %for.body12
  %15 = load i32, i32* %x, align 4
  %16 = load i32, i32* %nx.addr, align 4
  %cmp14 = icmp slt i32 %15, %16
  br i1 %cmp14, label %for.body15, label %for.end

for.body15:                                       ; preds = %for.cond13
  ; c = x + y*nx + z*nx*ny
  %17 = load i32, i32* %x, align 4
  %18 = load i32, i32* %y, align 4
  %19 = load i32, i32* %nx.addr, align 4
  %mul16 = mul nsw i32 %18, %19
  %add17 = add nsw i32 %17, %mul16
  %20 = load i32, i32* %z, align 4
  %21 = load i32, i32* %nx.addr, align 4
  %mul18 = mul nsw i32 %20, %21
  %22 = load i32, i32* %ny.addr, align 4
  %mul19 = mul nsw i32 %mul18, %22
  %add20 = add nsw i32 %add17, %mul19
  store i32 %add20, i32* %c, align 4
  ; w = (x == 0) ? c : c - 1
  %23 = load i32, i32* %x, align 4
  %cmp21 = icmp eq i32 %23, 0
  br i1 %cmp21, label %cond.true, label %cond.false

cond.true:                                        ; preds = %for.body15
  %24 = load i32, i32* %c, align 4
  br label %cond.end

cond.false:                                       ; preds = %for.body15
  %25 = load i32, i32* %c, align 4
  %sub22 = sub nsw i32 %25, 1
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %24, %cond.true ], [ %sub22, %cond.false ]
  store i32 %cond, i32* %w, align 4
  ; e = (x == nx - 1) ? c : c + 1
  %26 = load i32, i32* %x, align 4
  %27 = load i32, i32* %nx.addr, align 4
  %sub23 = sub nsw i32 %27, 1
  %cmp24 = icmp eq i32 %26, %sub23
  br i1 %cmp24, label %cond.true25, label %cond.false26

cond.true25:                                      ; preds = %cond.end
  %28 = load i32, i32* %c, align 4
  br label %cond.end28

cond.false26:                                     ; preds = %cond.end
  %29 = load i32, i32* %c, align 4
  %add27 = add nsw i32 %29, 1
  br label %cond.end28

cond.end28:                                       ; preds = %cond.false26, %cond.true25
  %cond29 = phi i32 [ %28, %cond.true25 ], [ %add27, %cond.false26 ]
  store i32 %cond29, i32* %e, align 4
  ; n = (y == 0) ? c : c - nx
  %30 = load i32, i32* %y, align 4
  %cmp30 = icmp eq i32 %30, 0
  br i1 %cmp30, label %cond.true31, label %cond.false32

cond.true31:                                      ; preds = %cond.end28
  %31 = load i32, i32* %c, align 4
  br label %cond.end34

cond.false32:                                     ; preds = %cond.end28
  %32 = load i32, i32* %c, align 4
  %33 = load i32, i32* %nx.addr, align 4
  %sub33 = sub nsw i32 %32, %33
  br label %cond.end34

cond.end34:                                       ; preds = %cond.false32, %cond.true31
  %cond35 = phi i32 [ %31, %cond.true31 ], [ %sub33, %cond.false32 ]
  store i32 %cond35, i32* %n, align 4
  ; s = (y == ny - 1) ? c : c + nx
  %34 = load i32, i32* %y, align 4
  %35 = load i32, i32* %ny.addr, align 4
  %sub36 = sub nsw i32 %35, 1
  %cmp37 = icmp eq i32 %34, %sub36
  br i1 %cmp37, label %cond.true38, label %cond.false39

cond.true38:                                      ; preds = %cond.end34
  %36 = load i32, i32* %c, align 4
  br label %cond.end41

cond.false39:                                     ; preds = %cond.end34
  %37 = load i32, i32* %c, align 4
  %38 = load i32, i32* %nx.addr, align 4
  %add40 = add nsw i32 %37, %38
  br label %cond.end41

cond.end41:                                       ; preds = %cond.false39, %cond.true38
  %cond42 = phi i32 [ %36, %cond.true38 ], [ %add40, %cond.false39 ]
  store i32 %cond42, i32* %s, align 4
  ; b = (z == 0) ? c : c - nx*ny
  %39 = load i32, i32* %z, align 4
  %cmp43 = icmp eq i32 %39, 0
  br i1 %cmp43, label %cond.true44, label %cond.false45

cond.true44:                                      ; preds = %cond.end41
  %40 = load i32, i32* %c, align 4
  br label %cond.end48

cond.false45:                                     ; preds = %cond.end41
  %41 = load i32, i32* %c, align 4
  %42 = load i32, i32* %nx.addr, align 4
  %43 = load i32, i32* %ny.addr, align 4
  %mul46 = mul nsw i32 %42, %43
  %sub47 = sub nsw i32 %41, %mul46
  br label %cond.end48

cond.end48:                                       ; preds = %cond.false45, %cond.true44
  %cond49 = phi i32 [ %40, %cond.true44 ], [ %sub47, %cond.false45 ]
  store i32 %cond49, i32* %b, align 4
  ; t = (z == nz - 1) ? c : c + nx*ny
  %44 = load i32, i32* %z, align 4
  %45 = load i32, i32* %nz.addr, align 4
  %sub50 = sub nsw i32 %45, 1
  %cmp51 = icmp eq i32 %44, %sub50
  br i1 %cmp51, label %cond.true52, label %cond.false53

cond.true52:                                      ; preds = %cond.end48
  %46 = load i32, i32* %c, align 4
  br label %cond.end56

cond.false53:                                     ; preds = %cond.end48
  %47 = load i32, i32* %c, align 4
  %48 = load i32, i32* %nx.addr, align 4
  %49 = load i32, i32* %ny.addr, align 4
  %mul54 = mul nsw i32 %48, %49
  %add55 = add nsw i32 %47, %mul54
  br label %cond.end56

cond.end56:                                       ; preds = %cond.false53, %cond.true52
  %cond57 = phi i32 [ %46, %cond.true52 ], [ %add55, %cond.false53 ]
  store i32 %cond57, i32* %t, align 4
  ; Accumulate the weighted sum, in this order: centre, n, s, e, w, t, b.
  ; tIn[c] * cc
  %50 = load float*, float** %tIn.addr, align 8
  %51 = load i32, i32* %c, align 4
  %idxprom = sext i32 %51 to i64
  %arrayidx = getelementptr inbounds float, float* %50, i64 %idxprom
  %52 = load float, float* %arrayidx, align 4
  %53 = load float, float* %cc, align 4
  %mul58 = fmul contract float %52, %53
  ; + tIn[n] * cn
  %54 = load float*, float** %tIn.addr, align 8
  %55 = load i32, i32* %n, align 4
  %idxprom59 = sext i32 %55 to i64
  %arrayidx60 = getelementptr inbounds float, float* %54, i64 %idxprom59
  %56 = load float, float* %arrayidx60, align 4
  %57 = load float, float* %cn, align 4
  %mul61 = fmul contract float %56, %57
  %add62 = fadd contract float %mul58, %mul61
  ; + tIn[s] * cs
  %58 = load float*, float** %tIn.addr, align 8
  %59 = load i32, i32* %s, align 4
  %idxprom63 = sext i32 %59 to i64
  %arrayidx64 = getelementptr inbounds float, float* %58, i64 %idxprom63
  %60 = load float, float* %arrayidx64, align 4
  %61 = load float, float* %cs, align 4
  %mul65 = fmul contract float %60, %61
  %add66 = fadd contract float %add62, %mul65
  ; + tIn[e] * ce
  %62 = load float*, float** %tIn.addr, align 8
  %63 = load i32, i32* %e, align 4
  %idxprom67 = sext i32 %63 to i64
  %arrayidx68 = getelementptr inbounds float, float* %62, i64 %idxprom67
  %64 = load float, float* %arrayidx68, align 4
  %65 = load float, float* %ce, align 4
  %mul69 = fmul contract float %64, %65
  %add70 = fadd contract float %add66, %mul69
  ; + tIn[w] * cw
  %66 = load float*, float** %tIn.addr, align 8
  %67 = load i32, i32* %w, align 4
  %idxprom71 = sext i32 %67 to i64
  %arrayidx72 = getelementptr inbounds float, float* %66, i64 %idxprom71
  %68 = load float, float* %arrayidx72, align 4
  %69 = load float, float* %cw, align 4
  %mul73 = fmul contract float %68, %69
  %add74 = fadd contract float %add70, %mul73
  ; + tIn[t] * ct
  %70 = load float*, float** %tIn.addr, align 8
  %71 = load i32, i32* %t, align 4
  %idxprom75 = sext i32 %71 to i64
  %arrayidx76 = getelementptr inbounds float, float* %70, i64 %idxprom75
  %72 = load float, float* %arrayidx76, align 4
  %73 = load float, float* %ct, align 4
  %mul77 = fmul contract float %72, %73
  %add78 = fadd contract float %add74, %mul77
  ; + tIn[b] * cb
  %74 = load float*, float** %tIn.addr, align 8
  %75 = load i32, i32* %b, align 4
  %idxprom79 = sext i32 %75 to i64
  %arrayidx80 = getelementptr inbounds float, float* %74, i64 %idxprom79
  %76 = load float, float* %arrayidx80, align 4
  %77 = load float, float* %cb, align 4
  %mul81 = fmul contract float %76, %77
  %add82 = fadd contract float %add78, %mul81
  ; + (dt / Cap) * pIn[c]   (dt/Cap is recomputed here rather than reusing stepDivCap)
  %78 = load float, float* %dt.addr, align 4
  %79 = load float, float* %Cap.addr, align 4
  %div83 = fdiv float %78, %79
  %80 = load float*, float** %pIn.addr, align 8
  %81 = load i32, i32* %c, align 4
  %idxprom84 = sext i32 %81 to i64
  %arrayidx85 = getelementptr inbounds float, float* %80, i64 %idxprom84
  %82 = load float, float* %arrayidx85, align 4
  %mul86 = fmul contract float %div83, %82
  %add87 = fadd contract float %add82, %mul86
  ; + ct * amb_temp   (amb_temp is the module-level global)
  %83 = load float, float* %ct, align 4
  %84 = load float, float* @amb_temp, align 4
  %mul88 = fmul contract float %83, %84
  %add89 = fadd contract float %add87, %mul88
  ; tOut[c] = accumulated value
  %85 = load float*, float** %tOut.addr, align 8
  %86 = load i32, i32* %c, align 4
  %idxprom90 = sext i32 %86 to i64
  %arrayidx91 = getelementptr inbounds float, float* %85, i64 %idxprom90
  store float %add89, float* %arrayidx91, align 4
  br label %for.inc

; x++
for.inc:                                          ; preds = %cond.end56
  %87 = load i32, i32* %x, align 4
  %inc = add nsw i32 %87, 1
  store i32 %inc, i32* %x, align 4
  br label %for.cond13

for.end:                                          ; preds = %for.cond13
  br label %for.inc92

; y++
for.inc92:                                        ; preds = %for.end
  %88 = load i32, i32* %y, align 4
  %inc93 = add nsw i32 %88, 1
  store i32 %inc93, i32* %y, align 4
  br label %for.cond10

for.end94:                                        ; preds = %for.cond10
  br label %for.inc95

; z++
for.inc95:                                        ; preds = %for.end94
  %89 = load i32, i32* %z, align 4
  %inc96 = add nsw i32 %89, 1
  store i32 %inc96, i32* %z, align 4
  br label %for.cond

; Sweep finished: swap the local tIn/tOut pointers through %temp so the next
; sweep reads what this one wrote, then bump the iteration counter.
for.end97:                                        ; preds = %for.cond
  %90 = load float*, float** %tIn.addr, align 8
  store float* %90, float** %temp, align 8
  %91 = load float*, float** %tOut.addr, align 8
  store float* %91, float** %tIn.addr, align 8
  %92 = load float*, float** %temp, align 8
  store float* %92, float** %tOut.addr, align 8
  %93 = load i32, i32* %i, align 4
  %inc98 = add nsw i32 %93, 1
  store i32 %inc98, i32* %i, align 4
  br label %do.cond

; Repeat while i < numiter (tested after the body, so at least one sweep runs).
do.cond:                                          ; preds = %for.end97
  %94 = load i32, i32* %i, align 4
  %95 = load i32, i32* %numiter.addr, align 4
  %cmp99 = icmp slt i32 %94, %95
  br i1 %cmp99, label %do.body, label %do.end

do.end:                                           ; preds = %do.cond
  ret void
}

; accuracy(float* arr1, float* arr2, i32 len) -> float
;
; Sums (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]) over i in [0, len), divides the
; sum by len converted to float, and returns the result of @_ZSt4sqrtf on that
; quotient.
; Function Attrs: noinline optnone uwtable
define dso_local float @_Z8accuracyPfS_i(float* %arr1, float* %arr2, i32 %len) #2 {
entry:
  %lhs.slot = alloca float*, align 8
  %rhs.slot = alloca float*, align 8
  %count.slot = alloca i32, align 4
  %sumsq.slot = alloca float, align 4
  %pos.slot = alloca i32, align 4
  store float* %arr1, float** %lhs.slot, align 8
  store float* %arr2, float** %rhs.slot, align 8
  store i32 %len, i32* %count.slot, align 4
  ; Running sum of squared differences starts at zero.
  store float 0.000000e+00, float* %sumsq.slot, align 4
  store i32 0, i32* %pos.slot, align 4
  br label %loop.test

; Continue while pos < len.
loop.test:                                        ; preds = %loop.step, %entry
  %pos.now = load i32, i32* %pos.slot, align 4
  %count.now = load i32, i32* %count.slot, align 4
  %has.more = icmp slt i32 %pos.now, %count.now
  br i1 %has.more, label %loop.body, label %loop.done

loop.body:                                        ; preds = %loop.test
  ; First factor: arr1[pos] - arr2[pos].
  %lhs.base.a = load float*, float** %lhs.slot, align 8
  %pos.a = load i32, i32* %pos.slot, align 4
  %pos.a.wide = sext i32 %pos.a to i64
  %lhs.elem.a = getelementptr inbounds float, float* %lhs.base.a, i64 %pos.a.wide
  %lhs.val.a = load float, float* %lhs.elem.a, align 4
  %rhs.base.a = load float*, float** %rhs.slot, align 8
  %pos.b = load i32, i32* %pos.slot, align 4
  %pos.b.wide = sext i32 %pos.b to i64
  %rhs.elem.a = getelementptr inbounds float, float* %rhs.base.a, i64 %pos.b.wide
  %rhs.val.a = load float, float* %rhs.elem.a, align 4
  %delta.a = fsub contract float %lhs.val.a, %rhs.val.a
  ; Second factor: the same difference, re-read from memory.
  %lhs.base.b = load float*, float** %lhs.slot, align 8
  %pos.c = load i32, i32* %pos.slot, align 4
  %pos.c.wide = sext i32 %pos.c to i64
  %lhs.elem.b = getelementptr inbounds float, float* %lhs.base.b, i64 %pos.c.wide
  %lhs.val.b = load float, float* %lhs.elem.b, align 4
  %rhs.base.b = load float*, float** %rhs.slot, align 8
  %pos.d = load i32, i32* %pos.slot, align 4
  %pos.d.wide = sext i32 %pos.d to i64
  %rhs.elem.b = getelementptr inbounds float, float* %rhs.base.b, i64 %pos.d.wide
  %rhs.val.b = load float, float* %rhs.elem.b, align 4
  %delta.b = fsub contract float %lhs.val.b, %rhs.val.b
  ; sumsq += delta * delta
  %delta.sq = fmul contract float %delta.a, %delta.b
  %sumsq.old = load float, float* %sumsq.slot, align 4
  %sumsq.new = fadd contract float %sumsq.old, %delta.sq
  store float %sumsq.new, float* %sumsq.slot, align 4
  br label %loop.step

; pos++
loop.step:                                        ; preds = %loop.body
  %pos.prev = load i32, i32* %pos.slot, align 4
  %pos.next = add nsw i32 %pos.prev, 1
  store i32 %pos.next, i32* %pos.slot, align 4
  br label %loop.test

; Result = sqrtf(sumsq / (float)len).
loop.done:                                        ; preds = %loop.test
  %sumsq.final = load float, float* %sumsq.slot, align 4
  %count.final = load i32, i32* %count.slot, align 4
  %count.fp = sitofp i32 %count.final to float
  %mean.sq = fdiv float %sumsq.final, %count.fp
  %rms = call float @_ZSt4sqrtf(float %mean.sq)
  ret float %rms
}

; std::sqrt(float) overload: spills the argument to a stack slot, reloads it and
; forwards it to the external @sqrtf, returning that call's result unchanged.
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local float @_ZSt4sqrtf(float %__x) #0 comdat {
entry:
  %arg.slot = alloca float, align 4
  store float %__x, float* %arg.slot, align 4
  %arg.val = load float, float* %arg.slot, align 4
  %root = call float @sqrtf(float %arg.val) #8
  ret float %root
}

; usage(i32 argc, i8** argv)
;
; Writes seven messages to stderr with fprintf: format @.str.11 with argv[0] as
; its single argument, then @.str.12 through @.str.17 with no extra arguments.
; Finishes by calling exit(1); control never reaches a return.
; Function Attrs: noinline optnone uwtable
define dso_local void @_Z5usageiPPc(i32 %argc, i8** %argv) #2 {
entry:
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  ; Line 1: usage synopsis, parameterised by the program name argv[0].
  %err.a = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %args = load i8**, i8*** %argv.slot, align 8
  %prog.ptr = getelementptr inbounds i8*, i8** %args, i64 0
  %prog = load i8*, i8** %prog.ptr, align 8
  %rc.a = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.a, i8* getelementptr inbounds ([81 x i8], [81 x i8]* @.str.11, i64 0, i64 0), i8* %prog)
  ; Lines 2-7: fixed strings; stderr is reloaded before every call.
  %err.b = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.b = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.b, i8* getelementptr inbounds ([68 x i8], [68 x i8]* @.str.12, i64 0, i64 0))
  %err.c = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.c = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.c, i8* getelementptr inbounds ([62 x i8], [62 x i8]* @.str.13, i64 0, i64 0))
  %err.d = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.d = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.d, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.14, i64 0, i64 0))
  %err.e = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.e = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.e, i8* getelementptr inbounds ([83 x i8], [83 x i8]* @.str.15, i64 0, i64 0))
  %err.f = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.f = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.f, i8* getelementptr inbounds ([88 x i8], [88 x i8]* @.str.16, i64 0, i64 0))
  %err.g = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %rc.g = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.g, i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.17, i64 0, i64 0))
  ; Terminate the process with status 1.
  call void @exit(i32 1) #9
  unreachable
}

; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #5

; main(i32 argc, i8** argv) -> i32
;
; Program entry point. Steps, in order:
;   1. cudaSetDevice(0).
;   2. If argc != 7, call usage() (which ends in exit(1)).
;   3. Parse arguments: numCols = numRows = atoi(argv[1]), layers = atoi(argv[2]),
;      iterations = atoi(argv[3]); argv[4], argv[5], argv[6] are the power,
;      temperature and output file names.
;   4. Derive dx, dy, dz, Cap, Rx, Ry, Rz, max_slope and dt from the globals
;      @chip_height, @chip_width, @t_chip and the grid dimensions.
;   5. Allocate five buffers of size = numCols * numRows * layers floats:
;      powerIn, tempIn, tempOut, answer (calloc) and tempCopy (malloc).
;   6. Fill powerIn and tempIn via readinput, then memcpy tempIn into tempCopy.
;   7. Run hotspot_opt1 on (powerIn, tempIn, tempOut) and computeTempCPU on
;      (powerIn, tempCopy, answer) with identical parameters.
;   8. Print accuracy(tempOut, answer, size) using format @.str.18 and write
;      tempOut with writeoutput.
;   9. Free every buffer and return 0. tempCopy and answer were previously
;      never released; they are now freed alongside the other three.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #6 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %pfile = alloca i8*, align 8
  %tfile = alloca i8*, align 8
  %ofile = alloca i8*, align 8
  %iterations = alloca i32, align 4
  %numCols = alloca i32, align 4
  %numRows = alloca i32, align 4
  %layers = alloca i32, align 4
  %dx = alloca float, align 4
  %dy = alloca float, align 4
  %dz = alloca float, align 4
  %Cap = alloca float, align 4
  %Rx = alloca float, align 4
  %Ry = alloca float, align 4
  %Rz = alloca float, align 4
  %max_slope = alloca float, align 4
  %dt = alloca float, align 4
  %powerIn = alloca float*, align 8
  %tempOut = alloca float*, align 8
  %tempIn = alloca float*, align 8
  %tempCopy = alloca float*, align 8
  %size = alloca i32, align 4
  %answer = alloca float*, align 8
  %acc = alloca float, align 4
  store i32 0, i32* %retval, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  %call = call i32 @cudaSetDevice(i32 0)
  ; Exactly six user arguments are required (argc == 7).
  %0 = load i32, i32* %argc.addr, align 4
  %cmp = icmp ne i32 %0, 7
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, i32* %argc.addr, align 4
  %2 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5usageiPPc(i32 %1, i8** %2)
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ; iterations = atoi(argv[3])
  %3 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %3, i64 3
  %4 = load i8*, i8** %arrayidx, align 8
  %call1 = call i32 @atoi(i8* %4) #10
  store i32 %call1, i32* %iterations, align 4
  ; pfile = argv[4], tfile = argv[5], ofile = argv[6]
  %5 = load i8**, i8*** %argv.addr, align 8
  %arrayidx2 = getelementptr inbounds i8*, i8** %5, i64 4
  %6 = load i8*, i8** %arrayidx2, align 8
  store i8* %6, i8** %pfile, align 8
  %7 = load i8**, i8*** %argv.addr, align 8
  %arrayidx3 = getelementptr inbounds i8*, i8** %7, i64 5
  %8 = load i8*, i8** %arrayidx3, align 8
  store i8* %8, i8** %tfile, align 8
  %9 = load i8**, i8*** %argv.addr, align 8
  %arrayidx4 = getelementptr inbounds i8*, i8** %9, i64 6
  %10 = load i8*, i8** %arrayidx4, align 8
  store i8* %10, i8** %ofile, align 8
  ; numCols and numRows are both parsed from argv[1].
  %11 = load i8**, i8*** %argv.addr, align 8
  %arrayidx5 = getelementptr inbounds i8*, i8** %11, i64 1
  %12 = load i8*, i8** %arrayidx5, align 8
  %call6 = call i32 @atoi(i8* %12) #10
  store i32 %call6, i32* %numCols, align 4
  %13 = load i8**, i8*** %argv.addr, align 8
  %arrayidx7 = getelementptr inbounds i8*, i8** %13, i64 1
  %14 = load i8*, i8** %arrayidx7, align 8
  %call8 = call i32 @atoi(i8* %14) #10
  store i32 %call8, i32* %numRows, align 4
  ; layers = atoi(argv[2])
  %15 = load i8**, i8*** %argv.addr, align 8
  %arrayidx9 = getelementptr inbounds i8*, i8** %15, i64 2
  %16 = load i8*, i8** %arrayidx9, align 8
  %call10 = call i32 @atoi(i8* %16) #10
  store i32 %call10, i32* %layers, align 4
  ; dx = chip_height / numRows
  %17 = load float, float* @chip_height, align 4
  %18 = load i32, i32* %numRows, align 4
  %conv = sitofp i32 %18 to float
  %div = fdiv float %17, %conv
  store float %div, float* %dx, align 4
  ; dy = chip_width / numCols
  %19 = load float, float* @chip_width, align 4
  %20 = load i32, i32* %numCols, align 4
  %conv11 = sitofp i32 %20 to float
  %div12 = fdiv float %19, %conv11
  store float %div12, float* %dy, align 4
  ; dz = t_chip / layers
  %21 = load float, float* @t_chip, align 4
  %22 = load i32, i32* %layers, align 4
  %conv13 = sitofp i32 %22 to float
  %div14 = fdiv float %21, %conv13
  store float %div14, float* %dz, align 4
  ; Cap = (float)(8.75e5 * t_chip * dx * dy), evaluated in double.
  %23 = load float, float* @t_chip, align 4
  %conv15 = fpext float %23 to double
  %mul = fmul contract double 8.750000e+05, %conv15
  %24 = load float, float* %dx, align 4
  %conv16 = fpext float %24 to double
  %mul17 = fmul contract double %mul, %conv16
  %25 = load float, float* %dy, align 4
  %conv18 = fpext float %25 to double
  %mul19 = fmul contract double %mul17, %conv18
  %conv20 = fptrunc double %mul19 to float
  store float %conv20, float* %Cap, align 4
  ; Rx = (float)(dy / (200.0 * t_chip * dx))
  %26 = load float, float* %dy, align 4
  %conv21 = fpext float %26 to double
  %27 = load float, float* @t_chip, align 4
  %conv22 = fpext float %27 to double
  %mul23 = fmul contract double 2.000000e+02, %conv22
  %28 = load float, float* %dx, align 4
  %conv24 = fpext float %28 to double
  %mul25 = fmul contract double %mul23, %conv24
  %div26 = fdiv double %conv21, %mul25
  %conv27 = fptrunc double %div26 to float
  store float %conv27, float* %Rx, align 4
  ; Ry = (float)(dx / (200.0 * t_chip * dy))
  %29 = load float, float* %dx, align 4
  %conv28 = fpext float %29 to double
  %30 = load float, float* @t_chip, align 4
  %conv29 = fpext float %30 to double
  %mul30 = fmul contract double 2.000000e+02, %conv29
  %31 = load float, float* %dy, align 4
  %conv31 = fpext float %31 to double
  %mul32 = fmul contract double %mul30, %conv31
  %div33 = fdiv double %conv28, %mul32
  %conv34 = fptrunc double %div33 to float
  store float %conv34, float* %Ry, align 4
  ; Rz = dz / (100.0f * dx * dy), evaluated in single precision.
  %32 = load float, float* %dz, align 4
  %33 = load float, float* %dx, align 4
  %mul35 = fmul contract float 1.000000e+02, %33
  %34 = load float, float* %dy, align 4
  %mul36 = fmul contract float %mul35, %34
  %div37 = fdiv float %32, %mul36
  store float %div37, float* %Rz, align 4
  ; max_slope = (float)(3.0e6 / (0.5 * t_chip * 1.75e6))
  %35 = load float, float* @t_chip, align 4
  %conv38 = fpext float %35 to double
  %mul39 = fmul contract double 5.000000e-01, %conv38
  %mul40 = fmul contract double %mul39, 1.750000e+06
  %div41 = fdiv double 3.000000e+06, %mul40
  %conv42 = fptrunc double %div41 to float
  store float %conv42, float* %max_slope, align 4
  ; dt = (float)(1.0e-3 / max_slope)
  %36 = load float, float* %max_slope, align 4
  %conv43 = fpext float %36 to double
  %div44 = fdiv double 1.000000e-03, %conv43
  %conv45 = fptrunc double %div44 to float
  store float %conv45, float* %dt, align 4
  ; size = numCols * numRows * layers (element count of every buffer)
  %37 = load i32, i32* %numCols, align 4
  %38 = load i32, i32* %numRows, align 4
  %mul46 = mul nsw i32 %37, %38
  %39 = load i32, i32* %layers, align 4
  %mul47 = mul nsw i32 %mul46, %39
  store i32 %mul47, i32* %size, align 4
  ; powerIn = calloc(size, 4)
  %40 = load i32, i32* %size, align 4
  %conv48 = sext i32 %40 to i64
  %call49 = call noalias i8* @calloc(i64 %conv48, i64 4) #8
  %41 = bitcast i8* %call49 to float*
  store float* %41, float** %powerIn, align 8
  ; tempCopy = malloc(size * 4)
  %42 = load i32, i32* %size, align 4
  %conv50 = sext i32 %42 to i64
  %mul51 = mul i64 %conv50, 4
  %call52 = call noalias i8* @malloc(i64 %mul51) #8
  %43 = bitcast i8* %call52 to float*
  store float* %43, float** %tempCopy, align 8
  ; tempIn = calloc(size, 4)
  %44 = load i32, i32* %size, align 4
  %conv53 = sext i32 %44 to i64
  %call54 = call noalias i8* @calloc(i64 %conv53, i64 4) #8
  %45 = bitcast i8* %call54 to float*
  store float* %45, float** %tempIn, align 8
  ; tempOut = calloc(size, 4)
  %46 = load i32, i32* %size, align 4
  %conv55 = sext i32 %46 to i64
  %call56 = call noalias i8* @calloc(i64 %conv55, i64 4) #8
  %47 = bitcast i8* %call56 to float*
  store float* %47, float** %tempOut, align 8
  ; answer = calloc(size, 4)
  %48 = load i32, i32* %size, align 4
  %conv57 = sext i32 %48 to i64
  %call58 = call noalias i8* @calloc(i64 %conv57, i64 4) #8
  %49 = bitcast i8* %call58 to float*
  store float* %49, float** %answer, align 8
  ; readinput(powerIn, numRows, numCols, layers, pfile)
  %50 = load float*, float** %powerIn, align 8
  %51 = load i32, i32* %numRows, align 4
  %52 = load i32, i32* %numCols, align 4
  %53 = load i32, i32* %layers, align 4
  %54 = load i8*, i8** %pfile, align 8
  call void @_Z9readinputPfiiiPc(float* %50, i32 %51, i32 %52, i32 %53, i8* %54)
  ; readinput(tempIn, numRows, numCols, layers, tfile)
  %55 = load float*, float** %tempIn, align 8
  %56 = load i32, i32* %numRows, align 4
  %57 = load i32, i32* %numCols, align 4
  %58 = load i32, i32* %layers, align 4
  %59 = load i8*, i8** %tfile, align 8
  call void @_Z9readinputPfiiiPc(float* %55, i32 %56, i32 %57, i32 %58, i8* %59)
  ; memcpy(tempCopy, tempIn, size * 4): keeps an untouched copy of the initial
  ; temperatures as input for the CPU reference run below.
  %60 = load float*, float** %tempCopy, align 8
  %61 = bitcast float* %60 to i8*
  %62 = load float*, float** %tempIn, align 8
  %63 = bitcast float* %62 to i8*
  %64 = load i32, i32* %size, align 4
  %conv59 = sext i32 %64 to i64
  %mul60 = mul i64 %conv59, 4
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %61, i8* align 4 %63, i64 %mul60, i1 false)
  ; hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers,
  ;              Cap, Rx, Ry, Rz, dt, iterations)
  %65 = load float*, float** %powerIn, align 8
  %66 = load float*, float** %tempIn, align 8
  %67 = load float*, float** %tempOut, align 8
  %68 = load i32, i32* %numCols, align 4
  %69 = load i32, i32* %numRows, align 4
  %70 = load i32, i32* %layers, align 4
  %71 = load float, float* %Cap, align 4
  %72 = load float, float* %Rx, align 4
  %73 = load float, float* %Ry, align 4
  %74 = load float, float* %Rz, align 4
  %75 = load float, float* %dt, align 4
  %76 = load i32, i32* %iterations, align 4
  call void @_Z12hotspot_opt1PfS_S_iiifffffi(float* %65, float* %66, float* %67, i32 %68, i32 %69, i32 %70, float %71, float %72, float %73, float %74, float %75, i32 %76)
  ; computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers,
  ;                Cap, Rx, Ry, Rz, dt, iterations)
  %77 = load float*, float** %powerIn, align 8
  %78 = load float*, float** %tempCopy, align 8
  %79 = load float*, float** %answer, align 8
  %80 = load i32, i32* %numCols, align 4
  %81 = load i32, i32* %numRows, align 4
  %82 = load i32, i32* %layers, align 4
  %83 = load float, float* %Cap, align 4
  %84 = load float, float* %Rx, align 4
  %85 = load float, float* %Ry, align 4
  %86 = load float, float* %Rz, align 4
  %87 = load float, float* %dt, align 4
  %88 = load i32, i32* %iterations, align 4
  call void @_Z14computeTempCPUPfS_S_iiifffffi(float* %77, float* %78, float* %79, i32 %80, i32 %81, i32 %82, float %83, float %84, float %85, float %86, float %87, i32 %88)
  ; acc = accuracy(tempOut, answer, numRows * numCols * layers)
  %89 = load float*, float** %tempOut, align 8
  %90 = load float*, float** %answer, align 8
  %91 = load i32, i32* %numRows, align 4
  %92 = load i32, i32* %numCols, align 4
  %mul61 = mul nsw i32 %91, %92
  %93 = load i32, i32* %layers, align 4
  %mul62 = mul nsw i32 %mul61, %93
  %call63 = call float @_Z8accuracyPfS_i(float* %89, float* %90, i32 %mul62)
  store float %call63, float* %acc, align 4
  ; Print acc (widened to double for the variadic call) with format @.str.18.
  %94 = load float, float* %acc, align 4
  %conv64 = fpext float %94 to double
  %call65 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.18, i64 0, i64 0), double %conv64)
  ; writeoutput(tempOut, numRows, numCols, layers, ofile)
  %95 = load float*, float** %tempOut, align 8
  %96 = load i32, i32* %numRows, align 4
  %97 = load i32, i32* %numCols, align 4
  %98 = load i32, i32* %layers, align 4
  %99 = load i8*, i8** %ofile, align 8
  call void @_Z11writeoutputPfiiiPc(float* %95, i32 %96, i32 %97, i32 %98, i8* %99)
  ; Release all five heap buffers.
  %100 = load float*, float** %tempIn, align 8
  %101 = bitcast float* %100 to i8*
  call void @free(i8* %101) #8
  %102 = load float*, float** %tempOut, align 8
  %103 = bitcast float* %102 to i8*
  call void @free(i8* %103) #8
  %104 = load float*, float** %powerIn, align 8
  %105 = bitcast float* %104 to i8*
  call void @free(i8* %105) #8
  ; tempCopy and answer are only handed (by value) to computeTempCPU and
  ; accuracy above, so main still owns them here and must release them.
  %106 = load float*, float** %tempCopy, align 8
  %107 = bitcast float* %106 to i8*
  call void @free(i8* %107) #8
  %108 = load float*, float** %answer, align 8
  %109 = bitcast float* %108 to i8*
  call void @free(i8* %109) #8
  ret i32 0
}

declare dso_local i32 @cudaSetDevice(i32) #4

; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #7

; Function Attrs: nounwind
declare dso_local noalias i8* @calloc(i64, i64) #1

; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #1

; Function Attrs: nounwind
declare dso_local void @free(i8*) #1

; Function Attrs: nounwind
declare dso_local float @sqrtf(float) #1

; Registers the function @_Z11hotspotOpt1PfS_S_fiiifffffff through
; @__cudaRegisterFunction.
;   %fatbin_handle - forwarded unchanged as the first argument of the
;                    registration call.
; The i32 returned by the registration call is not inspected.
define internal void @__cuda_register_globals(i8** %fatbin_handle) {
register:
  ; The 33-byte string @0 is supplied for both i8* name arguments; the
  ; remaining arguments are the constant -1 and null pointers.
  %reg_status = call i32 @__cudaRegisterFunction(i8** %fatbin_handle, i8* bitcast (void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff to i8*), i8* getelementptr inbounds ([33 x i8], [33 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([33 x i8], [33 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; Module set-up routine: registers the fat binary, records the returned
; handle, registers the module's globals, and schedules the matching
; tear-down routine with atexit.
;   %unused - never read inside this function.
define internal void @__cuda_module_ctor(i8* %unused) {
setup:
  ; Hand @__cuda_fatbin_wrapper to the runtime and keep the handle it returns.
  %fatbin_handle = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  ; Saved in @__cuda_gpubin_handle so @__cuda_module_dtor can load it later.
  store i8** %fatbin_handle, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %fatbin_handle)
  call void @__cudaRegisterFatBinaryEnd(i8** %fatbin_handle)
  ; The i32 result of atexit is ignored.
  %atexit_status = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

declare dso_local void @__cudaUnregisterFatBinary(i8**)

; Module tear-down routine: loads the handle stored in
; @__cuda_gpubin_handle and passes it to @__cudaUnregisterFatBinary.
;   %unused - never read inside this function.
define internal void @__cuda_module_dtor(i8* %unused) {
teardown:
  %fatbin_handle = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %fatbin_handle)
  ret void
}

declare dso_local i32 @atexit(void (i8*)*)

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { argmemonly nounwind willreturn }
attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { nounwind }
attributes #9 = { noreturn nounwind }
attributes #10 = { nounwind readonly }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}