CuPBoP/examples/hotspot/hotspot-cuda-nvptx64-nvidia...

; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "hotspot.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }

$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any

$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any

$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any

$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any

@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
  %iteration.addr = alloca i32, align 4
  %power.addr = alloca float*, align 8
  %temp_src.addr = alloca float*, align 8
  %temp_dst.addr = alloca float*, align 8
  %grid_cols.addr = alloca i32, align 4
  %grid_rows.addr = alloca i32, align 4
  %border_cols.addr = alloca i32, align 4
  %border_rows.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %step.addr = alloca float, align 4
  %time_elapsed.addr = alloca float, align 4
  %amb_temp = alloca float, align 4
  %step_div_Cap = alloca float, align 4
  %Rx_1 = alloca float, align 4
  %Ry_1 = alloca float, align 4
  %Rz_1 = alloca float, align 4
  %bx = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %small_block_rows = alloca i32, align 4
  %small_block_cols = alloca i32, align 4
  %blkY = alloca i32, align 4
  %blkX = alloca i32, align 4
  %blkYmax = alloca i32, align 4
  %blkXmax = alloca i32, align 4
  %yidx = alloca i32, align 4
  %xidx = alloca i32, align 4
  %loadYidx = alloca i32, align 4
  %loadXidx = alloca i32, align 4
  %index = alloca i32, align 4
  %validYmin = alloca i32, align 4
  %validYmax = alloca i32, align 4
  %validXmin = alloca i32, align 4
  %validXmax = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %computed = alloca i8, align 1
  %i = alloca i32, align 4
  store i32 %iteration, i32* %iteration.addr, align 4
  store float* %power, float** %power.addr, align 8
  store float* %temp_src, float** %temp_src.addr, align 8
  store float* %temp_dst, float** %temp_dst.addr, align 8
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %border_cols, i32* %border_cols.addr, align 4
  store i32 %border_rows, i32* %border_rows.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %step, float* %step.addr, align 4
  store float %time_elapsed, float* %time_elapsed.addr, align 4
  store float 8.000000e+01, float* %amb_temp, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %bx, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call1, i32* %by, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call2, i32* %tx, align 4
  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call3, i32* %ty, align 4
  %0 = load float, float* %step.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %step_div_Cap, align 4
  %2 = load float, float* %Rx.addr, align 4
  %div4 = fdiv float 1.000000e+00, %2
  store float %div4, float* %Rx_1, align 4
  %3 = load float, float* %Ry.addr, align 4
  %div5 = fdiv float 1.000000e+00, %3
  store float %div5, float* %Ry_1, align 4
  %4 = load float, float* %Rz.addr, align 4
  %div6 = fdiv float 1.000000e+00, %4
  store float %div6, float* %Rz_1, align 4
  %5 = load i32, i32* %iteration.addr, align 4
  %mul = mul nsw i32 %5, 2
  %sub = sub nsw i32 16, %mul
  store i32 %sub, i32* %small_block_rows, align 4
  %6 = load i32, i32* %iteration.addr, align 4
  %mul7 = mul nsw i32 %6, 2
  %sub8 = sub nsw i32 16, %mul7
  store i32 %sub8, i32* %small_block_cols, align 4
  %7 = load i32, i32* %small_block_rows, align 4
  %8 = load i32, i32* %by, align 4
  %mul9 = mul nsw i32 %7, %8
  %9 = load i32, i32* %border_rows.addr, align 4
  %sub10 = sub nsw i32 %mul9, %9
  store i32 %sub10, i32* %blkY, align 4
  %10 = load i32, i32* %small_block_cols, align 4
  %11 = load i32, i32* %bx, align 4
  %mul11 = mul nsw i32 %10, %11
  %12 = load i32, i32* %border_cols.addr, align 4
  %sub12 = sub nsw i32 %mul11, %12
  store i32 %sub12, i32* %blkX, align 4
  %13 = load i32, i32* %blkY, align 4
  %add = add nsw i32 %13, 16
  %sub13 = sub nsw i32 %add, 1
  store i32 %sub13, i32* %blkYmax, align 4
  %14 = load i32, i32* %blkX, align 4
  %add14 = add nsw i32 %14, 16
  %sub15 = sub nsw i32 %add14, 1
  store i32 %sub15, i32* %blkXmax, align 4
  %15 = load i32, i32* %blkY, align 4
  %16 = load i32, i32* %ty, align 4
  %add16 = add nsw i32 %15, %16
  store i32 %add16, i32* %yidx, align 4
  %17 = load i32, i32* %blkX, align 4
  %18 = load i32, i32* %tx, align 4
  %add17 = add nsw i32 %17, %18
  store i32 %add17, i32* %xidx, align 4
  %19 = load i32, i32* %yidx, align 4
  store i32 %19, i32* %loadYidx, align 4
  %20 = load i32, i32* %xidx, align 4
  store i32 %20, i32* %loadXidx, align 4
  %21 = load i32, i32* %grid_cols.addr, align 4
  %22 = load i32, i32* %loadYidx, align 4
  %mul18 = mul nsw i32 %21, %22
  %23 = load i32, i32* %loadXidx, align 4
  %add19 = add nsw i32 %mul18, %23
  store i32 %add19, i32* %index, align 4
  %24 = load i32, i32* %loadYidx, align 4
  %cmp = icmp sge i32 %24, 0
  br i1 %cmp, label %land.lhs.true, label %if.end

land.lhs.true:                                    ; preds = %entry
  %25 = load i32, i32* %loadYidx, align 4
  %26 = load i32, i32* %grid_rows.addr, align 4
  %sub20 = sub nsw i32 %26, 1
  %cmp21 = icmp sle i32 %25, %sub20
  br i1 %cmp21, label %land.lhs.true22, label %if.end

land.lhs.true22:                                  ; preds = %land.lhs.true
  %27 = load i32, i32* %loadXidx, align 4
  %cmp23 = icmp sge i32 %27, 0
  br i1 %cmp23, label %land.lhs.true24, label %if.end

land.lhs.true24:                                  ; preds = %land.lhs.true22
  %28 = load i32, i32* %loadXidx, align 4
  %29 = load i32, i32* %grid_cols.addr, align 4
  %sub25 = sub nsw i32 %29, 1
  %cmp26 = icmp sle i32 %28, %sub25
  br i1 %cmp26, label %if.then, label %if.end

if.then:                                          ; preds = %land.lhs.true24
  %30 = load float*, float** %temp_src.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom = sext i32 %31 to i64
  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
  %32 = load float, float* %arrayidx, align 4
  %33 = load i32, i32* %ty, align 4
  %idxprom27 = sext i32 %33 to i64
  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
  %34 = load i32, i32* %tx, align 4
  %idxprom29 = sext i32 %34 to i64
  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
  store float %32, float* %arrayidx30, align 4
  %35 = load float*, float** %power.addr, align 8
  %36 = load i32, i32* %index, align 4
  %idxprom31 = sext i32 %36 to i64
  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
  %37 = load float, float* %arrayidx32, align 4
  %38 = load i32, i32* %ty, align 4
  %idxprom33 = sext i32 %38 to i64
  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
  %39 = load i32, i32* %tx, align 4
  %idxprom35 = sext i32 %39 to i64
  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
  store float %37, float* %arrayidx36, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
  call void @llvm.nvvm.barrier0()
  %40 = load i32, i32* %blkY, align 4
  %cmp37 = icmp slt i32 %40, 0
  br i1 %cmp37, label %cond.true, label %cond.false

cond.true:                                        ; preds = %if.end
  %41 = load i32, i32* %blkY, align 4
  %sub38 = sub nsw i32 0, %41
  br label %cond.end

cond.false:                                       ; preds = %if.end
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
  store i32 %cond, i32* %validYmin, align 4
  %42 = load i32, i32* %blkYmax, align 4
  %43 = load i32, i32* %grid_rows.addr, align 4
  %sub39 = sub nsw i32 %43, 1
  %cmp40 = icmp sgt i32 %42, %sub39
  br i1 %cmp40, label %cond.true41, label %cond.false45

cond.true41:                                      ; preds = %cond.end
  %44 = load i32, i32* %blkYmax, align 4
  %45 = load i32, i32* %grid_rows.addr, align 4
  %sub42 = sub nsw i32 %44, %45
  %add43 = add nsw i32 %sub42, 1
  %sub44 = sub nsw i32 15, %add43
  br label %cond.end46

cond.false45:                                     ; preds = %cond.end
  br label %cond.end46

cond.end46:                                       ; preds = %cond.false45, %cond.true41
  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
  store i32 %cond47, i32* %validYmax, align 4
  %46 = load i32, i32* %blkX, align 4
  %cmp48 = icmp slt i32 %46, 0
  br i1 %cmp48, label %cond.true49, label %cond.false51

cond.true49:                                      ; preds = %cond.end46
  %47 = load i32, i32* %blkX, align 4
  %sub50 = sub nsw i32 0, %47
  br label %cond.end52

cond.false51:                                     ; preds = %cond.end46
  br label %cond.end52

cond.end52:                                       ; preds = %cond.false51, %cond.true49
  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
  store i32 %cond53, i32* %validXmin, align 4
  %48 = load i32, i32* %blkXmax, align 4
  %49 = load i32, i32* %grid_cols.addr, align 4
  %sub54 = sub nsw i32 %49, 1
  %cmp55 = icmp sgt i32 %48, %sub54
  br i1 %cmp55, label %cond.true56, label %cond.false60

cond.true56:                                      ; preds = %cond.end52
  %50 = load i32, i32* %blkXmax, align 4
  %51 = load i32, i32* %grid_cols.addr, align 4
  %sub57 = sub nsw i32 %50, %51
  %add58 = add nsw i32 %sub57, 1
  %sub59 = sub nsw i32 15, %add58
  br label %cond.end61

cond.false60:                                     ; preds = %cond.end52
  br label %cond.end61

cond.end61:                                       ; preds = %cond.false60, %cond.true56
  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
  store i32 %cond62, i32* %validXmax, align 4
  %52 = load i32, i32* %ty, align 4
  %sub63 = sub nsw i32 %52, 1
  store i32 %sub63, i32* %N, align 4
  %53 = load i32, i32* %ty, align 4
  %add64 = add nsw i32 %53, 1
  store i32 %add64, i32* %S, align 4
  %54 = load i32, i32* %tx, align 4
  %sub65 = sub nsw i32 %54, 1
  store i32 %sub65, i32* %W, align 4
  %55 = load i32, i32* %tx, align 4
  %add66 = add nsw i32 %55, 1
  store i32 %add66, i32* %E, align 4
  %56 = load i32, i32* %N, align 4
  %57 = load i32, i32* %validYmin, align 4
  %cmp67 = icmp slt i32 %56, %57
  br i1 %cmp67, label %cond.true68, label %cond.false69

cond.true68:                                      ; preds = %cond.end61
  %58 = load i32, i32* %validYmin, align 4
  br label %cond.end70

cond.false69:                                     ; preds = %cond.end61
  %59 = load i32, i32* %N, align 4
  br label %cond.end70

cond.end70:                                       ; preds = %cond.false69, %cond.true68
  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
  store i32 %cond71, i32* %N, align 4
  %60 = load i32, i32* %S, align 4
  %61 = load i32, i32* %validYmax, align 4
  %cmp72 = icmp sgt i32 %60, %61
  br i1 %cmp72, label %cond.true73, label %cond.false74

cond.true73:                                      ; preds = %cond.end70
  %62 = load i32, i32* %validYmax, align 4
  br label %cond.end75

cond.false74:                                     ; preds = %cond.end70
  %63 = load i32, i32* %S, align 4
  br label %cond.end75

cond.end75:                                       ; preds = %cond.false74, %cond.true73
  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
  store i32 %cond76, i32* %S, align 4
  %64 = load i32, i32* %W, align 4
  %65 = load i32, i32* %validXmin, align 4
  %cmp77 = icmp slt i32 %64, %65
  br i1 %cmp77, label %cond.true78, label %cond.false79

cond.true78:                                      ; preds = %cond.end75
  %66 = load i32, i32* %validXmin, align 4
  br label %cond.end80

cond.false79:                                     ; preds = %cond.end75
  %67 = load i32, i32* %W, align 4
  br label %cond.end80

cond.end80:                                       ; preds = %cond.false79, %cond.true78
  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
  store i32 %cond81, i32* %W, align 4
  %68 = load i32, i32* %E, align 4
  %69 = load i32, i32* %validXmax, align 4
  %cmp82 = icmp sgt i32 %68, %69
  br i1 %cmp82, label %cond.true83, label %cond.false84

cond.true83:                                      ; preds = %cond.end80
  %70 = load i32, i32* %validXmax, align 4
  br label %cond.end85

cond.false84:                                     ; preds = %cond.end80
  %71 = load i32, i32* %E, align 4
  br label %cond.end85

cond.end85:                                       ; preds = %cond.false84, %cond.true83
  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
  store i32 %cond86, i32* %E, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %cond.end85
  %72 = load i32, i32* %i, align 4
  %73 = load i32, i32* %iteration.addr, align 4
  %cmp87 = icmp slt i32 %72, %73
  br i1 %cmp87, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  store i8 0, i8* %computed, align 1
  %74 = load i32, i32* %tx, align 4
  %75 = load i32, i32* %i, align 4
  %add88 = add nsw i32 %75, 1
  %cmp89 = icmp sge i32 %74, %add88
  br i1 %cmp89, label %land.lhs.true90, label %if.end175

land.lhs.true90:                                  ; preds = %for.body
  %76 = load i32, i32* %tx, align 4
  %77 = load i32, i32* %i, align 4
  %sub91 = sub nsw i32 16, %77
  %sub92 = sub nsw i32 %sub91, 2
  %cmp93 = icmp sle i32 %76, %sub92
  br i1 %cmp93, label %land.lhs.true94, label %if.end175

land.lhs.true94:                                  ; preds = %land.lhs.true90
  %78 = load i32, i32* %ty, align 4
  %79 = load i32, i32* %i, align 4
  %add95 = add nsw i32 %79, 1
  %cmp96 = icmp sge i32 %78, %add95
  br i1 %cmp96, label %land.lhs.true97, label %if.end175

land.lhs.true97:                                  ; preds = %land.lhs.true94
  %80 = load i32, i32* %ty, align 4
  %81 = load i32, i32* %i, align 4
  %sub98 = sub nsw i32 16, %81
  %sub99 = sub nsw i32 %sub98, 2
  %cmp100 = icmp sle i32 %80, %sub99
  br i1 %cmp100, label %land.lhs.true101, label %if.end175

land.lhs.true101:                                 ; preds = %land.lhs.true97
  %82 = load i32, i32* %tx, align 4
  %83 = load i32, i32* %validXmin, align 4
  %cmp102 = icmp sge i32 %82, %83
  br i1 %cmp102, label %land.lhs.true103, label %if.end175

land.lhs.true103:                                 ; preds = %land.lhs.true101
  %84 = load i32, i32* %tx, align 4
  %85 = load i32, i32* %validXmax, align 4
  %cmp104 = icmp sle i32 %84, %85
  br i1 %cmp104, label %land.lhs.true105, label %if.end175

land.lhs.true105:                                 ; preds = %land.lhs.true103
  %86 = load i32, i32* %ty, align 4
  %87 = load i32, i32* %validYmin, align 4
  %cmp106 = icmp sge i32 %86, %87
  br i1 %cmp106, label %land.lhs.true107, label %if.end175

land.lhs.true107:                                 ; preds = %land.lhs.true105
  %88 = load i32, i32* %ty, align 4
  %89 = load i32, i32* %validYmax, align 4
  %cmp108 = icmp sle i32 %88, %89
  br i1 %cmp108, label %if.then109, label %if.end175

if.then109:                                       ; preds = %land.lhs.true107
  store i8 1, i8* %computed, align 1
  %90 = load i32, i32* %ty, align 4
  %idxprom110 = sext i32 %90 to i64
  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
  %91 = load i32, i32* %tx, align 4
  %idxprom112 = sext i32 %91 to i64
  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
  %92 = load float, float* %arrayidx113, align 4
  %conv = fpext float %92 to double
  %93 = load float, float* %step_div_Cap, align 4
  %conv114 = fpext float %93 to double
  %94 = load i32, i32* %ty, align 4
  %idxprom115 = sext i32 %94 to i64
  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
  %95 = load i32, i32* %tx, align 4
  %idxprom117 = sext i32 %95 to i64
  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
  %96 = load float, float* %arrayidx118, align 4
  %conv119 = fpext float %96 to double
  %97 = load i32, i32* %S, align 4
  %idxprom120 = sext i32 %97 to i64
  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
  %98 = load i32, i32* %tx, align 4
  %idxprom122 = sext i32 %98 to i64
  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
  %99 = load float, float* %arrayidx123, align 4
  %100 = load i32, i32* %N, align 4
  %idxprom124 = sext i32 %100 to i64
  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
  %101 = load i32, i32* %tx, align 4
  %idxprom126 = sext i32 %101 to i64
  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
  %102 = load float, float* %arrayidx127, align 4
  %add128 = fadd contract float %99, %102
  %conv129 = fpext float %add128 to double
  %103 = load i32, i32* %ty, align 4
  %idxprom130 = sext i32 %103 to i64
  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
  %104 = load i32, i32* %tx, align 4
  %idxprom132 = sext i32 %104 to i64
  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
  %105 = load float, float* %arrayidx133, align 4
  %conv134 = fpext float %105 to double
  %mul135 = fmul contract double 2.000000e+00, %conv134
  %sub136 = fsub contract double %conv129, %mul135
  %106 = load float, float* %Ry_1, align 4
  %conv137 = fpext float %106 to double
  %mul138 = fmul contract double %sub136, %conv137
  %add139 = fadd contract double %conv119, %mul138
  %107 = load i32, i32* %ty, align 4
  %idxprom140 = sext i32 %107 to i64
  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
  %108 = load i32, i32* %E, align 4
  %idxprom142 = sext i32 %108 to i64
  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
  %109 = load float, float* %arrayidx143, align 4
  %110 = load i32, i32* %ty, align 4
  %idxprom144 = sext i32 %110 to i64
  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
  %111 = load i32, i32* %W, align 4
  %idxprom146 = sext i32 %111 to i64
  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
  %112 = load float, float* %arrayidx147, align 4
  %add148 = fadd contract float %109, %112
  %conv149 = fpext float %add148 to double
  %113 = load i32, i32* %ty, align 4
  %idxprom150 = sext i32 %113 to i64
  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
  %114 = load i32, i32* %tx, align 4
  %idxprom152 = sext i32 %114 to i64
  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
  %115 = load float, float* %arrayidx153, align 4
  %conv154 = fpext float %115 to double
  %mul155 = fmul contract double 2.000000e+00, %conv154
  %sub156 = fsub contract double %conv149, %mul155
  %116 = load float, float* %Rx_1, align 4
  %conv157 = fpext float %116 to double
  %mul158 = fmul contract double %sub156, %conv157
  %add159 = fadd contract double %add139, %mul158
  %117 = load float, float* %amb_temp, align 4
  %118 = load i32, i32* %ty, align 4
  %idxprom160 = sext i32 %118 to i64
  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
  %119 = load i32, i32* %tx, align 4
  %idxprom162 = sext i32 %119 to i64
  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
  %120 = load float, float* %arrayidx163, align 4
  %sub164 = fsub contract float %117, %120
  %121 = load float, float* %Rz_1, align 4
  %mul165 = fmul contract float %sub164, %121
  %conv166 = fpext float %mul165 to double
  %add167 = fadd contract double %add159, %conv166
  %mul168 = fmul contract double %conv114, %add167
  %add169 = fadd contract double %conv, %mul168
  %conv170 = fptrunc double %add169 to float
  %122 = load i32, i32* %ty, align 4
  %idxprom171 = sext i32 %122 to i64
  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
  %123 = load i32, i32* %tx, align 4
  %idxprom173 = sext i32 %123 to i64
  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
  store float %conv170, float* %arrayidx174, align 4
  br label %if.end175

if.end175:                                        ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
  call void @llvm.nvvm.barrier0()
  %124 = load i32, i32* %i, align 4
  %125 = load i32, i32* %iteration.addr, align 4
  %sub176 = sub nsw i32 %125, 1
  %cmp177 = icmp eq i32 %124, %sub176
  br i1 %cmp177, label %if.then178, label %if.end179

if.then178:                                       ; preds = %if.end175
  br label %for.end

if.end179:                                        ; preds = %if.end175
  %126 = load i8, i8* %computed, align 1
  %tobool = trunc i8 %126 to i1
  br i1 %tobool, label %if.then180, label %if.end189

if.then180:                                       ; preds = %if.end179
  %127 = load i32, i32* %ty, align 4
  %idxprom181 = sext i32 %127 to i64
  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
  %128 = load i32, i32* %tx, align 4
  %idxprom183 = sext i32 %128 to i64
  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
  %129 = load float, float* %arrayidx184, align 4
  %130 = load i32, i32* %ty, align 4
  %idxprom185 = sext i32 %130 to i64
  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
  %131 = load i32, i32* %tx, align 4
  %idxprom187 = sext i32 %131 to i64
  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
  store float %129, float* %arrayidx188, align 4
  br label %if.end189

if.end189:                                        ; preds = %if.then180, %if.end179
  call void @llvm.nvvm.barrier0()
  br label %for.inc

for.inc:                                          ; preds = %if.end189
  %132 = load i32, i32* %i, align 4
  %inc = add nsw i32 %132, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %if.then178, %for.cond
  %133 = load i8, i8* %computed, align 1
  %tobool190 = trunc i8 %133 to i1
  br i1 %tobool190, label %if.then191, label %if.end198

if.then191:                                       ; preds = %for.end
  %134 = load i32, i32* %ty, align 4
  %idxprom192 = sext i32 %134 to i64
  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
  %135 = load i32, i32* %tx, align 4
  %idxprom194 = sext i32 %135 to i64
  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
  %136 = load float, float* %arrayidx195, align 4
  %137 = load float*, float** %temp_dst.addr, align 8
  %138 = load i32, i32* %index, align 4
  %idxprom196 = sext i32 %138 to i64
  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
  store float %136, float* %arrayidx197, align 4
  br label %if.end198

if.end198:                                        ; preds = %if.then191, %for.end
  ret void
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
}

; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3

attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }

!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}