; CuPBoP/examples/hotspot/hotspot-cuda-nvptx64-nvidia...
;
; 720 lines
; 33 KiB
; LLVM

; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
; Device-side module produced from hotspot.cu for the 64-bit NVPTX target.
source_filename = "hotspot.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; One-byte placeholder struct types backing the @blockIdx / @threadIdx globals below.
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
; Pointee type of the first parameter of the @cudaFuncGetAttributes stub.
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
; COMDAT groups for the four linkonce_odr __fetch_builtin_{x,y} helpers.
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
; Function-local statics of calculate_temp (per the mangled names): three 16x16
; float tiles placed in addrspace(3), which is the shared address space on NVPTX.
; The kernel reaches them through addrspacecast to generic pointers.
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
; Weak external placeholders for the CUDA builtin index objects. No function in
; the visible part of this module references them; the kernel reads the indices
; through the __fetch_builtin_* helpers instead.
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaMalloc: spills both arguments to stack slots
; and unconditionally returns 999 without touching %p or allocating anything.
; NOTE(review): 999 presumably corresponds to cudaErrorUnknown - confirm against the CUDA headers.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
stub:
  ; Spill the out-pointer argument.
  %p.slot = alloca i8**, align 8
  ; Reserve the slot for the size argument.
  %s.slot = alloca i64, align 8
  store i8** %p, i8*** %p.slot, align 8
  store i64 %s, i64* %s.slot, align 8
  ; Constant status code; the spilled values are never read back.
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaFuncGetAttributes: stores its two pointer
; arguments to the stack and returns the constant 999. The attribute struct
; pointed to by %p is never written.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
stub:
  %attrs.slot = alloca %struct.cudaFuncAttributes*, align 8
  %func.slot = alloca i8*, align 8
  ; Argument spills only; nothing below reads them.
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %attrs.slot, align 8
  store i8* %c, i8** %func.slot, align 8
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaDeviceGetAttribute: spills the three arguments
; and returns the constant 999. Nothing is stored through %value.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
stub:
  %value.slot = alloca i32*, align 8
  %attr.slot = alloca i32, align 4
  %device.slot = alloca i32, align 4
  ; Argument spills only; nothing below reads them.
  store i32* %value, i32** %value.slot, align 8
  store i32 %attr, i32* %attr.slot, align 4
  store i32 %device, i32* %device.slot, align 4
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaGetDevice: spills the pointer argument and
; returns the constant 999. Nothing is stored through %device.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
stub:
  %device.slot = alloca i32*, align 8
  store i32* %device, i32** %device.slot, align 8
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; spills its four arguments and returns the constant 999. Nothing is stored
; through %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
stub:
  %numBlocks.slot = alloca i32*, align 8
  %func.slot = alloca i8*, align 8
  %blockSize.slot = alloca i32, align 4
  %smem.slot = alloca i64, align 8
  ; Argument spills only; nothing below reads them.
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  store i8* %func, i8** %func.slot, align 8
  store i32 %blockSize, i32* %blockSize.slot, align 4
  store i64 %dynamicSmemSize, i64* %smem.slot, align 8
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
; spills its five arguments and returns the constant 999. Nothing is stored
; through %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
stub:
  %numBlocks.slot = alloca i32*, align 8
  %func.slot = alloca i8*, align 8
  %blockSize.slot = alloca i32, align 4
  %smem.slot = alloca i64, align 8
  %flags.slot = alloca i32, align 4
  ; Argument spills only; nothing below reads them.
  store i32* %numBlocks, i32** %numBlocks.slot, align 8
  store i8* %func, i8** %func.slot, align 8
  store i32 %blockSize, i32* %blockSize.slot, align 4
  store i64 %dynamicSmemSize, i64* %smem.slot, align 8
  store i32 %flags, i32* %flags.slot, align 4
  ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
;
; calculate_temp kernel (marked "kernel" by !nvvm.annotations entry !3).
;
; Each thread of a 16x16 block loads one cell of %temp_src and %power into the
; shared tiles temp_on_cuda / power_on_cuda, then runs up to %iteration stencil
; steps on the tile, shrinking the computed region by one cell per side per
; step, and finally writes its cell of temp_t to %temp_dst if it took part in
; the last step.
;
; Parameters:
;   %iteration                 number of stencil steps performed on the tile
;   %power, %temp_src          input arrays, indexed as grid_cols * y + x
;   %temp_dst                  output array, same indexing
;   %grid_cols, %grid_rows     extent used for the bounds checks and indexing
;   %border_cols, %border_rows offsets subtracted from the block origin
;   %Cap, %Rx, %Ry, %Rz, %step coefficients; only step/Cap and the reciprocals
;                              1/Rx, 1/Ry, 1/Rz are used
;   %time_elapsed              spilled to the stack but never read
;
; Change relative to the compiler output: %computed is now zero-initialised in
; the entry block. Previously, with %iteration <= 0 the loop body never ran and
; for.end loaded the never-written slot (undef) and branched on it, so the
; store to %temp_dst depended on an undefined value.
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
  ; ---- stack slots for the incoming arguments ----
  %iteration.addr = alloca i32, align 4
  %power.addr = alloca float*, align 8
  %temp_src.addr = alloca float*, align 8
  %temp_dst.addr = alloca float*, align 8
  %grid_cols.addr = alloca i32, align 4
  %grid_rows.addr = alloca i32, align 4
  %border_cols.addr = alloca i32, align 4
  %border_rows.addr = alloca i32, align 4
  %Cap.addr = alloca float, align 4
  %Rx.addr = alloca float, align 4
  %Ry.addr = alloca float, align 4
  %Rz.addr = alloca float, align 4
  %step.addr = alloca float, align 4
  %time_elapsed.addr = alloca float, align 4
  ; ---- stack slots for locals ----
  %amb_temp = alloca float, align 4
  %step_div_Cap = alloca float, align 4
  %Rx_1 = alloca float, align 4
  %Ry_1 = alloca float, align 4
  %Rz_1 = alloca float, align 4
  %bx = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %small_block_rows = alloca i32, align 4
  %small_block_cols = alloca i32, align 4
  %blkY = alloca i32, align 4
  %blkX = alloca i32, align 4
  %blkYmax = alloca i32, align 4
  %blkXmax = alloca i32, align 4
  %yidx = alloca i32, align 4
  %xidx = alloca i32, align 4
  %loadYidx = alloca i32, align 4
  %loadXidx = alloca i32, align 4
  %index = alloca i32, align 4
  %validYmin = alloca i32, align 4
  %validYmax = alloca i32, align 4
  %validXmin = alloca i32, align 4
  %validXmax = alloca i32, align 4
  %N = alloca i32, align 4
  %S = alloca i32, align 4
  %W = alloca i32, align 4
  %E = alloca i32, align 4
  %computed = alloca i8, align 1
  %i = alloca i32, align 4
  ; ---- spill arguments ----
  store i32 %iteration, i32* %iteration.addr, align 4
  store float* %power, float** %power.addr, align 8
  store float* %temp_src, float** %temp_src.addr, align 8
  store float* %temp_dst, float** %temp_dst.addr, align 8
  store i32 %grid_cols, i32* %grid_cols.addr, align 4
  store i32 %grid_rows, i32* %grid_rows.addr, align 4
  store i32 %border_cols, i32* %border_cols.addr, align 4
  store i32 %border_rows, i32* %border_rows.addr, align 4
  store float %Cap, float* %Cap.addr, align 4
  store float %Rx, float* %Rx.addr, align 4
  store float %Ry, float* %Ry.addr, align 4
  store float %Rz, float* %Rz.addr, align 4
  store float %step, float* %step.addr, align 4
  store float %time_elapsed, float* %time_elapsed.addr, align 4
  ; amb_temp = 80.0
  store float 8.000000e+01, float* %amb_temp, align 4
  ; FIX: give `computed` a defined value so that for.end never branches on an
  ; uninitialised slot when the loop body is skipped (%iteration <= 0).
  store i8 0, i8* %computed, align 1
  ; ---- block / thread indices (x and y only) ----
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %bx, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call1, i32* %by, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call2, i32* %tx, align 4
  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call3, i32* %ty, align 4
  ; step_div_Cap = step / Cap
  %0 = load float, float* %step.addr, align 4
  %1 = load float, float* %Cap.addr, align 4
  %div = fdiv float %0, %1
  store float %div, float* %step_div_Cap, align 4
  ; Rx_1 = 1/Rx, Ry_1 = 1/Ry, Rz_1 = 1/Rz
  %2 = load float, float* %Rx.addr, align 4
  %div4 = fdiv float 1.000000e+00, %2
  store float %div4, float* %Rx_1, align 4
  %3 = load float, float* %Ry.addr, align 4
  %div5 = fdiv float 1.000000e+00, %3
  store float %div5, float* %Ry_1, align 4
  %4 = load float, float* %Rz.addr, align 4
  %div6 = fdiv float 1.000000e+00, %4
  store float %div6, float* %Rz_1, align 4
  ; small_block_rows = small_block_cols = 16 - 2 * iteration
  %5 = load i32, i32* %iteration.addr, align 4
  %mul = mul nsw i32 %5, 2
  %sub = sub nsw i32 16, %mul
  store i32 %sub, i32* %small_block_rows, align 4
  %6 = load i32, i32* %iteration.addr, align 4
  %mul7 = mul nsw i32 %6, 2
  %sub8 = sub nsw i32 16, %mul7
  store i32 %sub8, i32* %small_block_cols, align 4
  ; blkY = small_block_rows * by - border_rows
  %7 = load i32, i32* %small_block_rows, align 4
  %8 = load i32, i32* %by, align 4
  %mul9 = mul nsw i32 %7, %8
  %9 = load i32, i32* %border_rows.addr, align 4
  %sub10 = sub nsw i32 %mul9, %9
  store i32 %sub10, i32* %blkY, align 4
  ; blkX = small_block_cols * bx - border_cols
  %10 = load i32, i32* %small_block_cols, align 4
  %11 = load i32, i32* %bx, align 4
  %mul11 = mul nsw i32 %10, %11
  %12 = load i32, i32* %border_cols.addr, align 4
  %sub12 = sub nsw i32 %mul11, %12
  store i32 %sub12, i32* %blkX, align 4
  ; blkYmax = blkY + 16 - 1, blkXmax = blkX + 16 - 1
  %13 = load i32, i32* %blkY, align 4
  %add = add nsw i32 %13, 16
  %sub13 = sub nsw i32 %add, 1
  store i32 %sub13, i32* %blkYmax, align 4
  %14 = load i32, i32* %blkX, align 4
  %add14 = add nsw i32 %14, 16
  %sub15 = sub nsw i32 %add14, 1
  store i32 %sub15, i32* %blkXmax, align 4
  ; yidx = blkY + ty, xidx = blkX + tx
  %15 = load i32, i32* %blkY, align 4
  %16 = load i32, i32* %ty, align 4
  %add16 = add nsw i32 %15, %16
  store i32 %add16, i32* %yidx, align 4
  %17 = load i32, i32* %blkX, align 4
  %18 = load i32, i32* %tx, align 4
  %add17 = add nsw i32 %17, %18
  store i32 %add17, i32* %xidx, align 4
  ; loadYidx = yidx, loadXidx = xidx
  %19 = load i32, i32* %yidx, align 4
  store i32 %19, i32* %loadYidx, align 4
  %20 = load i32, i32* %xidx, align 4
  store i32 %20, i32* %loadXidx, align 4
  ; index = grid_cols * loadYidx + loadXidx (computed before the bounds check)
  %21 = load i32, i32* %grid_cols.addr, align 4
  %22 = load i32, i32* %loadYidx, align 4
  %mul18 = mul nsw i32 %21, %22
  %23 = load i32, i32* %loadXidx, align 4
  %add19 = add nsw i32 %mul18, %23
  store i32 %add19, i32* %index, align 4
  ; Load only if 0 <= loadYidx <= grid_rows-1 and 0 <= loadXidx <= grid_cols-1.
  %24 = load i32, i32* %loadYidx, align 4
  %cmp = icmp sge i32 %24, 0
  br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
  %25 = load i32, i32* %loadYidx, align 4
  %26 = load i32, i32* %grid_rows.addr, align 4
  %sub20 = sub nsw i32 %26, 1
  %cmp21 = icmp sle i32 %25, %sub20
  br i1 %cmp21, label %land.lhs.true22, label %if.end
land.lhs.true22: ; preds = %land.lhs.true
  %27 = load i32, i32* %loadXidx, align 4
  %cmp23 = icmp sge i32 %27, 0
  br i1 %cmp23, label %land.lhs.true24, label %if.end
land.lhs.true24: ; preds = %land.lhs.true22
  %28 = load i32, i32* %loadXidx, align 4
  %29 = load i32, i32* %grid_cols.addr, align 4
  %sub25 = sub nsw i32 %29, 1
  %cmp26 = icmp sle i32 %28, %sub25
  br i1 %cmp26, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true24
  ; temp_on_cuda[ty][tx] = temp_src[index]
  %30 = load float*, float** %temp_src.addr, align 8
  %31 = load i32, i32* %index, align 4
  %idxprom = sext i32 %31 to i64
  %arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
  %32 = load float, float* %arrayidx, align 4
  %33 = load i32, i32* %ty, align 4
  %idxprom27 = sext i32 %33 to i64
  %arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
  %34 = load i32, i32* %tx, align 4
  %idxprom29 = sext i32 %34 to i64
  %arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
  store float %32, float* %arrayidx30, align 4
  ; power_on_cuda[ty][tx] = power[index]
  %35 = load float*, float** %power.addr, align 8
  %36 = load i32, i32* %index, align 4
  %idxprom31 = sext i32 %36 to i64
  %arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
  %37 = load float, float* %arrayidx32, align 4
  %38 = load i32, i32* %ty, align 4
  %idxprom33 = sext i32 %38 to i64
  %arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
  %39 = load i32, i32* %tx, align 4
  %idxprom35 = sext i32 %39 to i64
  %arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
  store float %37, float* %arrayidx36, align 4
  br label %if.end
if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
  ; Barrier reached by every thread, whether or not it loaded a cell.
  call void @llvm.nvvm.barrier0()
  ; validYmin = (blkY < 0) ? -blkY : 0
  %40 = load i32, i32* %blkY, align 4
  %cmp37 = icmp slt i32 %40, 0
  br i1 %cmp37, label %cond.true, label %cond.false
cond.true: ; preds = %if.end
  %41 = load i32, i32* %blkY, align 4
  %sub38 = sub nsw i32 0, %41
  br label %cond.end
cond.false: ; preds = %if.end
  br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
  store i32 %cond, i32* %validYmin, align 4
  ; validYmax = (blkYmax > grid_rows-1) ? 15 - (blkYmax - grid_rows + 1) : 15
  %42 = load i32, i32* %blkYmax, align 4
  %43 = load i32, i32* %grid_rows.addr, align 4
  %sub39 = sub nsw i32 %43, 1
  %cmp40 = icmp sgt i32 %42, %sub39
  br i1 %cmp40, label %cond.true41, label %cond.false45
cond.true41: ; preds = %cond.end
  %44 = load i32, i32* %blkYmax, align 4
  %45 = load i32, i32* %grid_rows.addr, align 4
  %sub42 = sub nsw i32 %44, %45
  %add43 = add nsw i32 %sub42, 1
  %sub44 = sub nsw i32 15, %add43
  br label %cond.end46
cond.false45: ; preds = %cond.end
  br label %cond.end46
cond.end46: ; preds = %cond.false45, %cond.true41
  %cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
  store i32 %cond47, i32* %validYmax, align 4
  ; validXmin = (blkX < 0) ? -blkX : 0
  %46 = load i32, i32* %blkX, align 4
  %cmp48 = icmp slt i32 %46, 0
  br i1 %cmp48, label %cond.true49, label %cond.false51
cond.true49: ; preds = %cond.end46
  %47 = load i32, i32* %blkX, align 4
  %sub50 = sub nsw i32 0, %47
  br label %cond.end52
cond.false51: ; preds = %cond.end46
  br label %cond.end52
cond.end52: ; preds = %cond.false51, %cond.true49
  %cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
  store i32 %cond53, i32* %validXmin, align 4
  ; validXmax = (blkXmax > grid_cols-1) ? 15 - (blkXmax - grid_cols + 1) : 15
  %48 = load i32, i32* %blkXmax, align 4
  %49 = load i32, i32* %grid_cols.addr, align 4
  %sub54 = sub nsw i32 %49, 1
  %cmp55 = icmp sgt i32 %48, %sub54
  br i1 %cmp55, label %cond.true56, label %cond.false60
cond.true56: ; preds = %cond.end52
  %50 = load i32, i32* %blkXmax, align 4
  %51 = load i32, i32* %grid_cols.addr, align 4
  %sub57 = sub nsw i32 %50, %51
  %add58 = add nsw i32 %sub57, 1
  %sub59 = sub nsw i32 15, %add58
  br label %cond.end61
cond.false60: ; preds = %cond.end52
  br label %cond.end61
cond.end61: ; preds = %cond.false60, %cond.true56
  %cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
  store i32 %cond62, i32* %validXmax, align 4
  ; Neighbour indices: N = ty-1, S = ty+1, W = tx-1, E = tx+1 ...
  %52 = load i32, i32* %ty, align 4
  %sub63 = sub nsw i32 %52, 1
  store i32 %sub63, i32* %N, align 4
  %53 = load i32, i32* %ty, align 4
  %add64 = add nsw i32 %53, 1
  store i32 %add64, i32* %S, align 4
  %54 = load i32, i32* %tx, align 4
  %sub65 = sub nsw i32 %54, 1
  store i32 %sub65, i32* %W, align 4
  %55 = load i32, i32* %tx, align 4
  %add66 = add nsw i32 %55, 1
  store i32 %add66, i32* %E, align 4
  ; ... each clamped into the valid range: N = max(N, validYmin)
  %56 = load i32, i32* %N, align 4
  %57 = load i32, i32* %validYmin, align 4
  %cmp67 = icmp slt i32 %56, %57
  br i1 %cmp67, label %cond.true68, label %cond.false69
cond.true68: ; preds = %cond.end61
  %58 = load i32, i32* %validYmin, align 4
  br label %cond.end70
cond.false69: ; preds = %cond.end61
  %59 = load i32, i32* %N, align 4
  br label %cond.end70
cond.end70: ; preds = %cond.false69, %cond.true68
  %cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
  store i32 %cond71, i32* %N, align 4
  ; S = min(S, validYmax)
  %60 = load i32, i32* %S, align 4
  %61 = load i32, i32* %validYmax, align 4
  %cmp72 = icmp sgt i32 %60, %61
  br i1 %cmp72, label %cond.true73, label %cond.false74
cond.true73: ; preds = %cond.end70
  %62 = load i32, i32* %validYmax, align 4
  br label %cond.end75
cond.false74: ; preds = %cond.end70
  %63 = load i32, i32* %S, align 4
  br label %cond.end75
cond.end75: ; preds = %cond.false74, %cond.true73
  %cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
  store i32 %cond76, i32* %S, align 4
  ; W = max(W, validXmin)
  %64 = load i32, i32* %W, align 4
  %65 = load i32, i32* %validXmin, align 4
  %cmp77 = icmp slt i32 %64, %65
  br i1 %cmp77, label %cond.true78, label %cond.false79
cond.true78: ; preds = %cond.end75
  %66 = load i32, i32* %validXmin, align 4
  br label %cond.end80
cond.false79: ; preds = %cond.end75
  %67 = load i32, i32* %W, align 4
  br label %cond.end80
cond.end80: ; preds = %cond.false79, %cond.true78
  %cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
  store i32 %cond81, i32* %W, align 4
  ; E = min(E, validXmax)
  %68 = load i32, i32* %E, align 4
  %69 = load i32, i32* %validXmax, align 4
  %cmp82 = icmp sgt i32 %68, %69
  br i1 %cmp82, label %cond.true83, label %cond.false84
cond.true83: ; preds = %cond.end80
  %70 = load i32, i32* %validXmax, align 4
  br label %cond.end85
cond.false84: ; preds = %cond.end80
  %71 = load i32, i32* %E, align 4
  br label %cond.end85
cond.end85: ; preds = %cond.false84, %cond.true83
  %cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
  store i32 %cond86, i32* %E, align 4
  ; for (i = 0; i < iteration; i++)
  store i32 0, i32* %i, align 4
  br label %for.cond
for.cond: ; preds = %for.inc, %cond.end85
  %72 = load i32, i32* %i, align 4
  %73 = load i32, i32* %iteration.addr, align 4
  %cmp87 = icmp slt i32 %72, %73
  br i1 %cmp87, label %for.body, label %for.end
for.body: ; preds = %for.cond
  ; computed = false at the start of every step
  store i8 0, i8* %computed, align 1
  ; A thread computes this step only if all of the following hold:
  ;   i+1 <= tx <= 16-i-2,  i+1 <= ty <= 16-i-2,
  ;   validXmin <= tx <= validXmax,  validYmin <= ty <= validYmax
  %74 = load i32, i32* %tx, align 4
  %75 = load i32, i32* %i, align 4
  %add88 = add nsw i32 %75, 1
  %cmp89 = icmp sge i32 %74, %add88
  br i1 %cmp89, label %land.lhs.true90, label %if.end175
land.lhs.true90: ; preds = %for.body
  %76 = load i32, i32* %tx, align 4
  %77 = load i32, i32* %i, align 4
  %sub91 = sub nsw i32 16, %77
  %sub92 = sub nsw i32 %sub91, 2
  %cmp93 = icmp sle i32 %76, %sub92
  br i1 %cmp93, label %land.lhs.true94, label %if.end175
land.lhs.true94: ; preds = %land.lhs.true90
  %78 = load i32, i32* %ty, align 4
  %79 = load i32, i32* %i, align 4
  %add95 = add nsw i32 %79, 1
  %cmp96 = icmp sge i32 %78, %add95
  br i1 %cmp96, label %land.lhs.true97, label %if.end175
land.lhs.true97: ; preds = %land.lhs.true94
  %80 = load i32, i32* %ty, align 4
  %81 = load i32, i32* %i, align 4
  %sub98 = sub nsw i32 16, %81
  %sub99 = sub nsw i32 %sub98, 2
  %cmp100 = icmp sle i32 %80, %sub99
  br i1 %cmp100, label %land.lhs.true101, label %if.end175
land.lhs.true101: ; preds = %land.lhs.true97
  %82 = load i32, i32* %tx, align 4
  %83 = load i32, i32* %validXmin, align 4
  %cmp102 = icmp sge i32 %82, %83
  br i1 %cmp102, label %land.lhs.true103, label %if.end175
land.lhs.true103: ; preds = %land.lhs.true101
  %84 = load i32, i32* %tx, align 4
  %85 = load i32, i32* %validXmax, align 4
  %cmp104 = icmp sle i32 %84, %85
  br i1 %cmp104, label %land.lhs.true105, label %if.end175
land.lhs.true105: ; preds = %land.lhs.true103
  %86 = load i32, i32* %ty, align 4
  %87 = load i32, i32* %validYmin, align 4
  %cmp106 = icmp sge i32 %86, %87
  br i1 %cmp106, label %land.lhs.true107, label %if.end175
land.lhs.true107: ; preds = %land.lhs.true105
  %88 = load i32, i32* %ty, align 4
  %89 = load i32, i32* %validYmax, align 4
  %cmp108 = icmp sle i32 %88, %89
  br i1 %cmp108, label %if.then109, label %if.end175
if.then109: ; preds = %land.lhs.true107
  ; computed = true, then
  ; temp_t[ty][tx] = T + step_div_Cap * ( P
  ;                    + (temp[S][tx] + temp[N][tx] - 2.0*T) * Ry_1
  ;                    + (temp[ty][E] + temp[ty][W] - 2.0*T) * Rx_1
  ;                    + (amb_temp - T) * Rz_1 )
  ; with T = temp_on_cuda[ty][tx] and P = power_on_cuda[ty][tx]. The neighbour
  ; sums and the amb_temp term are formed in float; the rest is evaluated in
  ; double and truncated to float for the store.
  store i8 1, i8* %computed, align 1
  ; T, widened to double
  %90 = load i32, i32* %ty, align 4
  %idxprom110 = sext i32 %90 to i64
  %arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
  %91 = load i32, i32* %tx, align 4
  %idxprom112 = sext i32 %91 to i64
  %arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
  %92 = load float, float* %arrayidx113, align 4
  %conv = fpext float %92 to double
  %93 = load float, float* %step_div_Cap, align 4
  %conv114 = fpext float %93 to double
  ; P, widened to double
  %94 = load i32, i32* %ty, align 4
  %idxprom115 = sext i32 %94 to i64
  %arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
  %95 = load i32, i32* %tx, align 4
  %idxprom117 = sext i32 %95 to i64
  %arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
  %96 = load float, float* %arrayidx118, align 4
  %conv119 = fpext float %96 to double
  ; temp[S][tx] + temp[N][tx]
  %97 = load i32, i32* %S, align 4
  %idxprom120 = sext i32 %97 to i64
  %arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
  %98 = load i32, i32* %tx, align 4
  %idxprom122 = sext i32 %98 to i64
  %arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
  %99 = load float, float* %arrayidx123, align 4
  %100 = load i32, i32* %N, align 4
  %idxprom124 = sext i32 %100 to i64
  %arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
  %101 = load i32, i32* %tx, align 4
  %idxprom126 = sext i32 %101 to i64
  %arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
  %102 = load float, float* %arrayidx127, align 4
  %add128 = fadd contract float %99, %102
  %conv129 = fpext float %add128 to double
  ; ... - 2.0*T, scaled by Ry_1 and added to P
  %103 = load i32, i32* %ty, align 4
  %idxprom130 = sext i32 %103 to i64
  %arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
  %104 = load i32, i32* %tx, align 4
  %idxprom132 = sext i32 %104 to i64
  %arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
  %105 = load float, float* %arrayidx133, align 4
  %conv134 = fpext float %105 to double
  %mul135 = fmul contract double 2.000000e+00, %conv134
  %sub136 = fsub contract double %conv129, %mul135
  %106 = load float, float* %Ry_1, align 4
  %conv137 = fpext float %106 to double
  %mul138 = fmul contract double %sub136, %conv137
  %add139 = fadd contract double %conv119, %mul138
  ; temp[ty][E] + temp[ty][W]
  %107 = load i32, i32* %ty, align 4
  %idxprom140 = sext i32 %107 to i64
  %arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
  %108 = load i32, i32* %E, align 4
  %idxprom142 = sext i32 %108 to i64
  %arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
  %109 = load float, float* %arrayidx143, align 4
  %110 = load i32, i32* %ty, align 4
  %idxprom144 = sext i32 %110 to i64
  %arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
  %111 = load i32, i32* %W, align 4
  %idxprom146 = sext i32 %111 to i64
  %arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
  %112 = load float, float* %arrayidx147, align 4
  %add148 = fadd contract float %109, %112
  %conv149 = fpext float %add148 to double
  ; ... - 2.0*T, scaled by Rx_1 and accumulated
  %113 = load i32, i32* %ty, align 4
  %idxprom150 = sext i32 %113 to i64
  %arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
  %114 = load i32, i32* %tx, align 4
  %idxprom152 = sext i32 %114 to i64
  %arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
  %115 = load float, float* %arrayidx153, align 4
  %conv154 = fpext float %115 to double
  %mul155 = fmul contract double 2.000000e+00, %conv154
  %sub156 = fsub contract double %conv149, %mul155
  %116 = load float, float* %Rx_1, align 4
  %conv157 = fpext float %116 to double
  %mul158 = fmul contract double %sub156, %conv157
  %add159 = fadd contract double %add139, %mul158
  ; (amb_temp - T) * Rz_1, computed in float and then widened
  %117 = load float, float* %amb_temp, align 4
  %118 = load i32, i32* %ty, align 4
  %idxprom160 = sext i32 %118 to i64
  %arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
  %119 = load i32, i32* %tx, align 4
  %idxprom162 = sext i32 %119 to i64
  %arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
  %120 = load float, float* %arrayidx163, align 4
  %sub164 = fsub contract float %117, %120
  %121 = load float, float* %Rz_1, align 4
  %mul165 = fmul contract float %sub164, %121
  %conv166 = fpext float %mul165 to double
  %add167 = fadd contract double %add159, %conv166
  ; T + step_div_Cap * (sum), truncated to float and stored into temp_t[ty][tx]
  %mul168 = fmul contract double %conv114, %add167
  %add169 = fadd contract double %conv, %mul168
  %conv170 = fptrunc double %add169 to float
  %122 = load i32, i32* %ty, align 4
  %idxprom171 = sext i32 %122 to i64
  %arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
  %123 = load i32, i32* %tx, align 4
  %idxprom173 = sext i32 %123 to i64
  %arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
  store float %conv170, float* %arrayidx174, align 4
  br label %if.end175
if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
  ; Barrier after the compute phase, reached by computing and idle threads alike.
  call void @llvm.nvvm.barrier0()
  ; Leave the loop on the last step (i == iteration-1) without copying temp_t back.
  %124 = load i32, i32* %i, align 4
  %125 = load i32, i32* %iteration.addr, align 4
  %sub176 = sub nsw i32 %125, 1
  %cmp177 = icmp eq i32 %124, %sub176
  br i1 %cmp177, label %if.then178, label %if.end179
if.then178: ; preds = %if.end175
  br label %for.end
if.end179: ; preds = %if.end175
  %126 = load i8, i8* %computed, align 1
  %tobool = trunc i8 %126 to i1
  br i1 %tobool, label %if.then180, label %if.end189
if.then180: ; preds = %if.end179
  ; temp_on_cuda[ty][tx] = temp_t[ty][tx], feeding the next step
  %127 = load i32, i32* %ty, align 4
  %idxprom181 = sext i32 %127 to i64
  %arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
  %128 = load i32, i32* %tx, align 4
  %idxprom183 = sext i32 %128 to i64
  %arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
  %129 = load float, float* %arrayidx184, align 4
  %130 = load i32, i32* %ty, align 4
  %idxprom185 = sext i32 %130 to i64
  %arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
  %131 = load i32, i32* %tx, align 4
  %idxprom187 = sext i32 %131 to i64
  %arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
  store float %129, float* %arrayidx188, align 4
  br label %if.end189
if.end189: ; preds = %if.then180, %if.end179
  ; Barrier after the copy-back, before the next step reads temp_on_cuda.
  call void @llvm.nvvm.barrier0()
  br label %for.inc
for.inc: ; preds = %if.end189
  %132 = load i32, i32* %i, align 4
  %inc = add nsw i32 %132, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
for.end: ; preds = %if.then178, %for.cond
  ; Write back only if this thread computed in the last executed step.
  ; %computed is always defined here thanks to the entry-block store.
  %133 = load i8, i8* %computed, align 1
  %tobool190 = trunc i8 %133 to i1
  br i1 %tobool190, label %if.then191, label %if.end198
if.then191: ; preds = %for.end
  ; temp_dst[index] = temp_t[ty][tx]
  %134 = load i32, i32* %ty, align 4
  %idxprom192 = sext i32 %134 to i64
  %arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
  %135 = load i32, i32* %tx, align 4
  %idxprom194 = sext i32 %135 to i64
  %arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
  %136 = load float, float* %arrayidx195, align 4
  %137 = load float*, float** %temp_dst.addr, align 8
  %138 = load i32, i32* %index, align 4
  %idxprom196 = sext i32 %138 to i64
  %arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
  store float %136, float* %arrayidx197, align 4
  br label %if.end198
if.end198: ; preds = %if.then191, %for.end
  ret void
}
; Function Attrs: alwaysinline convergent nounwind
; __cuda_builtin_blockIdx_t::__fetch_builtin_x(): returns the result of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
read:
  %ctaid.x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %ctaid.x
}
; Function Attrs: alwaysinline convergent nounwind
; __cuda_builtin_blockIdx_t::__fetch_builtin_y(): returns the result of the
; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
read:
  %ctaid.y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %ctaid.y
}
; Function Attrs: alwaysinline convergent nounwind
; __cuda_builtin_threadIdx_t::__fetch_builtin_x(): returns the result of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
read:
  %tid.x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %tid.x
}
; Function Attrs: alwaysinline convergent nounwind
; __cuda_builtin_threadIdx_t::__fetch_builtin_y(): returns the result of the
; llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
read:
  %tid.y = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %tid.y
}
; NVVM intrinsics used by this module.
; Block-level barrier; called three times in the calculate_temp kernel.
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Special-register readers wrapped by the __fetch_builtin_{x,y} helpers:
; ctaid.* backs the blockIdx helpers, tid.* backs the threadIdx helpers.
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
; #0: attached to the weak cuda* stub definitions and to the calculate_temp
;     kernel (noinline + optnone, target-cpu sm_61).
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #1: attached to the four linkonce_odr __fetch_builtin_{x,y} helpers (alwaysinline).
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #2: on the llvm.nvvm.barrier0 declaration and on the kernel's calls to the
;     __fetch_builtin_* helpers.
attributes #2 = { convergent nounwind }
; #3: on the llvm.nvvm.read.ptx.sreg.* declarations.
attributes #3 = { nounwind readnone }
; Named metadata roots.
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
; Module flags: "SDK Version" [10, 1], "wchar_size" 4, "nvvm-reflect-ftz" 0.
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
; Marks @_Z14calculate_tempiPfS_S_iiiiffffff as a kernel entry point.
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
; "align" annotations whose subject operand is null.
; NOTE(review): presumably left over from declarations that were dropped from
; this module - confirm before relying on them.
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
; Producer identification and NVVM IR version (1, 4).
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}