; CuPBoP/examples/srad_v2/srad-cuda-nvptx64-nvidia-cu...
; ModuleID = 'srad-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "srad.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
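; Device-side LLVM IR for the srad_v2 benchmark kernels (srad.cu), compiled
; for nvptx64 / sm_61. The module defines the two kernels srad_cuda_1 and
; srad_cuda_2 together with weak stub versions of a few CUDA runtime API
; functions.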
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_gridDim_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any
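; blockIdx, threadIdx and gridDim are declared extern_weak here; their values
; are read through the __fetch_builtin_* accessors defined near the end of the
; module, which lower to NVVM special-register intrinsics.
; The addrspace(3) arrays that follow are the kernels' __shared__ 16x16 float
; tiles (address space 3 is NVPTX shared memory).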
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp = internal addrspace(3) global [16 x [16 x float]] undef, align 4
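; The weak definitions below are placeholder stubs for CUDA runtime API calls:
; each one only spills its arguments to the stack and returns the constant 999
; (the value of cudaErrorUnknown). Because they are weak, any real definition
; provided at link time overrides them.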
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
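; srad_cuda_1(float *E_C, float *W_C, float *N_C, float *S_C, float *J_cuda,
;             float *C_cuda, int cols, int rows, float q0sqr)
; First SRAD kernel: each 16x16 block stages its tile of the image J plus the
; north/south/west/east halo values into shared memory, computes the four
; directional derivatives and the diffusion coefficient c for its pixel,
; clamps c to [0,1], and writes c and the derivatives back to global memory.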
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z11srad_cuda_1PfS_S_S_S_S_iif(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %q0sqr) #0 {
entry:
%E_C.addr = alloca float*, align 8
%W_C.addr = alloca float*, align 8
%N_C.addr = alloca float*, align 8
%S_C.addr = alloca float*, align 8
%J_cuda.addr = alloca float*, align 8
%C_cuda.addr = alloca float*, align 8
%cols.addr = alloca i32, align 4
%rows.addr = alloca i32, align 4
%q0sqr.addr = alloca float, align 4
%bx = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_n = alloca i32, align 4
%index_s = alloca i32, align 4
%index_w = alloca i32, align 4
%index_e = alloca i32, align 4
%n = alloca float, align 4
%w = alloca float, align 4
%e = alloca float, align 4
%s = alloca float, align 4
%jc = alloca float, align 4
%g2 = alloca float, align 4
%l = alloca float, align 4
%num = alloca float, align 4
%den = alloca float, align 4
%qsqr = alloca float, align 4
%c = alloca float, align 4
store float* %E_C, float** %E_C.addr, align 8
store float* %W_C, float** %W_C.addr, align 8
store float* %N_C, float** %N_C.addr, align 8
store float* %S_C, float** %S_C.addr, align 8
store float* %J_cuda, float** %J_cuda.addr, align 8
store float* %C_cuda, float** %C_cuda.addr, align 8
store i32 %cols, i32* %cols.addr, align 4
store i32 %rows, i32* %rows.addr, align 4
store float %q0sqr, float* %q0sqr.addr, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %bx, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call1, i32* %by, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call2, i32* %tx, align 4
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call3, i32* %ty, align 4
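; Flattened global indices of the current pixel and its neighbours
; (block size 16, row-major layout with 'cols' columns):
;   index   = cols*16*by + 16*bx + cols*ty + tx
;   index_n = cols*16*by + 16*bx + tx - cols
;   index_s = cols*16*by + 16*bx + cols*16 + tx
;   index_w = cols*16*by + 16*bx + cols*ty - 1
;   index_e = cols*16*by + 16*bx + cols*ty + 16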
%0 = load i32, i32* %cols.addr, align 4
%mul = mul nsw i32 %0, 16
%1 = load i32, i32* %by, align 4
%mul4 = mul nsw i32 %mul, %1
%2 = load i32, i32* %bx, align 4
%mul5 = mul nsw i32 16, %2
%add = add nsw i32 %mul4, %mul5
%3 = load i32, i32* %cols.addr, align 4
%4 = load i32, i32* %ty, align 4
%mul6 = mul nsw i32 %3, %4
%add7 = add nsw i32 %add, %mul6
%5 = load i32, i32* %tx, align 4
%add8 = add nsw i32 %add7, %5
store i32 %add8, i32* %index, align 4
%6 = load i32, i32* %cols.addr, align 4
%mul9 = mul nsw i32 %6, 16
%7 = load i32, i32* %by, align 4
%mul10 = mul nsw i32 %mul9, %7
%8 = load i32, i32* %bx, align 4
%mul11 = mul nsw i32 16, %8
%add12 = add nsw i32 %mul10, %mul11
%9 = load i32, i32* %tx, align 4
%add13 = add nsw i32 %add12, %9
%10 = load i32, i32* %cols.addr, align 4
%sub = sub nsw i32 %add13, %10
store i32 %sub, i32* %index_n, align 4
%11 = load i32, i32* %cols.addr, align 4
%mul14 = mul nsw i32 %11, 16
%12 = load i32, i32* %by, align 4
%mul15 = mul nsw i32 %mul14, %12
%13 = load i32, i32* %bx, align 4
%mul16 = mul nsw i32 16, %13
%add17 = add nsw i32 %mul15, %mul16
%14 = load i32, i32* %cols.addr, align 4
%mul18 = mul nsw i32 %14, 16
%add19 = add nsw i32 %add17, %mul18
%15 = load i32, i32* %tx, align 4
%add20 = add nsw i32 %add19, %15
store i32 %add20, i32* %index_s, align 4
%16 = load i32, i32* %cols.addr, align 4
%mul21 = mul nsw i32 %16, 16
%17 = load i32, i32* %by, align 4
%mul22 = mul nsw i32 %mul21, %17
%18 = load i32, i32* %bx, align 4
%mul23 = mul nsw i32 16, %18
%add24 = add nsw i32 %mul22, %mul23
%19 = load i32, i32* %cols.addr, align 4
%20 = load i32, i32* %ty, align 4
%mul25 = mul nsw i32 %19, %20
%add26 = add nsw i32 %add24, %mul25
%sub27 = sub nsw i32 %add26, 1
store i32 %sub27, i32* %index_w, align 4
%21 = load i32, i32* %cols.addr, align 4
%mul28 = mul nsw i32 %21, 16
%22 = load i32, i32* %by, align 4
%mul29 = mul nsw i32 %mul28, %22
%23 = load i32, i32* %bx, align 4
%mul30 = mul nsw i32 16, %23
%add31 = add nsw i32 %mul29, %mul30
%24 = load i32, i32* %cols.addr, align 4
%25 = load i32, i32* %ty, align 4
%mul32 = mul nsw i32 %24, %25
%add33 = add nsw i32 %add31, %mul32
%add34 = add nsw i32 %add33, 16
store i32 %add34, i32* %index_e, align 4
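; Stage the north and south halo rows of J into the shared 'north'/'south'
; tiles. Blocks in the top grid row (by == 0) re-read the first image row for
; 'north', and blocks in the bottom grid row (by == gridDim.y - 1) re-read the
; last image row for 'south'.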
%26 = load float*, float** %J_cuda.addr, align 8
%27 = load i32, i32* %index_n, align 4
%idxprom = sext i32 %27 to i64
%arrayidx = getelementptr inbounds float, float* %26, i64 %idxprom
%28 = load float, float* %arrayidx, align 4
%29 = load i32, i32* %ty, align 4
%idxprom35 = sext i32 %29 to i64
%arrayidx36 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom35
%30 = load i32, i32* %tx, align 4
%idxprom37 = sext i32 %30 to i64
%arrayidx38 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx36, i64 0, i64 %idxprom37
store float %28, float* %arrayidx38, align 4
%31 = load float*, float** %J_cuda.addr, align 8
%32 = load i32, i32* %index_s, align 4
%idxprom39 = sext i32 %32 to i64
%arrayidx40 = getelementptr inbounds float, float* %31, i64 %idxprom39
%33 = load float, float* %arrayidx40, align 4
%34 = load i32, i32* %ty, align 4
%idxprom41 = sext i32 %34 to i64
%arrayidx42 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom41
%35 = load i32, i32* %tx, align 4
%idxprom43 = sext i32 %35 to i64
%arrayidx44 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx42, i64 0, i64 %idxprom43
store float %33, float* %arrayidx44, align 4
%36 = load i32, i32* %by, align 4
%cmp = icmp eq i32 %36, 0
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
%37 = load float*, float** %J_cuda.addr, align 8
%38 = load i32, i32* %bx, align 4
%mul45 = mul nsw i32 16, %38
%39 = load i32, i32* %tx, align 4
%add46 = add nsw i32 %mul45, %39
%idxprom47 = sext i32 %add46 to i64
%arrayidx48 = getelementptr inbounds float, float* %37, i64 %idxprom47
%40 = load float, float* %arrayidx48, align 4
%41 = load i32, i32* %ty, align 4
%idxprom49 = sext i32 %41 to i64
%arrayidx50 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom49
%42 = load i32, i32* %tx, align 4
%idxprom51 = sext i32 %42 to i64
%arrayidx52 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx50, i64 0, i64 %idxprom51
store float %40, float* %arrayidx52, align 4
br label %if.end72
if.else: ; preds = %entry
%43 = load i32, i32* %by, align 4
%call53 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2
%sub54 = sub i32 %call53, 1
%cmp55 = icmp eq i32 %43, %sub54
br i1 %cmp55, label %if.then56, label %if.end
if.then56: ; preds = %if.else
%44 = load float*, float** %J_cuda.addr, align 8
%45 = load i32, i32* %cols.addr, align 4
%mul57 = mul nsw i32 %45, 16
%call58 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2
%sub59 = sub i32 %call58, 1
%mul60 = mul i32 %mul57, %sub59
%46 = load i32, i32* %bx, align 4
%mul61 = mul nsw i32 16, %46
%add62 = add i32 %mul60, %mul61
%47 = load i32, i32* %cols.addr, align 4
%mul63 = mul nsw i32 %47, 15
%add64 = add i32 %add62, %mul63
%48 = load i32, i32* %tx, align 4
%add65 = add i32 %add64, %48
%idxprom66 = zext i32 %add65 to i64
%arrayidx67 = getelementptr inbounds float, float* %44, i64 %idxprom66
%49 = load float, float* %arrayidx67, align 4
%50 = load i32, i32* %ty, align 4
%idxprom68 = sext i32 %50 to i64
%arrayidx69 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom68
%51 = load i32, i32* %tx, align 4
%idxprom70 = sext i32 %51 to i64
%arrayidx71 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx69, i64 0, i64 %idxprom70
store float %49, float* %arrayidx71, align 4
br label %if.end
if.end: ; preds = %if.then56, %if.else
br label %if.end72
if.end72: ; preds = %if.end, %if.then
call void @llvm.nvvm.barrier0()
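; Stage the west and east halo columns of J into the shared 'west'/'east'
; tiles. Blocks in the leftmost grid column (bx == 0) re-read the first image
; column for 'west', and blocks in the rightmost column (bx == gridDim.x - 1)
; re-read the last image column for 'east'.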
%52 = load float*, float** %J_cuda.addr, align 8
%53 = load i32, i32* %index_w, align 4
%idxprom73 = sext i32 %53 to i64
%arrayidx74 = getelementptr inbounds float, float* %52, i64 %idxprom73
%54 = load float, float* %arrayidx74, align 4
%55 = load i32, i32* %ty, align 4
%idxprom75 = sext i32 %55 to i64
%arrayidx76 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom75
%56 = load i32, i32* %tx, align 4
%idxprom77 = sext i32 %56 to i64
%arrayidx78 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx76, i64 0, i64 %idxprom77
store float %54, float* %arrayidx78, align 4
%57 = load float*, float** %J_cuda.addr, align 8
%58 = load i32, i32* %index_e, align 4
%idxprom79 = sext i32 %58 to i64
%arrayidx80 = getelementptr inbounds float, float* %57, i64 %idxprom79
%59 = load float, float* %arrayidx80, align 4
%60 = load i32, i32* %ty, align 4
%idxprom81 = sext i32 %60 to i64
%arrayidx82 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom81
%61 = load i32, i32* %tx, align 4
%idxprom83 = sext i32 %61 to i64
%arrayidx84 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx82, i64 0, i64 %idxprom83
store float %59, float* %arrayidx84, align 4
%62 = load i32, i32* %bx, align 4
%cmp85 = icmp eq i32 %62, 0
br i1 %cmp85, label %if.then86, label %if.else97
if.then86: ; preds = %if.end72
%63 = load float*, float** %J_cuda.addr, align 8
%64 = load i32, i32* %cols.addr, align 4
%mul87 = mul nsw i32 %64, 16
%65 = load i32, i32* %by, align 4
%mul88 = mul nsw i32 %mul87, %65
%66 = load i32, i32* %cols.addr, align 4
%67 = load i32, i32* %ty, align 4
%mul89 = mul nsw i32 %66, %67
%add90 = add nsw i32 %mul88, %mul89
%idxprom91 = sext i32 %add90 to i64
%arrayidx92 = getelementptr inbounds float, float* %63, i64 %idxprom91
%68 = load float, float* %arrayidx92, align 4
%69 = load i32, i32* %ty, align 4
%idxprom93 = sext i32 %69 to i64
%arrayidx94 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom93
%70 = load i32, i32* %tx, align 4
%idxprom95 = sext i32 %70 to i64
%arrayidx96 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx94, i64 0, i64 %idxprom95
store float %68, float* %arrayidx96, align 4
br label %if.end119
if.else97: ; preds = %if.end72
%71 = load i32, i32* %bx, align 4
%call98 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
%sub99 = sub i32 %call98, 1
%cmp100 = icmp eq i32 %71, %sub99
br i1 %cmp100, label %if.then101, label %if.end118
if.then101: ; preds = %if.else97
%72 = load float*, float** %J_cuda.addr, align 8
%73 = load i32, i32* %cols.addr, align 4
%mul102 = mul nsw i32 %73, 16
%74 = load i32, i32* %by, align 4
%mul103 = mul nsw i32 %mul102, %74
%call104 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
%sub105 = sub i32 %call104, 1
%mul106 = mul i32 16, %sub105
%add107 = add i32 %mul103, %mul106
%75 = load i32, i32* %cols.addr, align 4
%76 = load i32, i32* %ty, align 4
%mul108 = mul nsw i32 %75, %76
%add109 = add i32 %add107, %mul108
%add110 = add i32 %add109, 16
%sub111 = sub i32 %add110, 1
%idxprom112 = zext i32 %sub111 to i64
%arrayidx113 = getelementptr inbounds float, float* %72, i64 %idxprom112
%77 = load float, float* %arrayidx113, align 4
%78 = load i32, i32* %ty, align 4
%idxprom114 = sext i32 %78 to i64
%arrayidx115 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom114
%79 = load i32, i32* %tx, align 4
%idxprom116 = sext i32 %79 to i64
%arrayidx117 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx115, i64 0, i64 %idxprom116
store float %77, float* %arrayidx117, align 4
br label %if.end118
if.end118: ; preds = %if.then101, %if.else97
br label %if.end119
if.end119: ; preds = %if.end118, %if.then86
call void @llvm.nvvm.barrier0()
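; Load the block's own 16x16 tile of J into shared 'temp'; after the barrier,
; jc = temp[ty][tx] is this thread's centre value.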
%80 = load float*, float** %J_cuda.addr, align 8
%81 = load i32, i32* %index, align 4
%idxprom120 = sext i32 %81 to i64
%arrayidx121 = getelementptr inbounds float, float* %80, i64 %idxprom120
%82 = load float, float* %arrayidx121, align 4
%83 = load i32, i32* %ty, align 4
%idxprom122 = sext i32 %83 to i64
%arrayidx123 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom122
%84 = load i32, i32* %tx, align 4
%idxprom124 = sext i32 %84 to i64
%arrayidx125 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx123, i64 0, i64 %idxprom124
store float %82, float* %arrayidx125, align 4
call void @llvm.nvvm.barrier0()
%85 = load i32, i32* %ty, align 4
%idxprom126 = sext i32 %85 to i64
%arrayidx127 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom126
%86 = load i32, i32* %tx, align 4
%idxprom128 = sext i32 %86 to i64
%arrayidx129 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx127, i64 0, i64 %idxprom128
%87 = load float, float* %arrayidx129, align 4
store float %87, float* %jc, align 4
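; Directional derivatives: n/s/w/e = neighbour - jc. The eight explicit cases
; below cover threads on the tile corners and edges, which read one or two
; neighbours from the staged 'north'/'south'/'west'/'east' halos; the final
; else reads all four neighbours from 'temp'.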
%88 = load i32, i32* %ty, align 4
%cmp130 = icmp eq i32 %88, 0
br i1 %cmp130, label %land.lhs.true, label %if.else155
land.lhs.true: ; preds = %if.end119
%89 = load i32, i32* %tx, align 4
%cmp131 = icmp eq i32 %89, 0
br i1 %cmp131, label %if.then132, label %if.else155
if.then132: ; preds = %land.lhs.true
%90 = load i32, i32* %ty, align 4
%idxprom133 = sext i32 %90 to i64
%arrayidx134 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom133
%91 = load i32, i32* %tx, align 4
%idxprom135 = sext i32 %91 to i64
%arrayidx136 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx134, i64 0, i64 %idxprom135
%92 = load float, float* %arrayidx136, align 4
%93 = load float, float* %jc, align 4
%sub137 = fsub contract float %92, %93
store float %sub137, float* %n, align 4
%94 = load i32, i32* %ty, align 4
%add138 = add nsw i32 %94, 1
%idxprom139 = sext i32 %add138 to i64
%arrayidx140 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom139
%95 = load i32, i32* %tx, align 4
%idxprom141 = sext i32 %95 to i64
%arrayidx142 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx140, i64 0, i64 %idxprom141
%96 = load float, float* %arrayidx142, align 4
%97 = load float, float* %jc, align 4
%sub143 = fsub contract float %96, %97
store float %sub143, float* %s, align 4
%98 = load i32, i32* %ty, align 4
%idxprom144 = sext i32 %98 to i64
%arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom144
%99 = load i32, i32* %tx, align 4
%idxprom146 = sext i32 %99 to i64
%arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
%100 = load float, float* %arrayidx147, align 4
%101 = load float, float* %jc, align 4
%sub148 = fsub contract float %100, %101
store float %sub148, float* %w, align 4
%102 = load i32, i32* %ty, align 4
%idxprom149 = sext i32 %102 to i64
%arrayidx150 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom149
%103 = load i32, i32* %tx, align 4
%add151 = add nsw i32 %103, 1
%idxprom152 = sext i32 %add151 to i64
%arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx150, i64 0, i64 %idxprom152
%104 = load float, float* %arrayidx153, align 4
%105 = load float, float* %jc, align 4
%sub154 = fsub contract float %104, %105
store float %sub154, float* %e, align 4
br label %if.end372
if.else155: ; preds = %land.lhs.true, %if.end119
%106 = load i32, i32* %ty, align 4
%cmp156 = icmp eq i32 %106, 0
br i1 %cmp156, label %land.lhs.true157, label %if.else182
land.lhs.true157: ; preds = %if.else155
%107 = load i32, i32* %tx, align 4
%cmp158 = icmp eq i32 %107, 15
br i1 %cmp158, label %if.then159, label %if.else182
if.then159: ; preds = %land.lhs.true157
%108 = load i32, i32* %ty, align 4
%idxprom160 = sext i32 %108 to i64
%arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom160
%109 = load i32, i32* %tx, align 4
%idxprom162 = sext i32 %109 to i64
%arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
%110 = load float, float* %arrayidx163, align 4
%111 = load float, float* %jc, align 4
%sub164 = fsub contract float %110, %111
store float %sub164, float* %n, align 4
%112 = load i32, i32* %ty, align 4
%add165 = add nsw i32 %112, 1
%idxprom166 = sext i32 %add165 to i64
%arrayidx167 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom166
%113 = load i32, i32* %tx, align 4
%idxprom168 = sext i32 %113 to i64
%arrayidx169 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx167, i64 0, i64 %idxprom168
%114 = load float, float* %arrayidx169, align 4
%115 = load float, float* %jc, align 4
%sub170 = fsub contract float %114, %115
store float %sub170, float* %s, align 4
%116 = load i32, i32* %ty, align 4
%idxprom171 = sext i32 %116 to i64
%arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom171
%117 = load i32, i32* %tx, align 4
%sub173 = sub nsw i32 %117, 1
%idxprom174 = sext i32 %sub173 to i64
%arrayidx175 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom174
%118 = load float, float* %arrayidx175, align 4
%119 = load float, float* %jc, align 4
%sub176 = fsub contract float %118, %119
store float %sub176, float* %w, align 4
%120 = load i32, i32* %ty, align 4
%idxprom177 = sext i32 %120 to i64
%arrayidx178 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom177
%121 = load i32, i32* %tx, align 4
%idxprom179 = sext i32 %121 to i64
%arrayidx180 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx178, i64 0, i64 %idxprom179
%122 = load float, float* %arrayidx180, align 4
%123 = load float, float* %jc, align 4
%sub181 = fsub contract float %122, %123
store float %sub181, float* %e, align 4
br label %if.end371
if.else182: ; preds = %land.lhs.true157, %if.else155
%124 = load i32, i32* %ty, align 4
%cmp183 = icmp eq i32 %124, 15
br i1 %cmp183, label %land.lhs.true184, label %if.else209
land.lhs.true184: ; preds = %if.else182
%125 = load i32, i32* %tx, align 4
%cmp185 = icmp eq i32 %125, 15
br i1 %cmp185, label %if.then186, label %if.else209
if.then186: ; preds = %land.lhs.true184
%126 = load i32, i32* %ty, align 4
%sub187 = sub nsw i32 %126, 1
%idxprom188 = sext i32 %sub187 to i64
%arrayidx189 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom188
%127 = load i32, i32* %tx, align 4
%idxprom190 = sext i32 %127 to i64
%arrayidx191 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx189, i64 0, i64 %idxprom190
%128 = load float, float* %arrayidx191, align 4
%129 = load float, float* %jc, align 4
%sub192 = fsub contract float %128, %129
store float %sub192, float* %n, align 4
%130 = load i32, i32* %ty, align 4
%idxprom193 = sext i32 %130 to i64
%arrayidx194 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom193
%131 = load i32, i32* %tx, align 4
%idxprom195 = sext i32 %131 to i64
%arrayidx196 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx194, i64 0, i64 %idxprom195
%132 = load float, float* %arrayidx196, align 4
%133 = load float, float* %jc, align 4
%sub197 = fsub contract float %132, %133
store float %sub197, float* %s, align 4
%134 = load i32, i32* %ty, align 4
%idxprom198 = sext i32 %134 to i64
%arrayidx199 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom198
%135 = load i32, i32* %tx, align 4
%sub200 = sub nsw i32 %135, 1
%idxprom201 = sext i32 %sub200 to i64
%arrayidx202 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx199, i64 0, i64 %idxprom201
%136 = load float, float* %arrayidx202, align 4
%137 = load float, float* %jc, align 4
%sub203 = fsub contract float %136, %137
store float %sub203, float* %w, align 4
%138 = load i32, i32* %ty, align 4
%idxprom204 = sext i32 %138 to i64
%arrayidx205 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom204
%139 = load i32, i32* %tx, align 4
%idxprom206 = sext i32 %139 to i64
%arrayidx207 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx205, i64 0, i64 %idxprom206
%140 = load float, float* %arrayidx207, align 4
%141 = load float, float* %jc, align 4
%sub208 = fsub contract float %140, %141
store float %sub208, float* %e, align 4
br label %if.end370
if.else209: ; preds = %land.lhs.true184, %if.else182
%142 = load i32, i32* %ty, align 4
%cmp210 = icmp eq i32 %142, 15
br i1 %cmp210, label %land.lhs.true211, label %if.else236
land.lhs.true211: ; preds = %if.else209
%143 = load i32, i32* %tx, align 4
%cmp212 = icmp eq i32 %143, 0
br i1 %cmp212, label %if.then213, label %if.else236
if.then213: ; preds = %land.lhs.true211
%144 = load i32, i32* %ty, align 4
%sub214 = sub nsw i32 %144, 1
%idxprom215 = sext i32 %sub214 to i64
%arrayidx216 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom215
%145 = load i32, i32* %tx, align 4
%idxprom217 = sext i32 %145 to i64
%arrayidx218 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx216, i64 0, i64 %idxprom217
%146 = load float, float* %arrayidx218, align 4
%147 = load float, float* %jc, align 4
%sub219 = fsub contract float %146, %147
store float %sub219, float* %n, align 4
%148 = load i32, i32* %ty, align 4
%idxprom220 = sext i32 %148 to i64
%arrayidx221 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom220
%149 = load i32, i32* %tx, align 4
%idxprom222 = sext i32 %149 to i64
%arrayidx223 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx221, i64 0, i64 %idxprom222
%150 = load float, float* %arrayidx223, align 4
%151 = load float, float* %jc, align 4
%sub224 = fsub contract float %150, %151
store float %sub224, float* %s, align 4
%152 = load i32, i32* %ty, align 4
%idxprom225 = sext i32 %152 to i64
%arrayidx226 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom225
%153 = load i32, i32* %tx, align 4
%idxprom227 = sext i32 %153 to i64
%arrayidx228 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx226, i64 0, i64 %idxprom227
%154 = load float, float* %arrayidx228, align 4
%155 = load float, float* %jc, align 4
%sub229 = fsub contract float %154, %155
store float %sub229, float* %w, align 4
%156 = load i32, i32* %ty, align 4
%idxprom230 = sext i32 %156 to i64
%arrayidx231 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom230
%157 = load i32, i32* %tx, align 4
%add232 = add nsw i32 %157, 1
%idxprom233 = sext i32 %add232 to i64
%arrayidx234 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx231, i64 0, i64 %idxprom233
%158 = load float, float* %arrayidx234, align 4
%159 = load float, float* %jc, align 4
%sub235 = fsub contract float %158, %159
store float %sub235, float* %e, align 4
br label %if.end369
if.else236: ; preds = %land.lhs.true211, %if.else209
%160 = load i32, i32* %ty, align 4
%cmp237 = icmp eq i32 %160, 0
br i1 %cmp237, label %if.then238, label %if.else262
if.then238: ; preds = %if.else236
%161 = load i32, i32* %ty, align 4
%idxprom239 = sext i32 %161 to i64
%arrayidx240 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5north to [16 x [16 x float]]*), i64 0, i64 %idxprom239
%162 = load i32, i32* %tx, align 4
%idxprom241 = sext i32 %162 to i64
%arrayidx242 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx240, i64 0, i64 %idxprom241
%163 = load float, float* %arrayidx242, align 4
%164 = load float, float* %jc, align 4
%sub243 = fsub contract float %163, %164
store float %sub243, float* %n, align 4
%165 = load i32, i32* %ty, align 4
%add244 = add nsw i32 %165, 1
%idxprom245 = sext i32 %add244 to i64
%arrayidx246 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom245
%166 = load i32, i32* %tx, align 4
%idxprom247 = sext i32 %166 to i64
%arrayidx248 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx246, i64 0, i64 %idxprom247
%167 = load float, float* %arrayidx248, align 4
%168 = load float, float* %jc, align 4
%sub249 = fsub contract float %167, %168
store float %sub249, float* %s, align 4
%169 = load i32, i32* %ty, align 4
%idxprom250 = sext i32 %169 to i64
%arrayidx251 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom250
%170 = load i32, i32* %tx, align 4
%sub252 = sub nsw i32 %170, 1
%idxprom253 = sext i32 %sub252 to i64
%arrayidx254 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx251, i64 0, i64 %idxprom253
%171 = load float, float* %arrayidx254, align 4
%172 = load float, float* %jc, align 4
%sub255 = fsub contract float %171, %172
store float %sub255, float* %w, align 4
%173 = load i32, i32* %ty, align 4
%idxprom256 = sext i32 %173 to i64
%arrayidx257 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom256
%174 = load i32, i32* %tx, align 4
%add258 = add nsw i32 %174, 1
%idxprom259 = sext i32 %add258 to i64
%arrayidx260 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx257, i64 0, i64 %idxprom259
%175 = load float, float* %arrayidx260, align 4
%176 = load float, float* %jc, align 4
%sub261 = fsub contract float %175, %176
store float %sub261, float* %e, align 4
br label %if.end368
if.else262: ; preds = %if.else236
%177 = load i32, i32* %tx, align 4
%cmp263 = icmp eq i32 %177, 15
br i1 %cmp263, label %if.then264, label %if.else288
if.then264: ; preds = %if.else262
%178 = load i32, i32* %ty, align 4
%sub265 = sub nsw i32 %178, 1
%idxprom266 = sext i32 %sub265 to i64
%arrayidx267 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom266
%179 = load i32, i32* %tx, align 4
%idxprom268 = sext i32 %179 to i64
%arrayidx269 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx267, i64 0, i64 %idxprom268
%180 = load float, float* %arrayidx269, align 4
%181 = load float, float* %jc, align 4
%sub270 = fsub contract float %180, %181
store float %sub270, float* %n, align 4
%182 = load i32, i32* %ty, align 4
%add271 = add nsw i32 %182, 1
%idxprom272 = sext i32 %add271 to i64
%arrayidx273 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom272
%183 = load i32, i32* %tx, align 4
%idxprom274 = sext i32 %183 to i64
%arrayidx275 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx273, i64 0, i64 %idxprom274
%184 = load float, float* %arrayidx275, align 4
%185 = load float, float* %jc, align 4
%sub276 = fsub contract float %184, %185
store float %sub276, float* %s, align 4
%186 = load i32, i32* %ty, align 4
%idxprom277 = sext i32 %186 to i64
%arrayidx278 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom277
%187 = load i32, i32* %tx, align 4
%sub279 = sub nsw i32 %187, 1
%idxprom280 = sext i32 %sub279 to i64
%arrayidx281 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx278, i64 0, i64 %idxprom280
%188 = load float, float* %arrayidx281, align 4
%189 = load float, float* %jc, align 4
%sub282 = fsub contract float %188, %189
store float %sub282, float* %w, align 4
%190 = load i32, i32* %ty, align 4
%idxprom283 = sext i32 %190 to i64
%arrayidx284 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4east to [16 x [16 x float]]*), i64 0, i64 %idxprom283
%191 = load i32, i32* %tx, align 4
%idxprom285 = sext i32 %191 to i64
%arrayidx286 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx284, i64 0, i64 %idxprom285
%192 = load float, float* %arrayidx286, align 4
%193 = load float, float* %jc, align 4
%sub287 = fsub contract float %192, %193
store float %sub287, float* %e, align 4
br label %if.end367
if.else288: ; preds = %if.else262
%194 = load i32, i32* %ty, align 4
%cmp289 = icmp eq i32 %194, 15
br i1 %cmp289, label %if.then290, label %if.else314
if.then290: ; preds = %if.else288
%195 = load i32, i32* %ty, align 4
%sub291 = sub nsw i32 %195, 1
%idxprom292 = sext i32 %sub291 to i64
%arrayidx293 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom292
%196 = load i32, i32* %tx, align 4
%idxprom294 = sext i32 %196 to i64
%arrayidx295 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx293, i64 0, i64 %idxprom294
%197 = load float, float* %arrayidx295, align 4
%198 = load float, float* %jc, align 4
%sub296 = fsub contract float %197, %198
store float %sub296, float* %n, align 4
%199 = load i32, i32* %ty, align 4
%idxprom297 = sext i32 %199 to i64
%arrayidx298 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE5south to [16 x [16 x float]]*), i64 0, i64 %idxprom297
%200 = load i32, i32* %tx, align 4
%idxprom299 = sext i32 %200 to i64
%arrayidx300 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx298, i64 0, i64 %idxprom299
%201 = load float, float* %arrayidx300, align 4
%202 = load float, float* %jc, align 4
%sub301 = fsub contract float %201, %202
store float %sub301, float* %s, align 4
%203 = load i32, i32* %ty, align 4
%idxprom302 = sext i32 %203 to i64
%arrayidx303 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom302
%204 = load i32, i32* %tx, align 4
%sub304 = sub nsw i32 %204, 1
%idxprom305 = sext i32 %sub304 to i64
%arrayidx306 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx303, i64 0, i64 %idxprom305
%205 = load float, float* %arrayidx306, align 4
%206 = load float, float* %jc, align 4
%sub307 = fsub contract float %205, %206
store float %sub307, float* %w, align 4
%207 = load i32, i32* %ty, align 4
%idxprom308 = sext i32 %207 to i64
%arrayidx309 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom308
%208 = load i32, i32* %tx, align 4
%add310 = add nsw i32 %208, 1
%idxprom311 = sext i32 %add310 to i64
%arrayidx312 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx309, i64 0, i64 %idxprom311
%209 = load float, float* %arrayidx312, align 4
%210 = load float, float* %jc, align 4
%sub313 = fsub contract float %209, %210
store float %sub313, float* %e, align 4
br label %if.end366
if.else314: ; preds = %if.else288
%211 = load i32, i32* %tx, align 4
%cmp315 = icmp eq i32 %211, 0
br i1 %cmp315, label %if.then316, label %if.else340
if.then316: ; preds = %if.else314
%212 = load i32, i32* %ty, align 4
%sub317 = sub nsw i32 %212, 1
%idxprom318 = sext i32 %sub317 to i64
%arrayidx319 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom318
%213 = load i32, i32* %tx, align 4
%idxprom320 = sext i32 %213 to i64
%arrayidx321 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx319, i64 0, i64 %idxprom320
%214 = load float, float* %arrayidx321, align 4
%215 = load float, float* %jc, align 4
%sub322 = fsub contract float %214, %215
store float %sub322, float* %n, align 4
%216 = load i32, i32* %ty, align 4
%add323 = add nsw i32 %216, 1
%idxprom324 = sext i32 %add323 to i64
%arrayidx325 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom324
%217 = load i32, i32* %tx, align 4
%idxprom326 = sext i32 %217 to i64
%arrayidx327 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx325, i64 0, i64 %idxprom326
%218 = load float, float* %arrayidx327, align 4
%219 = load float, float* %jc, align 4
%sub328 = fsub contract float %218, %219
store float %sub328, float* %s, align 4
%220 = load i32, i32* %ty, align 4
%idxprom329 = sext i32 %220 to i64
%arrayidx330 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4west to [16 x [16 x float]]*), i64 0, i64 %idxprom329
%221 = load i32, i32* %tx, align 4
%idxprom331 = sext i32 %221 to i64
%arrayidx332 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx330, i64 0, i64 %idxprom331
%222 = load float, float* %arrayidx332, align 4
%223 = load float, float* %jc, align 4
%sub333 = fsub contract float %222, %223
store float %sub333, float* %w, align 4
%224 = load i32, i32* %ty, align 4
%idxprom334 = sext i32 %224 to i64
%arrayidx335 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom334
%225 = load i32, i32* %tx, align 4
%add336 = add nsw i32 %225, 1
%idxprom337 = sext i32 %add336 to i64
%arrayidx338 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx335, i64 0, i64 %idxprom337
%226 = load float, float* %arrayidx338, align 4
%227 = load float, float* %jc, align 4
%sub339 = fsub contract float %226, %227
store float %sub339, float* %e, align 4
br label %if.end365
if.else340: ; preds = %if.else314
%228 = load i32, i32* %ty, align 4
%sub341 = sub nsw i32 %228, 1
%idxprom342 = sext i32 %sub341 to i64
%arrayidx343 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom342
%229 = load i32, i32* %tx, align 4
%idxprom344 = sext i32 %229 to i64
%arrayidx345 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx343, i64 0, i64 %idxprom344
%230 = load float, float* %arrayidx345, align 4
%231 = load float, float* %jc, align 4
%sub346 = fsub contract float %230, %231
store float %sub346, float* %n, align 4
%232 = load i32, i32* %ty, align 4
%add347 = add nsw i32 %232, 1
%idxprom348 = sext i32 %add347 to i64
%arrayidx349 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom348
%233 = load i32, i32* %tx, align 4
%idxprom350 = sext i32 %233 to i64
%arrayidx351 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx349, i64 0, i64 %idxprom350
%234 = load float, float* %arrayidx351, align 4
%235 = load float, float* %jc, align 4
%sub352 = fsub contract float %234, %235
store float %sub352, float* %s, align 4
%236 = load i32, i32* %ty, align 4
%idxprom353 = sext i32 %236 to i64
%arrayidx354 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom353
%237 = load i32, i32* %tx, align 4
%sub355 = sub nsw i32 %237, 1
%idxprom356 = sext i32 %sub355 to i64
%arrayidx357 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx354, i64 0, i64 %idxprom356
%238 = load float, float* %arrayidx357, align 4
%239 = load float, float* %jc, align 4
%sub358 = fsub contract float %238, %239
store float %sub358, float* %w, align 4
%240 = load i32, i32* %ty, align 4
%idxprom359 = sext i32 %240 to i64
%arrayidx360 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom359
%241 = load i32, i32* %tx, align 4
%add361 = add nsw i32 %241, 1
%idxprom362 = sext i32 %add361 to i64
%arrayidx363 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx360, i64 0, i64 %idxprom362
%242 = load float, float* %arrayidx363, align 4
%243 = load float, float* %jc, align 4
%sub364 = fsub contract float %242, %243
store float %sub364, float* %e, align 4
br label %if.end365
if.end365: ; preds = %if.else340, %if.then316
br label %if.end366
if.end366: ; preds = %if.end365, %if.then290
br label %if.end367
if.end367: ; preds = %if.end366, %if.then264
br label %if.end368
if.end368: ; preds = %if.end367, %if.then238
br label %if.end369
if.end369: ; preds = %if.end368, %if.then213
br label %if.end370
if.end370: ; preds = %if.end369, %if.then186
br label %if.end371
if.end371: ; preds = %if.end370, %if.then159
br label %if.end372
if.end372: ; preds = %if.end371, %if.then132
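; SRAD update for this pixel (the arithmetic below evaluates):
;   g2   = (n*n + s*s + w*w + e*e) / (jc*jc)      ; normalized gradient magnitude
;   l    = (n + s + w + e) / jc                   ; normalized Laplacian
;   num  = 0.5*g2 - 0.0625*l*l                    ; 0.0625 = 1/16
;   den  = 1 + 0.25*l
;   qsqr = num / (den*den)
;   den  = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr))
;   c    = 1 / (1 + den)                          ; diffusion coefficient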
%244 = load float, float* %n, align 4
%245 = load float, float* %n, align 4
%mul373 = fmul contract float %244, %245
%246 = load float, float* %s, align 4
%247 = load float, float* %s, align 4
%mul374 = fmul contract float %246, %247
%add375 = fadd contract float %mul373, %mul374
%248 = load float, float* %w, align 4
%249 = load float, float* %w, align 4
%mul376 = fmul contract float %248, %249
%add377 = fadd contract float %add375, %mul376
%250 = load float, float* %e, align 4
%251 = load float, float* %e, align 4
%mul378 = fmul contract float %250, %251
%add379 = fadd contract float %add377, %mul378
%252 = load float, float* %jc, align 4
%253 = load float, float* %jc, align 4
%mul380 = fmul contract float %252, %253
%div = fdiv float %add379, %mul380
store float %div, float* %g2, align 4
%254 = load float, float* %n, align 4
%255 = load float, float* %s, align 4
%add381 = fadd contract float %254, %255
%256 = load float, float* %w, align 4
%add382 = fadd contract float %add381, %256
%257 = load float, float* %e, align 4
%add383 = fadd contract float %add382, %257
%258 = load float, float* %jc, align 4
%div384 = fdiv float %add383, %258
store float %div384, float* %l, align 4
%259 = load float, float* %g2, align 4
%conv = fpext float %259 to double
%mul385 = fmul contract double 5.000000e-01, %conv
%260 = load float, float* %l, align 4
%261 = load float, float* %l, align 4
%mul386 = fmul contract float %260, %261
%conv387 = fpext float %mul386 to double
%mul388 = fmul contract double 6.250000e-02, %conv387
%sub389 = fsub contract double %mul385, %mul388
%conv390 = fptrunc double %sub389 to float
store float %conv390, float* %num, align 4
%262 = load float, float* %l, align 4
%conv391 = fpext float %262 to double
%mul392 = fmul contract double 2.500000e-01, %conv391
%add393 = fadd contract double 1.000000e+00, %mul392
%conv394 = fptrunc double %add393 to float
store float %conv394, float* %den, align 4
%263 = load float, float* %num, align 4
%264 = load float, float* %den, align 4
%265 = load float, float* %den, align 4
%mul395 = fmul contract float %264, %265
%div396 = fdiv float %263, %mul395
store float %div396, float* %qsqr, align 4
%266 = load float, float* %qsqr, align 4
%267 = load float, float* %q0sqr.addr, align 4
%sub397 = fsub contract float %266, %267
%268 = load float, float* %q0sqr.addr, align 4
%269 = load float, float* %q0sqr.addr, align 4
%add398 = fadd contract float 1.000000e+00, %269
%mul399 = fmul contract float %268, %add398
%div400 = fdiv float %sub397, %mul399
store float %div400, float* %den, align 4
%270 = load float, float* %den, align 4
%conv401 = fpext float %270 to double
%add402 = fadd contract double 1.000000e+00, %conv401
%div403 = fdiv double 1.000000e+00, %add402
%conv404 = fptrunc double %div403 to float
store float %conv404, float* %c, align 4
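; Clamp the coefficient: c < 0 -> 0, c > 1 -> 1, otherwise keep c; the result
; is written into the shared 'temp_result' tile.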
%271 = load float, float* %c, align 4
%cmp405 = fcmp olt float %271, 0.000000e+00
br i1 %cmp405, label %if.then406, label %if.else411
if.then406: ; preds = %if.end372
%272 = load i32, i32* %ty, align 4
%idxprom407 = sext i32 %272 to i64
%arrayidx408 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom407
%273 = load i32, i32* %tx, align 4
%idxprom409 = sext i32 %273 to i64
%arrayidx410 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx408, i64 0, i64 %idxprom409
store float 0.000000e+00, float* %arrayidx410, align 4
br label %if.end424
if.else411: ; preds = %if.end372
%274 = load float, float* %c, align 4
%cmp412 = fcmp ogt float %274, 1.000000e+00
br i1 %cmp412, label %if.then413, label %if.else418
if.then413: ; preds = %if.else411
%275 = load i32, i32* %ty, align 4
%idxprom414 = sext i32 %275 to i64
%arrayidx415 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom414
%276 = load i32, i32* %tx, align 4
%idxprom416 = sext i32 %276 to i64
%arrayidx417 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx415, i64 0, i64 %idxprom416
store float 1.000000e+00, float* %arrayidx417, align 4
br label %if.end423
if.else418: ; preds = %if.else411
%277 = load float, float* %c, align 4
%278 = load i32, i32* %ty, align 4
%idxprom419 = sext i32 %278 to i64
%arrayidx420 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom419
%279 = load i32, i32* %tx, align 4
%idxprom421 = sext i32 %279 to i64
%arrayidx422 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx420, i64 0, i64 %idxprom421
store float %277, float* %arrayidx422, align 4
br label %if.end423
if.end423: ; preds = %if.else418, %if.then413
br label %if.end424
if.end424: ; preds = %if.end423, %if.then406
call void @llvm.nvvm.barrier0()
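; Final writeback: each thread stores its clamped coefficient to C_cuda[index]
; and the derivatives e/w/s/n to E_C/W_C/S_C/N_C[index].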
%280 = load i32, i32* %ty, align 4
%idxprom425 = sext i32 %280 to i64
%arrayidx426 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_1PfS_S_S_S_S_iifE11temp_result to [16 x [16 x float]]*), i64 0, i64 %idxprom425
%281 = load i32, i32* %tx, align 4
%idxprom427 = sext i32 %281 to i64
%arrayidx428 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx426, i64 0, i64 %idxprom427
%282 = load float, float* %arrayidx428, align 4
%283 = load float*, float** %C_cuda.addr, align 8
%284 = load i32, i32* %index, align 4
%idxprom429 = sext i32 %284 to i64
%arrayidx430 = getelementptr inbounds float, float* %283, i64 %idxprom429
store float %282, float* %arrayidx430, align 4
%285 = load float, float* %e, align 4
%286 = load float*, float** %E_C.addr, align 8
%287 = load i32, i32* %index, align 4
%idxprom431 = sext i32 %287 to i64
%arrayidx432 = getelementptr inbounds float, float* %286, i64 %idxprom431
store float %285, float* %arrayidx432, align 4
%288 = load float, float* %w, align 4
%289 = load float*, float** %W_C.addr, align 8
%290 = load i32, i32* %index, align 4
%idxprom433 = sext i32 %290 to i64
%arrayidx434 = getelementptr inbounds float, float* %289, i64 %idxprom433
store float %288, float* %arrayidx434, align 4
%291 = load float, float* %s, align 4
%292 = load float*, float** %S_C.addr, align 8
%293 = load i32, i32* %index, align 4
%idxprom435 = sext i32 %293 to i64
%arrayidx436 = getelementptr inbounds float, float* %292, i64 %idxprom435
store float %291, float* %arrayidx436, align 4
%294 = load float, float* %n, align 4
%295 = load float*, float** %N_C.addr, align 8
%296 = load i32, i32* %index, align 4
%idxprom437 = sext i32 %296 to i64
%arrayidx438 = getelementptr inbounds float, float* %295, i64 %idxprom437
store float %294, float* %arrayidx438, align 4
ret void
}
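; The linkonce_odr accessors below implement the CUDA builtin variable reads:
; blockIdx.{x,y}, threadIdx.{x,y} and gridDim.{x,y} map to the ctaid, tid and
; nctaid PTX special registers respectively.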
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
ret i32 %0
}
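; srad_cuda_2(float *E_C, float *W_C, float *N_C, float *S_C, float *J_cuda,
;             float *C_cuda, int cols, int rows, float lambda, float q0sqr)
; Second SRAD kernel: stages the block's tile of J ('temp') and the south/east
; halo values of the coefficient array C_cuda ('south_c'/'east_c') into shared
; memory; the per-pixel update using cc/cn/cs/ce/cw, d_sum and lambda follows.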
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z11srad_cuda_2PfS_S_S_S_S_iiff(float* %E_C, float* %W_C, float* %N_C, float* %S_C, float* %J_cuda, float* %C_cuda, i32 %cols, i32 %rows, float %lambda, float %q0sqr) #0 {
entry:
%E_C.addr = alloca float*, align 8
%W_C.addr = alloca float*, align 8
%N_C.addr = alloca float*, align 8
%S_C.addr = alloca float*, align 8
%J_cuda.addr = alloca float*, align 8
%C_cuda.addr = alloca float*, align 8
%cols.addr = alloca i32, align 4
%rows.addr = alloca i32, align 4
%lambda.addr = alloca float, align 4
%q0sqr.addr = alloca float, align 4
%bx = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_s = alloca i32, align 4
%index_e = alloca i32, align 4
%cc = alloca float, align 4
%cn = alloca float, align 4
%cs = alloca float, align 4
%ce = alloca float, align 4
%cw = alloca float, align 4
%d_sum = alloca float, align 4
store float* %E_C, float** %E_C.addr, align 8
store float* %W_C, float** %W_C.addr, align 8
store float* %N_C, float** %N_C.addr, align 8
store float* %S_C, float** %S_C.addr, align 8
store float* %J_cuda, float** %J_cuda.addr, align 8
store float* %C_cuda, float** %C_cuda.addr, align 8
store i32 %cols, i32* %cols.addr, align 4
store i32 %rows, i32* %rows.addr, align 4
store float %lambda, float* %lambda.addr, align 4
store float %q0sqr, float* %q0sqr.addr, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %bx, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call1, i32* %by, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call2, i32* %tx, align 4
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call3, i32* %ty, align 4
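; Only index, index_s and index_e are computed here, using the same formulas
; as in srad_cuda_1; no north/west neighbour indices are needed in this kernel.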
%0 = load i32, i32* %cols.addr, align 4
%mul = mul nsw i32 %0, 16
%1 = load i32, i32* %by, align 4
%mul4 = mul nsw i32 %mul, %1
%2 = load i32, i32* %bx, align 4
%mul5 = mul nsw i32 16, %2
%add = add nsw i32 %mul4, %mul5
%3 = load i32, i32* %cols.addr, align 4
%4 = load i32, i32* %ty, align 4
%mul6 = mul nsw i32 %3, %4
%add7 = add nsw i32 %add, %mul6
%5 = load i32, i32* %tx, align 4
%add8 = add nsw i32 %add7, %5
store i32 %add8, i32* %index, align 4
%6 = load i32, i32* %cols.addr, align 4
%mul9 = mul nsw i32 %6, 16
%7 = load i32, i32* %by, align 4
%mul10 = mul nsw i32 %mul9, %7
%8 = load i32, i32* %bx, align 4
%mul11 = mul nsw i32 16, %8
%add12 = add nsw i32 %mul10, %mul11
%9 = load i32, i32* %cols.addr, align 4
%mul13 = mul nsw i32 %9, 16
%add14 = add nsw i32 %add12, %mul13
%10 = load i32, i32* %tx, align 4
%add15 = add nsw i32 %add14, %10
store i32 %add15, i32* %index_s, align 4
%11 = load i32, i32* %cols.addr, align 4
%mul16 = mul nsw i32 %11, 16
%12 = load i32, i32* %by, align 4
%mul17 = mul nsw i32 %mul16, %12
%13 = load i32, i32* %bx, align 4
%mul18 = mul nsw i32 16, %13
%add19 = add nsw i32 %mul17, %mul18
%14 = load i32, i32* %cols.addr, align 4
%15 = load i32, i32* %ty, align 4
%mul20 = mul nsw i32 %14, %15
%add21 = add nsw i32 %add19, %mul20
%add22 = add nsw i32 %add21, 16
store i32 %add22, i32* %index_e, align 4
%16 = load float*, float** %J_cuda.addr, align 8
%17 = load i32, i32* %index, align 4
%idxprom = sext i32 %17 to i64
%arrayidx = getelementptr inbounds float, float* %16, i64 %idxprom
%18 = load float, float* %arrayidx, align 4
%19 = load i32, i32* %ty, align 4
%idxprom23 = sext i32 %19 to i64
%arrayidx24 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom23
%20 = load i32, i32* %tx, align 4
%idxprom25 = sext i32 %20 to i64
%arrayidx26 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx24, i64 0, i64 %idxprom25
store float %18, float* %arrayidx26, align 4
call void @llvm.nvvm.barrier0()
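; Stage the southern halo: south_c[ty][tx] = C_cuda[index_s]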
%21 = load float*, float** %C_cuda.addr, align 8
%22 = load i32, i32* %index_s, align 4
%idxprom27 = sext i32 %22 to i64
%arrayidx28 = getelementptr inbounds float, float* %21, i64 %idxprom27
%23 = load float, float* %arrayidx28, align 4
%24 = load i32, i32* %ty, align 4
%idxprom29 = sext i32 %24 to i64
%arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom29
%25 = load i32, i32* %tx, align 4
%idxprom31 = sext i32 %25 to i64
%arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
store float %23, float* %arrayidx32, align 4
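; Blocks in the last grid row have no tile below them; clamp the southern
; halo to this block's own last row (offset cols*15) instead.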
%26 = load i32, i32* %by, align 4
%call33 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2
%sub = sub i32 %call33, 1
%cmp = icmp eq i32 %26, %sub
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%27 = load float*, float** %C_cuda.addr, align 8
%28 = load i32, i32* %cols.addr, align 4
%mul34 = mul nsw i32 %28, 16
%call35 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_yEv() #2
%sub36 = sub i32 %call35, 1
%mul37 = mul i32 %mul34, %sub36
%29 = load i32, i32* %bx, align 4
%mul38 = mul nsw i32 16, %29
%add39 = add i32 %mul37, %mul38
%30 = load i32, i32* %cols.addr, align 4
%mul40 = mul nsw i32 %30, 15
%add41 = add i32 %add39, %mul40
%31 = load i32, i32* %tx, align 4
%add42 = add i32 %add41, %31
%idxprom43 = zext i32 %add42 to i64
%arrayidx44 = getelementptr inbounds float, float* %27, i64 %idxprom43
%32 = load float, float* %arrayidx44, align 4
%33 = load i32, i32* %ty, align 4
%idxprom45 = sext i32 %33 to i64
%arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom45
%34 = load i32, i32* %tx, align 4
%idxprom47 = sext i32 %34 to i64
%arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
store float %32, float* %arrayidx48, align 4
br label %if.end
if.end: ; preds = %if.then, %entry
call void @llvm.nvvm.barrier0()
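; Stage the eastern halo: east_c[ty][tx] = C_cuda[index_e]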
%35 = load float*, float** %C_cuda.addr, align 8
%36 = load i32, i32* %index_e, align 4
%idxprom49 = sext i32 %36 to i64
%arrayidx50 = getelementptr inbounds float, float* %35, i64 %idxprom49
%37 = load float, float* %arrayidx50, align 4
%38 = load i32, i32* %ty, align 4
%idxprom51 = sext i32 %38 to i64
%arrayidx52 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom51
%39 = load i32, i32* %tx, align 4
%idxprom53 = sext i32 %39 to i64
%arrayidx54 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx52, i64 0, i64 %idxprom53
store float %37, float* %arrayidx54, align 4
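; Blocks in the last grid column have no tile to their right; clamp the eastern
; halo to this block's own last column (offset 16 - 1) instead.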
%40 = load i32, i32* %bx, align 4
%call55 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
%sub56 = sub i32 %call55, 1
%cmp57 = icmp eq i32 %40, %sub56
br i1 %cmp57, label %if.then58, label %if.end75
if.then58: ; preds = %if.end
%41 = load float*, float** %C_cuda.addr, align 8
%42 = load i32, i32* %cols.addr, align 4
%mul59 = mul nsw i32 %42, 16
%43 = load i32, i32* %by, align 4
%mul60 = mul nsw i32 %mul59, %43
%call61 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
%sub62 = sub i32 %call61, 1
%mul63 = mul i32 16, %sub62
%add64 = add i32 %mul60, %mul63
%44 = load i32, i32* %cols.addr, align 4
%45 = load i32, i32* %ty, align 4
%mul65 = mul nsw i32 %44, %45
%add66 = add i32 %add64, %mul65
%add67 = add i32 %add66, 16
%sub68 = sub i32 %add67, 1
%idxprom69 = zext i32 %sub68 to i64
%arrayidx70 = getelementptr inbounds float, float* %41, i64 %idxprom69
%46 = load float, float* %arrayidx70, align 4
%47 = load i32, i32* %ty, align 4
%idxprom71 = sext i32 %47 to i64
%arrayidx72 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom71
%48 = load i32, i32* %tx, align 4
%idxprom73 = sext i32 %48 to i64
%arrayidx74 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx72, i64 0, i64 %idxprom73
store float %46, float* %arrayidx74, align 4
br label %if.end75
if.end75: ; preds = %if.then58, %if.end
call void @llvm.nvvm.barrier0()
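; Stage the centre coefficients: c_cuda_temp[ty][tx] = C_cuda[index]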
%49 = load float*, float** %C_cuda.addr, align 8
%50 = load i32, i32* %index, align 4
%idxprom76 = sext i32 %50 to i64
%arrayidx77 = getelementptr inbounds float, float* %49, i64 %idxprom76
%51 = load float, float* %arrayidx77, align 4
%52 = load i32, i32* %ty, align 4
%idxprom78 = sext i32 %52 to i64
%arrayidx79 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom78
%53 = load i32, i32* %tx, align 4
%idxprom80 = sext i32 %53 to i64
%arrayidx81 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx79, i64 0, i64 %idxprom80
store float %51, float* %arrayidx81, align 4
call void @llvm.nvvm.barrier0()
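; cc = c_cuda_temp[ty][tx]. The branches below select the neighbour
; coefficients: cn and cw always reuse cc, while cs and ce come from the
; south_c/east_c halos on the tile's last row/column and from the adjacent
; c_cuda_temp element otherwise.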
%54 = load i32, i32* %ty, align 4
%idxprom82 = sext i32 %54 to i64
%arrayidx83 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom82
%55 = load i32, i32* %tx, align 4
%idxprom84 = sext i32 %55 to i64
%arrayidx85 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx83, i64 0, i64 %idxprom84
%56 = load float, float* %arrayidx85, align 4
store float %56, float* %cc, align 4
%57 = load i32, i32* %ty, align 4
%cmp86 = icmp eq i32 %57, 15
br i1 %cmp86, label %land.lhs.true, label %if.else
land.lhs.true: ; preds = %if.end75
%58 = load i32, i32* %tx, align 4
%cmp87 = icmp eq i32 %58, 15
br i1 %cmp87, label %if.then88, label %if.else
if.then88: ; preds = %land.lhs.true
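; ty == 15 && tx == 15: south-east corner of the tile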
%59 = load float, float* %cc, align 4
store float %59, float* %cn, align 4
%60 = load i32, i32* %ty, align 4
%idxprom89 = sext i32 %60 to i64
%arrayidx90 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom89
%61 = load i32, i32* %tx, align 4
%idxprom91 = sext i32 %61 to i64
%arrayidx92 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx90, i64 0, i64 %idxprom91
%62 = load float, float* %arrayidx92, align 4
store float %62, float* %cs, align 4
%63 = load float, float* %cc, align 4
store float %63, float* %cw, align 4
%64 = load i32, i32* %ty, align 4
%idxprom93 = sext i32 %64 to i64
%arrayidx94 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom93
%65 = load i32, i32* %tx, align 4
%idxprom95 = sext i32 %65 to i64
%arrayidx96 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx94, i64 0, i64 %idxprom95
%66 = load float, float* %arrayidx96, align 4
store float %66, float* %ce, align 4
br label %if.end133
if.else: ; preds = %land.lhs.true, %if.end75
%67 = load i32, i32* %tx, align 4
%cmp97 = icmp eq i32 %67, 15
br i1 %cmp97, label %if.then98, label %if.else108
if.then98: ; preds = %if.else
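; tx == 15: eastern edge of the tile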
%68 = load float, float* %cc, align 4
store float %68, float* %cn, align 4
%69 = load i32, i32* %ty, align 4
%add99 = add nsw i32 %69, 1
%idxprom100 = sext i32 %add99 to i64
%arrayidx101 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom100
%70 = load i32, i32* %tx, align 4
%idxprom102 = sext i32 %70 to i64
%arrayidx103 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx101, i64 0, i64 %idxprom102
%71 = load float, float* %arrayidx103, align 4
store float %71, float* %cs, align 4
%72 = load float, float* %cc, align 4
store float %72, float* %cw, align 4
%73 = load i32, i32* %ty, align 4
%idxprom104 = sext i32 %73 to i64
%arrayidx105 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE6east_c to [16 x [16 x float]]*), i64 0, i64 %idxprom104
%74 = load i32, i32* %tx, align 4
%idxprom106 = sext i32 %74 to i64
%arrayidx107 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx105, i64 0, i64 %idxprom106
%75 = load float, float* %arrayidx107, align 4
store float %75, float* %ce, align 4
br label %if.end132
if.else108: ; preds = %if.else
%76 = load i32, i32* %ty, align 4
%cmp109 = icmp eq i32 %76, 15
br i1 %cmp109, label %if.then110, label %if.else120
if.then110: ; preds = %if.else108
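; ty == 15: southern edge of the tile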
%77 = load float, float* %cc, align 4
store float %77, float* %cn, align 4
%78 = load i32, i32* %ty, align 4
%idxprom111 = sext i32 %78 to i64
%arrayidx112 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE7south_c to [16 x [16 x float]]*), i64 0, i64 %idxprom111
%79 = load i32, i32* %tx, align 4
%idxprom113 = sext i32 %79 to i64
%arrayidx114 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx112, i64 0, i64 %idxprom113
%80 = load float, float* %arrayidx114, align 4
store float %80, float* %cs, align 4
%81 = load float, float* %cc, align 4
store float %81, float* %cw, align 4
%82 = load i32, i32* %ty, align 4
%idxprom115 = sext i32 %82 to i64
%arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom115
%83 = load i32, i32* %tx, align 4
%add117 = add nsw i32 %83, 1
%idxprom118 = sext i32 %add117 to i64
%arrayidx119 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom118
%84 = load float, float* %arrayidx119, align 4
store float %84, float* %ce, align 4
br label %if.end131
if.else120: ; preds = %if.else108
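; Interior element: both cs and ce come from adjacent c_cuda_temp entries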
%85 = load float, float* %cc, align 4
store float %85, float* %cn, align 4
%86 = load i32, i32* %ty, align 4
%add121 = add nsw i32 %86, 1
%idxprom122 = sext i32 %add121 to i64
%arrayidx123 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom122
%87 = load i32, i32* %tx, align 4
%idxprom124 = sext i32 %87 to i64
%arrayidx125 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx123, i64 0, i64 %idxprom124
%88 = load float, float* %arrayidx125, align 4
store float %88, float* %cs, align 4
%89 = load float, float* %cc, align 4
store float %89, float* %cw, align 4
%90 = load i32, i32* %ty, align 4
%idxprom126 = sext i32 %90 to i64
%arrayidx127 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE11c_cuda_temp to [16 x [16 x float]]*), i64 0, i64 %idxprom126
%91 = load i32, i32* %tx, align 4
%add128 = add nsw i32 %91, 1
%idxprom129 = sext i32 %add128 to i64
%arrayidx130 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx127, i64 0, i64 %idxprom129
%92 = load float, float* %arrayidx130, align 4
store float %92, float* %ce, align 4
br label %if.end131
if.end131: ; preds = %if.else120, %if.then110
br label %if.end132
if.end132: ; preds = %if.end131, %if.then98
br label %if.end133
if.end133: ; preds = %if.end132, %if.then88
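; Divergence: d_sum = cn*N_C[index] + cs*S_C[index] + cw*W_C[index] + ce*E_C[index]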
%93 = load float, float* %cn, align 4
%94 = load float*, float** %N_C.addr, align 8
%95 = load i32, i32* %index, align 4
%idxprom134 = sext i32 %95 to i64
%arrayidx135 = getelementptr inbounds float, float* %94, i64 %idxprom134
%96 = load float, float* %arrayidx135, align 4
%mul136 = fmul contract float %93, %96
%97 = load float, float* %cs, align 4
%98 = load float*, float** %S_C.addr, align 8
%99 = load i32, i32* %index, align 4
%idxprom137 = sext i32 %99 to i64
%arrayidx138 = getelementptr inbounds float, float* %98, i64 %idxprom137
%100 = load float, float* %arrayidx138, align 4
%mul139 = fmul contract float %97, %100
%add140 = fadd contract float %mul136, %mul139
%101 = load float, float* %cw, align 4
%102 = load float*, float** %W_C.addr, align 8
%103 = load i32, i32* %index, align 4
%idxprom141 = sext i32 %103 to i64
%arrayidx142 = getelementptr inbounds float, float* %102, i64 %idxprom141
%104 = load float, float* %arrayidx142, align 4
%mul143 = fmul contract float %101, %104
%add144 = fadd contract float %add140, %mul143
%105 = load float, float* %ce, align 4
%106 = load float*, float** %E_C.addr, align 8
%107 = load i32, i32* %index, align 4
%idxprom145 = sext i32 %107 to i64
%arrayidx146 = getelementptr inbounds float, float* %106, i64 %idxprom145
%108 = load float, float* %arrayidx146, align 4
%mul147 = fmul contract float %105, %108
%add148 = fadd contract float %add144, %mul147
store float %add148, float* %d_sum, align 4
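; Update: c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum
; (accumulated in double precision, then truncated back to float)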
%109 = load i32, i32* %ty, align 4
%idxprom149 = sext i32 %109 to i64
%arrayidx150 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE4temp to [16 x [16 x float]]*), i64 0, i64 %idxprom149
%110 = load i32, i32* %tx, align 4
%idxprom151 = sext i32 %110 to i64
%arrayidx152 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx150, i64 0, i64 %idxprom151
%111 = load float, float* %arrayidx152, align 4
%conv = fpext float %111 to double
%112 = load float, float* %lambda.addr, align 4
%conv153 = fpext float %112 to double
%mul154 = fmul contract double 2.500000e-01, %conv153
%113 = load float, float* %d_sum, align 4
%conv155 = fpext float %113 to double
%mul156 = fmul contract double %mul154, %conv155
%add157 = fadd contract double %conv, %mul156
%conv158 = fptrunc double %add157 to float
%114 = load i32, i32* %ty, align 4
%idxprom159 = sext i32 %114 to i64
%arrayidx160 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result to [16 x [16 x float]]*), i64 0, i64 %idxprom159
%115 = load i32, i32* %tx, align 4
%idxprom161 = sext i32 %115 to i64
%arrayidx162 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx160, i64 0, i64 %idxprom161
store float %conv158, float* %arrayidx162, align 4
call void @llvm.nvvm.barrier0()
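; Write the updated value back to global memory: J_cuda[index] = c_cuda_result[ty][tx]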
%116 = load i32, i32* %ty, align 4
%idxprom163 = sext i32 %116 to i64
%arrayidx164 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ11srad_cuda_2PfS_S_S_S_S_iiffE13c_cuda_result to [16 x [16 x float]]*), i64 0, i64 %idxprom163
%117 = load i32, i32* %tx, align 4
%idxprom165 = sext i32 %117 to i64
%arrayidx166 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx164, i64 0, i64 %idxprom165
%118 = load float, float* %arrayidx166, align 4
%119 = load float*, float** %J_cuda.addr, align 8
%120 = load i32, i32* %index, align 4
%idxprom167 = sext i32 %120 to i64
%arrayidx168 = getelementptr inbounds float, float* %119, i64 %idxprom167
store float %118, float* %arrayidx168, align 4
ret void
}
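; NVVM special-register intrinsics backing the blockIdx/threadIdx/gridDim accessors used above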
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
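; Module metadata: !3 and !4 register srad_cuda_1 and srad_cuda_2 as kernel entry points via nvvm.annotations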
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, float*, float*, float*, i32, i32, float)* @_Z11srad_cuda_1PfS_S_S_S_S_iif, !"kernel", i32 1}
!4 = !{void (float*, float*, float*, float*, float*, float*, i32, i32, float, float)* @_Z11srad_cuda_2PfS_S_S_S_S_iiff, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}