; CuPBoP/examples/lud/lud_kernel-host-x86_64-unknown-linux-gnu.ll
; Host-side (x86_64) LLVM IR generated from cuda/lud_kernel.cu; the
; matching device code is embedded below as a CUDA fatbin blob (@3).
; ModuleID = 'lud_kernel-host-x86_64-unknown-linux-gnu.bc'
source_filename = "cuda/lud_kernel.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque
$_ZN4dim3C2Ejjj = comdat any
@0 = private unnamed_addr constant [21 x i8] c"_Z12lud_diagonalPfii\00", align 1
@1 = private unnamed_addr constant [22 x i8] c"_Z13lud_perimeterPfii\00", align 1
@2 = private unnamed_addr constant [21 x i8] c"_Z12lud_internalPfii\00", align 1
@3 = private constant [51057 x i8] c"P\EDU\BA\01\00\10\00`\C7\00\00\00\00\00\00\02\00\01\01@\00\00\00\E8\AE\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\002\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00@\AE\00\00\00\00\00\00\C0\A9\00\00\00\00\00\002\052\00@\008\00\03\00@\00\12\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z12lud_internalPfii\00.nv.info._Z12lud_internalPfii\00.nv.shared._Z12lud_internalPfii\00.nv.global\00.nv.constant0._Z12lud_internalPfii\00.text._Z13lud_perimeterPfii\00.nv.info._Z13lud_perimeterPfii\00.nv.shared._Z13lud_perimeterPfii\00.nv.constant0._Z13lud_perimeterPfii\00.text._Z12lud_diagonalPfii\00.nv.info._Z12lud_diagonalPfii\00.nv.shared._Z12lud_diagonalPfii\00.nv.constant0._Z12lud_diagonalPfii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z12lud_internalPfii\00.text._Z12lud_internalPfii\00.nv.info._Z12lud_internalPfii\00.nv.shared._Z12lud_internalPfii\00.nv.global\00threadIdx\00blockIdx\00$___ZZ12lud_internalPfiiE8peri_row__905\00$___ZZ12lud_internalPfiiE8peri_col__907\00.nv.constant0._Z12lud_internalPfii\00_param\00_Z13lud_perimeterPfii\00.text._Z13lud_perimeterPfii\00.nv.info._Z13lud_perimeterPfii\00.nv.shared._Z13lud_perimeterPfii\00$_Z13lud_perimeterPfii$__cuda_sm3x_div_rn_noftz_f32\00$_Z13lud_perimeterPfii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ13lud_perimeterPfiiE3dia__430\00$___ZZ13lud_perimeterPfiiE8peri_row__432\00$___ZZ13lud_perimeterPfiiE8peri_col__434\00.nv.constant0._Z13lud_perimeterPfii\00_Z12lud_diagonalPfii\00.text._Z12lud_diagonalPfii\00.nv.info._Z12lud_diagonalPfii\00.nv.shared._Z12lud_diagonalPfii\00$_Z12lud_diagonalPfii$__cuda_sm3x_div_rn_noftz_f32\00$_Z12lud_diagonalPfii$__cuda_sm3x_div_rn_noftz_f32_slowpath\00$___ZZ12lud_diagonalPfiiE6shadow__186\00.nv.constant0._Z12lud_diagonalPfii\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00G\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\00\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\00\00\00\03\00\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AB\00\00\00\01\00\0F\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B5\00\00\00\01\00\0F\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\0E\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00N\01\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\89\01\00\00\03\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AA\01\00\00\22\00\0C\00PN\00\00\00\00\00\00`\01\00\00\00\00\00\00\DE\01\00\00\22\00\0C\00\B0O\00\00\00\00\00\00P\08\00\00\00\00\00\00\91\02\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\CA\02\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\03\00\00\03\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00#\03\00\00\22\00\0D\00\E0$\00\00\00\00\00\00`\01\00\00\00\00\00\00V\03\00\00\22\00\0D\00@&\00\00\00\00\00\00@\08\00\00\00\00\00\00\B8\03\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0B\00\00\00\00\00\00\00\00\00@\15\00\00\00\00\00\008\01\00\00\12\10\0C\00\00\00\00\00\00\00\00\00\00X\00\00\00\00\00\00\B5\02\00\00\12\10\0D\00\00\00\00\00\00\00\00\00\80.\00\00\00\00\00\00\04/\08\00\13\00\00\00\11\00\00\00\04#\08\00\0F\00\00\00\00\00\00\00\04\12\08\00\0F\00\00\00\00\00\00\00\04\11\08\00\0F\00\00\00\00\00\00\00\04#\08\00\0E\00\00\00\00\00\00\00\04\12\08\00\0E\00\00\00\00\00\00\00\04\11\08\00\0E\00\00\00\00\00\00\00\04#\08\00\13\00\00\00\00\00\00\00\04\12\08\00\13\00\00\00 \00\00\00\04\11\08\00\13\00\00\00 
\00\00\00\04/\08\00\12\00\00\00\11\00\00\00\04#\08\00\0A\00\00\00\00\00\00\00\04\12\08\00\0A\00\00\00\00\00\00\00\04\11\08\00\0A\00\00\00\00\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00\00\00\00\00\04\11\08\00\09\00\00\00\00\00\00\00\04#\08\00\12\00\00\00\00\00\00\00\04\12\08\00\12\00\00\00 \00\00\00\04\11\08\00\12\00\00\00 \00\00\00\04/\08\00\11\00\00\00\0E\00
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([51057 x i8], [51057 x i8]* @3, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
@__cuda_gpubin_handle = internal global i8** null, align 8
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the CUDA kernel lud_diagonal(float*, int, int).
; Emitted by clang's CUDA lowering: it gathers pointers to the spilled
; arguments into %kernel_args, pops the launch configuration that the
; <<<grid, block>>> call site pushed via __cudaPushCallConfiguration
; (see _Z8lud_cudaPfi), and forwards everything to cudaLaunchKernel.
; This stub's own address doubles as the host-side handle registered
; for the device function in @__cuda_register_globals.
define dso_local void @_Z12lud_diagonalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 {
entry:
; Spill the three parameters so their addresses can be stored in the
; kernel argument array.
%m.addr = alloca float*, align 8
%matrix_dim.addr = alloca i32, align 4
%offset.addr = alloca i32, align 4
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store float* %m, float** %m.addr, align 8
store i32 %matrix_dim, i32* %matrix_dim.addr, align 4
store i32 %offset, i32* %offset.addr, align 4
; kernel_args[i] = address of the i-th spilled parameter.
%kernel_args = alloca i8*, i64 3, align 16
%0 = bitcast float** %m.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast i32* %matrix_dim.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast i32* %offset.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
; Retrieve grid/block dims, dynamic shared-memory size and stream that
; the call site pushed.
%6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%7 = load i64, i64* %shmem_size, align 8
%8 = load i8*, i8** %stream, align 8
; Coerce each 12-byte dim3 into { i64, i32 } for the cudaLaunchKernel
; ABI: on this little-endian target the i64 carries the x and y fields,
; the i32 carries z.
%9 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%10 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false)
%11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%12 = load i64, i64* %11, align 8
%13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%14 = load i32, i32* %13, align 8
%15 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%16 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false)
%17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%18 = load i64, i64* %17, align 8
%19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%20 = load i32, i32* %19, align 8
%21 = bitcast i8* %8 to %struct.CUstream_st*
; Launch; the stub's own address identifies the device function.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, i32)* @_Z12lud_diagonalPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21)
br label %setup.end
setup.end: ; preds = %entry
; The launch result is discarded, matching clang's default stub codegen.
ret void
}
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the CUDA kernel lud_perimeter(float*, int, int).
; Identical in shape to the _Z12lud_diagonalPfii stub: spill the
; arguments, build %kernel_args, pop the pushed launch configuration,
; coerce both dim3 values to { i64, i32 } (x,y in the i64; z in the
; i32), and call cudaLaunchKernel using this stub's address as the
; registered kernel handle.
define dso_local void @_Z13lud_perimeterPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 {
entry:
%m.addr = alloca float*, align 8
%matrix_dim.addr = alloca i32, align 4
%offset.addr = alloca i32, align 4
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store float* %m, float** %m.addr, align 8
store i32 %matrix_dim, i32* %matrix_dim.addr, align 4
store i32 %offset, i32* %offset.addr, align 4
; kernel_args[i] = address of the i-th spilled parameter.
%kernel_args = alloca i8*, i64 3, align 16
%0 = bitcast float** %m.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast i32* %matrix_dim.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast i32* %offset.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
; Retrieve the launch configuration pushed at the <<<...>>> call site.
%6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%7 = load i64, i64* %shmem_size, align 8
%8 = load i8*, i8** %stream, align 8
%9 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%10 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false)
%11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%12 = load i64, i64* %11, align 8
%13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%14 = load i32, i32* %13, align 8
%15 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%16 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false)
%17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%18 = load i64, i64* %17, align 8
%19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%20 = load i32, i32* %19, align 8
%21 = bitcast i8* %8 to %struct.CUstream_st*
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, i32)* @_Z13lud_perimeterPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the CUDA kernel lud_internal(float*, int, int).
; Identical in shape to the _Z12lud_diagonalPfii stub: spill the
; arguments, build %kernel_args, pop the pushed launch configuration,
; coerce both dim3 values to { i64, i32 } (x,y in the i64; z in the
; i32), and call cudaLaunchKernel using this stub's address as the
; registered kernel handle.
define dso_local void @_Z12lud_internalPfii(float* %m, i32 %matrix_dim, i32 %offset) #0 {
entry:
%m.addr = alloca float*, align 8
%matrix_dim.addr = alloca i32, align 4
%offset.addr = alloca i32, align 4
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store float* %m, float** %m.addr, align 8
store i32 %matrix_dim, i32* %matrix_dim.addr, align 4
store i32 %offset, i32* %offset.addr, align 4
; kernel_args[i] = address of the i-th spilled parameter.
%kernel_args = alloca i8*, i64 3, align 16
%0 = bitcast float** %m.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast i32* %matrix_dim.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast i32* %offset.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
; Retrieve the launch configuration pushed at the <<<...>>> call site.
%6 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%7 = load i64, i64* %shmem_size, align 8
%8 = load i8*, i8** %stream, align 8
%9 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%10 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %10, i64 12, i1 false)
%11 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%12 = load i64, i64* %11, align 8
%13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%14 = load i32, i32* %13, align 8
%15 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%16 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false)
%17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%18 = load i64, i64* %17, align 8
%19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%20 = load i32, i32* %19, align 8
%21 = bitcast i8* %8 to %struct.CUstream_st*
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, i32)* @_Z12lud_internalPfii to i8*), i64 %12, i32 %14, i64 %18, i32 %20, i8** %kernel_args, i64 %7, %struct.CUstream_st* %21)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
; Function Attrs: noinline optnone uwtable
; lud_cuda(float *m, int matrix_dim): host driver for the blocked LU
; decomposition, stepping over the matrix in 16-wide tiles.
; For i = 0, 16, 32, ... while i < matrix_dim - 16, it launches three
; kernels in sequence, with cudaDeviceSynchronize() after each:
;   lud_diagonal <<< dim3(1,1,1),  dim3(16,1,1) >>> (m, matrix_dim, i)
;   lud_perimeter<<< dim3(n,1,1),  dim3(32,1,1) >>> (m, matrix_dim, i)
;   lud_internal <<< dim3(n,n,1),  dim3(16,16,1)>>> (m, matrix_dim, i)
; where n = (matrix_dim - i)/16 - 1, and finishes with one last
; lud_diagonal<<<1,16>>> launch after the loop exits.
define dso_local void @_Z8lud_cudaPfi(float* %m, i32 %matrix_dim) #0 {
entry:
%m.addr = alloca float*, align 8
%matrix_dim.addr = alloca i32, align 4
%i = alloca i32, align 4
%dimBlock = alloca %struct.dim3, align 4
%m_debug = alloca float*, align 8
%agg.tmp = alloca %struct.dim3, align 4
%agg.tmp2 = alloca %struct.dim3, align 4
%agg.tmp.coerce = alloca { i64, i32 }, align 4
%agg.tmp2.coerce = alloca { i64, i32 }, align 4
%agg.tmp5 = alloca %struct.dim3, align 4
%agg.tmp8 = alloca %struct.dim3, align 4
%agg.tmp5.coerce = alloca { i64, i32 }, align 4
%agg.tmp8.coerce = alloca { i64, i32 }, align 4
%dimGrid = alloca %struct.dim3, align 4
%agg.tmp20 = alloca %struct.dim3, align 4
%agg.tmp21 = alloca %struct.dim3, align 4
%agg.tmp20.coerce = alloca { i64, i32 }, align 4
%agg.tmp21.coerce = alloca { i64, i32 }, align 4
%agg.tmp27 = alloca %struct.dim3, align 4
%agg.tmp28 = alloca %struct.dim3, align 4
%agg.tmp27.coerce = alloca { i64, i32 }, align 4
%agg.tmp28.coerce = alloca { i64, i32 }, align 4
store float* %m, float** %m.addr, align 8
store i32 %matrix_dim, i32* %matrix_dim.addr, align 4
store i32 0, i32* %i, align 4
; dimBlock = dim3(16, 16, 1): block shape used for lud_internal.
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimBlock, i32 16, i32 16, i32 1)
; m_debug = (float*)malloc(matrix_dim * matrix_dim * sizeof(float)).
; NOTE(review): this buffer is never read, written, or freed anywhere
; in this module -- looks like a debug leftover that leaks; confirm
; against the original .cu source.
%0 = load i32, i32* %matrix_dim.addr, align 4
%1 = load i32, i32* %matrix_dim.addr, align 4
%mul = mul nsw i32 %0, %1
%conv = sext i32 %mul to i64
%mul1 = mul i64 %conv, 4
%call = call noalias i8* @malloc(i64 %mul1) #5
%2 = bitcast i8* %call to float*
store float* %2, float** %m_debug, align 8
store i32 0, i32* %i, align 4
br label %for.cond
; for (i = 0; i < matrix_dim - 16; i += 16)
for.cond: ; preds = %for.inc, %entry
%3 = load i32, i32* %i, align 4
%4 = load i32, i32* %matrix_dim.addr, align 4
%sub = sub nsw i32 %4, 16
%cmp = icmp slt i32 %3, %sub
br i1 %cmp, label %for.body, label %for.end
; lud_diagonal<<<dim3(1,1,1), dim3(16,1,1)>>>(m, matrix_dim, i).
; Each <<<...>>> lowers to __cudaPushCallConfiguration; a nonzero
; return means the configuration could not be pushed and the stub call
; is skipped.
for.body: ; preds = %for.cond
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp, i32 1, i32 1, i32 1)
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp2, i32 16, i32 1, i32 1)
%5 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
%6 = bitcast %struct.dim3* %agg.tmp to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %5, i8* align 4 %6, i64 12, i1 false)
%7 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
%8 = load i64, i64* %7, align 4
%9 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
%10 = load i32, i32* %9, align 4
%11 = bitcast { i64, i32 }* %agg.tmp2.coerce to i8*
%12 = bitcast %struct.dim3* %agg.tmp2 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %11, i8* align 4 %12, i64 12, i1 false)
%13 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp2.coerce, i32 0, i32 0
%14 = load i64, i64* %13, align 4
%15 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp2.coerce, i32 0, i32 1
%16 = load i32, i32* %15, align 4
%call3 = call i32 @__cudaPushCallConfiguration(i64 %8, i32 %10, i64 %14, i32 %16, i64 0, i8* null)
%tobool = icmp ne i32 %call3, 0
br i1 %tobool, label %kcall.end, label %kcall.configok
kcall.configok: ; preds = %for.body
%17 = load float*, float** %m.addr, align 8
%18 = load i32, i32* %matrix_dim.addr, align 4
%19 = load i32, i32* %i, align 4
call void @_Z12lud_diagonalPfii(float* %17, i32 %18, i32 %19)
br label %kcall.end
kcall.end: ; preds = %kcall.configok, %for.body
; Wait for the diagonal kernel before the perimeter launch.
%call4 = call i32 @cudaDeviceSynchronize()
; lud_perimeter<<<dim3((matrix_dim-i)/16 - 1, 1, 1), dim3(32,1,1)>>>.
%20 = load i32, i32* %matrix_dim.addr, align 4
%21 = load i32, i32* %i, align 4
%sub6 = sub nsw i32 %20, %21
%div = sdiv i32 %sub6, 16
%sub7 = sub nsw i32 %div, 1
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp5, i32 %sub7, i32 1, i32 1)
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp8, i32 32, i32 1, i32 1)
%22 = bitcast { i64, i32 }* %agg.tmp5.coerce to i8*
%23 = bitcast %struct.dim3* %agg.tmp5 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %22, i8* align 4 %23, i64 12, i1 false)
%24 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp5.coerce, i32 0, i32 0
%25 = load i64, i64* %24, align 4
%26 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp5.coerce, i32 0, i32 1
%27 = load i32, i32* %26, align 4
%28 = bitcast { i64, i32 }* %agg.tmp8.coerce to i8*
%29 = bitcast %struct.dim3* %agg.tmp8 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %28, i8* align 4 %29, i64 12, i1 false)
%30 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp8.coerce, i32 0, i32 0
%31 = load i64, i64* %30, align 4
%32 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp8.coerce, i32 0, i32 1
%33 = load i32, i32* %32, align 4
%call9 = call i32 @__cudaPushCallConfiguration(i64 %25, i32 %27, i64 %31, i32 %33, i64 0, i8* null)
%tobool10 = icmp ne i32 %call9, 0
br i1 %tobool10, label %kcall.end12, label %kcall.configok11
kcall.configok11: ; preds = %kcall.end
%34 = load float*, float** %m.addr, align 8
%35 = load i32, i32* %matrix_dim.addr, align 4
%36 = load i32, i32* %i, align 4
call void @_Z13lud_perimeterPfii(float* %34, i32 %35, i32 %36)
br label %kcall.end12
kcall.end12: ; preds = %kcall.configok11, %kcall.end
; Wait for the perimeter kernel before the internal launch.
%call13 = call i32 @cudaDeviceSynchronize()
; dimGrid = dim3(n, n, 1) with n = (matrix_dim - i)/16 - 1;
; lud_internal<<<dimGrid, dimBlock>>>(m, matrix_dim, i).
%37 = load i32, i32* %matrix_dim.addr, align 4
%38 = load i32, i32* %i, align 4
%sub14 = sub nsw i32 %37, %38
%div15 = sdiv i32 %sub14, 16
%sub16 = sub nsw i32 %div15, 1
%39 = load i32, i32* %matrix_dim.addr, align 4
%40 = load i32, i32* %i, align 4
%sub17 = sub nsw i32 %39, %40
%div18 = sdiv i32 %sub17, 16
%sub19 = sub nsw i32 %div18, 1
call void @_ZN4dim3C2Ejjj(%struct.dim3* %dimGrid, i32 %sub16, i32 %sub19, i32 1)
%41 = bitcast %struct.dim3* %agg.tmp20 to i8*
%42 = bitcast %struct.dim3* %dimGrid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %41, i8* align 4 %42, i64 12, i1 false)
%43 = bitcast %struct.dim3* %agg.tmp21 to i8*
%44 = bitcast %struct.dim3* %dimBlock to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %43, i8* align 4 %44, i64 12, i1 false)
%45 = bitcast { i64, i32 }* %agg.tmp20.coerce to i8*
%46 = bitcast %struct.dim3* %agg.tmp20 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %45, i8* align 4 %46, i64 12, i1 false)
%47 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp20.coerce, i32 0, i32 0
%48 = load i64, i64* %47, align 4
%49 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp20.coerce, i32 0, i32 1
%50 = load i32, i32* %49, align 4
%51 = bitcast { i64, i32 }* %agg.tmp21.coerce to i8*
%52 = bitcast %struct.dim3* %agg.tmp21 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %51, i8* align 4 %52, i64 12, i1 false)
%53 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp21.coerce, i32 0, i32 0
%54 = load i64, i64* %53, align 4
%55 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp21.coerce, i32 0, i32 1
%56 = load i32, i32* %55, align 4
%call22 = call i32 @__cudaPushCallConfiguration(i64 %48, i32 %50, i64 %54, i32 %56, i64 0, i8* null)
%tobool23 = icmp ne i32 %call22, 0
br i1 %tobool23, label %kcall.end25, label %kcall.configok24
kcall.configok24: ; preds = %kcall.end12
%57 = load float*, float** %m.addr, align 8
%58 = load i32, i32* %matrix_dim.addr, align 4
%59 = load i32, i32* %i, align 4
call void @_Z12lud_internalPfii(float* %57, i32 %58, i32 %59)
br label %kcall.end25
kcall.end25: ; preds = %kcall.configok24, %kcall.end12
%call26 = call i32 @cudaDeviceSynchronize()
br label %for.inc
for.inc: ; preds = %kcall.end25
; Advance to the next 16-wide tile.
%60 = load i32, i32* %i, align 4
%add = add nsw i32 %60, 16
store i32 %add, i32* %i, align 4
br label %for.cond
; Final lud_diagonal<<<dim3(1,1,1), dim3(16,1,1)>>>(m, matrix_dim, i)
; on the last tile (%i holds the first value >= matrix_dim - 16).
for.end: ; preds = %for.cond
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp27, i32 1, i32 1, i32 1)
call void @_ZN4dim3C2Ejjj(%struct.dim3* %agg.tmp28, i32 16, i32 1, i32 1)
%61 = bitcast { i64, i32 }* %agg.tmp27.coerce to i8*
%62 = bitcast %struct.dim3* %agg.tmp27 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %61, i8* align 4 %62, i64 12, i1 false)
%63 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp27.coerce, i32 0, i32 0
%64 = load i64, i64* %63, align 4
%65 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp27.coerce, i32 0, i32 1
%66 = load i32, i32* %65, align 4
%67 = bitcast { i64, i32 }* %agg.tmp28.coerce to i8*
%68 = bitcast %struct.dim3* %agg.tmp28 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %67, i8* align 4 %68, i64 12, i1 false)
%69 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp28.coerce, i32 0, i32 0
%70 = load i64, i64* %69, align 4
%71 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp28.coerce, i32 0, i32 1
%72 = load i32, i32* %71, align 4
%call29 = call i32 @__cudaPushCallConfiguration(i64 %64, i32 %66, i64 %70, i32 %72, i64 0, i8* null)
%tobool30 = icmp ne i32 %call29, 0
br i1 %tobool30, label %kcall.end32, label %kcall.configok31
kcall.configok31: ; preds = %for.end
%73 = load float*, float** %m.addr, align 8
%74 = load i32, i32* %matrix_dim.addr, align 4
%75 = load i32, i32* %i, align 4
call void @_Z12lud_diagonalPfii(float* %73, i32 %74, i32 %75)
br label %kcall.end32
kcall.end32: ; preds = %kcall.configok31, %for.end
%call33 = call i32 @cudaDeviceSynchronize()
ret void
}
; Function Attrs: noinline nounwind optnone uwtable
; dim3::dim3(unsigned vx, unsigned vy, unsigned vz): inline constructor
; that stores the three components into the x, y, z fields of *this.
; Behaviorally identical to the -O0 clang output it replaces; the
; parameter spill/reload allocas have been folded into direct stores.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 {
entry:
%fld.x = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 0
store i32 %vx, i32* %fld.x, align 4
%fld.y = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 1
store i32 %vy, i32* %fld.y, align 4
%fld.z = getelementptr inbounds %struct.dim3, %struct.dim3* %this, i32 0, i32 2
store i32 %vz, i32* %fld.z, align 4
ret void
}
; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #3
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #4
declare dso_local i32 @cudaDeviceSynchronize() #4
; Registers the three kernel launch stubs with the CUDA runtime so the
; runtime can map each stub address to the device function of the same
; mangled name inside the fatbin.  %0 is the module handle returned by
; __cudaRegisterFatBinary.  Return values are ignored, matching clang's
; generated registration code.
define internal void @__cuda_register_globals(i8** %0) {
entry:
%1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z12lud_diagonalPfii to i8*), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
%2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z13lud_perimeterPfii to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
%3 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, i32)* @_Z12lud_internalPfii to i8*), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @2, i64 0, i64 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @2, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
ret void
}
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)
declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)
declare dso_local i8** @__cudaRegisterFatBinary(i8*)
; Module constructor (run at startup via @llvm.global_ctors): registers
; the embedded fatbin with the CUDA runtime, registers the kernel
; stubs, finalizes registration, and queues the matching unregister to
; run at program exit.
define internal void @__cuda_module_ctor(i8* %0) {
entry:
%1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
; Keep the handle so __cuda_module_dtor can unregister the same binary.
store i8** %1, i8*** @__cuda_gpubin_handle, align 8
call void @__cuda_register_globals(i8** %1)
call void @__cudaRegisterFatBinaryEnd(i8** %1)
; atexit's return value is intentionally ignored.
%2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
ret void
}
declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)
declare dso_local void @__cudaUnregisterFatBinary(i8**)
; Exit handler installed by __cuda_module_ctor: unregisters the fatbin
; using the handle saved in @__cuda_gpubin_handle.  The i8* parameter
; is never used by the body.
define internal void @__cuda_module_dtor(i8* %0) {
entry:
%1 = load i8**, i8*** @__cuda_gpubin_handle, align 8
call void @__cudaUnregisterFatBinary(i8** %1)
ret void
}
declare dso_local i32 @atexit(void (i8*)*)
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}