1934 lines
83 KiB
LLVM
1934 lines
83 KiB
LLVM
|
; ModuleID = 'main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||
|
source_filename = "main_test_cu.cu"
|
||
|
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||
|
target triple = "nvptx64-nvidia-cuda"
|
||
|
|
||
|
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||
|
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||
|
%struct.__cuda_builtin_blockDim_t = type { i8 }
|
||
|
%struct.__cuda_builtin_gridDim_t = type { i8 }
|
||
|
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||
|
|
||
|
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||
|
|
||
|
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||
|
|
||
|
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
|
||
|
|
||
|
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any
|
||
|
|
||
|
@_ZZ12histo_kernelPhlPjE4temp = internal addrspace(3) global [256 x i32] undef, align 4
|
||
|
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||
|
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||
|
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
|
||
|
@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
|
||
|
@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm = internal addrspace(3) global [3072 x i32] undef, align 4
|
||
|
@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax = internal addrspace(3) global i32 undef, align 4
|
||
|
@_ZZL10uniformAddPjS_iiiE3uni = internal addrspace(3) global i32 undef, align 4
|
||
|
@_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
|
||
|
@_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
|
||
|
@_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
|
||
|
@_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||
|
entry:
|
||
|
%p.addr = alloca i8**, align 8
|
||
|
%s.addr = alloca i64, align 8
|
||
|
store i8** %p, i8*** %p.addr, align 8
|
||
|
store i64 %s, i64* %s.addr, align 8
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||
|
entry:
|
||
|
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||
|
%c.addr = alloca i8*, align 8
|
||
|
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||
|
store i8* %c, i8** %c.addr, align 8
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||
|
entry:
|
||
|
%value.addr = alloca i32*, align 8
|
||
|
%attr.addr = alloca i32, align 4
|
||
|
%device.addr = alloca i32, align 4
|
||
|
store i32* %value, i32** %value.addr, align 8
|
||
|
store i32 %attr, i32* %attr.addr, align 4
|
||
|
store i32 %device, i32* %device.addr, align 4
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||
|
entry:
|
||
|
%device.addr = alloca i32*, align 8
|
||
|
store i32* %device, i32** %device.addr, align 8
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||
|
entry:
|
||
|
%numBlocks.addr = alloca i32*, align 8
|
||
|
%func.addr = alloca i8*, align 8
|
||
|
%blockSize.addr = alloca i32, align 4
|
||
|
%dynamicSmemSize.addr = alloca i64, align 8
|
||
|
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||
|
store i8* %func, i8** %func.addr, align 8
|
||
|
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||
|
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||
|
entry:
|
||
|
%numBlocks.addr = alloca i32*, align 8
|
||
|
%func.addr = alloca i8*, align 8
|
||
|
%blockSize.addr = alloca i32, align 4
|
||
|
%dynamicSmemSize.addr = alloca i64, align 8
|
||
|
%flags.addr = alloca i32, align 4
|
||
|
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||
|
store i8* %func, i8** %func.addr, align 8
|
||
|
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||
|
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||
|
store i32 %flags, i32* %flags.addr, align 4
|
||
|
ret i32 999
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_Z12histo_kernelPhlPj(i8* %buffer, i64 %size, i32* %histo) #0 {
|
||
|
entry:
|
||
|
%buffer.addr = alloca i8*, align 8
|
||
|
%size.addr = alloca i64, align 8
|
||
|
%histo.addr = alloca i32*, align 8
|
||
|
%i = alloca i32, align 4
|
||
|
%offset = alloca i32, align 4
|
||
|
store i8* %buffer, i8** %buffer.addr, align 8
|
||
|
store i64 %size, i64* %size.addr, align 8
|
||
|
store i32* %histo, i32** %histo.addr, align 8
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%idxprom = zext i32 %call to i64
|
||
|
%arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom
|
||
|
store i32 0, i32* %arrayidx, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%mul = mul i32 %call2, %call3
|
||
|
%add = add i32 %call1, %mul
|
||
|
store i32 %add, i32* %i, align 4
|
||
|
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%call5 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
|
||
|
%mul6 = mul i32 %call4, %call5
|
||
|
store i32 %mul6, i32* %offset, align 4
|
||
|
br label %while.cond
|
||
|
|
||
|
while.cond: ; preds = %while.body, %entry
|
||
|
%0 = load i32, i32* %i, align 4
|
||
|
%conv = sext i32 %0 to i64
|
||
|
%1 = load i64, i64* %size.addr, align 8
|
||
|
%cmp = icmp slt i64 %conv, %1
|
||
|
br i1 %cmp, label %while.body, label %while.end
|
||
|
|
||
|
while.body: ; preds = %while.cond
|
||
|
%2 = load i8*, i8** %buffer.addr, align 8
|
||
|
%3 = load i32, i32* %i, align 4
|
||
|
%idxprom7 = sext i32 %3 to i64
|
||
|
%arrayidx8 = getelementptr inbounds i8, i8* %2, i64 %idxprom7
|
||
|
%4 = load i8, i8* %arrayidx8, align 1
|
||
|
%idxprom9 = zext i8 %4 to i64
|
||
|
%arrayidx10 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom9
|
||
|
%call11 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx10, i32 1) #2
|
||
|
%5 = load i32, i32* %offset, align 4
|
||
|
%6 = load i32, i32* %i, align 4
|
||
|
%add12 = add nsw i32 %6, %5
|
||
|
store i32 %add12, i32* %i, align 4
|
||
|
br label %while.cond
|
||
|
|
||
|
while.end: ; preds = %while.cond
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%7 = load i32*, i32** %histo.addr, align 8
|
||
|
%call13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%idxprom14 = zext i32 %call13 to i64
|
||
|
%arrayidx15 = getelementptr inbounds i32, i32* %7, i64 %idxprom14
|
||
|
%call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%idxprom17 = zext i32 %call16 to i64
|
||
|
%arrayidx18 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom17
|
||
|
%8 = load i32, i32* %arrayidx18, align 4
|
||
|
%call19 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx15, i32 %8) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||
|
entry:
|
||
|
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||
|
ret i32 %0
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent nounwind
|
||
|
declare void @llvm.nvvm.barrier0() #2
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||
|
entry:
|
||
|
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||
|
ret i32 %0
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||
|
entry:
|
||
|
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||
|
ret i32 %0
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||
|
entry:
|
||
|
%0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||
|
ret i32 %0
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal i32 @_ZL9atomicAddPjj(i32* %address, i32 %val) #0 {
|
||
|
entry:
|
||
|
%address.addr = alloca i32*, align 8
|
||
|
%val.addr = alloca i32, align 4
|
||
|
store i32* %address, i32** %address.addr, align 8
|
||
|
store i32 %val, i32* %val.addr, align 4
|
||
|
%0 = load i32*, i32** %address.addr, align 8
|
||
|
%1 = load i32, i32* %val.addr, align 4
|
||
|
%call = call i32 @_ZL12__uAtomicAddPjj(i32* %0, i32 %1) #2
|
||
|
ret i32 %call
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_(i32* %data, i32* %gm_codewords, i32* %gm_codewordlens, i32* %cw32, i32* %cw32len, i32* %cw32idx, i32* %out, i32* %outidx) #0 {
|
||
|
entry:
|
||
|
%data.addr = alloca i32*, align 8
|
||
|
%gm_codewords.addr = alloca i32*, align 8
|
||
|
%gm_codewordlens.addr = alloca i32*, align 8
|
||
|
%cw32.addr = alloca i32*, align 8
|
||
|
%cw32len.addr = alloca i32*, align 8
|
||
|
%cw32idx.addr = alloca i32*, align 8
|
||
|
%out.addr = alloca i32*, align 8
|
||
|
%outidx.addr = alloca i32*, align 8
|
||
|
%kn = alloca i32, align 4
|
||
|
%k = alloca i32, align 4
|
||
|
%kc = alloca i32, align 4
|
||
|
%startbit = alloca i32, align 4
|
||
|
%wrbits = alloca i32, align 4
|
||
|
%cw64 = alloca i64, align 8
|
||
|
%val32 = alloca i32, align 4
|
||
|
%codewordlen = alloca i32, align 4
|
||
|
%tmpbyte = alloca i8, align 1
|
||
|
%tmpcwlen = alloca i8, align 1
|
||
|
%tmpcw32 = alloca i32, align 4
|
||
|
%codewords = alloca i32*, align 8
|
||
|
%codewordlens = alloca i32*, align 8
|
||
|
%as = alloca i32*, align 8
|
||
|
%i = alloca i32, align 4
|
||
|
%offset = alloca i32, align 4
|
||
|
%d = alloca i32, align 4
|
||
|
%ai = alloca i8, align 1
|
||
|
%bi = alloca i8, align 1
|
||
|
%d56 = alloca i32, align 4
|
||
|
%ai64 = alloca i8, align 1
|
||
|
%bi70 = alloca i8, align 1
|
||
|
%t = alloca i32, align 4
|
||
|
store i32* %data, i32** %data.addr, align 8
|
||
|
store i32* %gm_codewords, i32** %gm_codewords.addr, align 8
|
||
|
store i32* %gm_codewordlens, i32** %gm_codewordlens.addr, align 8
|
||
|
store i32* %cw32, i32** %cw32.addr, align 8
|
||
|
store i32* %cw32len, i32** %cw32len.addr, align 8
|
||
|
store i32* %cw32idx, i32** %cw32idx.addr, align 8
|
||
|
store i32* %out, i32** %out.addr, align 8
|
||
|
store i32* %outidx, i32** %outidx.addr, align 8
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%mul = mul i32 %call, %call1
|
||
|
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%add = add i32 %mul, %call2
|
||
|
store i32 %add, i32* %kn, align 4
|
||
|
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call3, i32* %k, align 4
|
||
|
store i64 0, i64* %cw64, align 8
|
||
|
store i32 0, i32* %codewordlen, align 4
|
||
|
store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 0), i32** %codewords, align 8
|
||
|
store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 256), i32** %codewordlens, align 8
|
||
|
store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 512), i32** %as, align 8
|
||
|
%0 = load i32*, i32** %gm_codewords.addr, align 8
|
||
|
%1 = load i32, i32* %k, align 4
|
||
|
%idxprom = zext i32 %1 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
|
||
|
%2 = load i32, i32* %arrayidx, align 4
|
||
|
%3 = load i32*, i32** %codewords, align 8
|
||
|
%4 = load i32, i32* %k, align 4
|
||
|
%idxprom4 = zext i32 %4 to i64
|
||
|
%arrayidx5 = getelementptr inbounds i32, i32* %3, i64 %idxprom4
|
||
|
store i32 %2, i32* %arrayidx5, align 4
|
||
|
%5 = load i32*, i32** %gm_codewordlens.addr, align 8
|
||
|
%6 = load i32, i32* %k, align 4
|
||
|
%idxprom6 = zext i32 %6 to i64
|
||
|
%arrayidx7 = getelementptr inbounds i32, i32* %5, i64 %idxprom6
|
||
|
%7 = load i32, i32* %arrayidx7, align 4
|
||
|
%8 = load i32*, i32** %codewordlens, align 8
|
||
|
%9 = load i32, i32* %k, align 4
|
||
|
%idxprom8 = zext i32 %9 to i64
|
||
|
%arrayidx9 = getelementptr inbounds i32, i32* %8, i64 %idxprom8
|
||
|
store i32 %7, i32* %arrayidx9, align 4
|
||
|
%10 = load i32*, i32** %data.addr, align 8
|
||
|
%11 = load i32, i32* %kn, align 4
|
||
|
%idxprom10 = zext i32 %11 to i64
|
||
|
%arrayidx11 = getelementptr inbounds i32, i32* %10, i64 %idxprom10
|
||
|
%12 = load i32, i32* %arrayidx11, align 4
|
||
|
store i32 %12, i32* %val32, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
store i32 0, i32* %i, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.cond: ; preds = %for.inc, %entry
|
||
|
%13 = load i32, i32* %i, align 4
|
||
|
%cmp = icmp ult i32 %13, 4
|
||
|
br i1 %cmp, label %for.body, label %for.end
|
||
|
|
||
|
for.body: ; preds = %for.cond
|
||
|
%14 = load i32, i32* %val32, align 4
|
||
|
%15 = load i32, i32* %i, align 4
|
||
|
%sub = sub i32 3, %15
|
||
|
%mul12 = mul i32 %sub, 8
|
||
|
%shr = lshr i32 %14, %mul12
|
||
|
%conv = trunc i32 %shr to i8
|
||
|
store i8 %conv, i8* %tmpbyte, align 1
|
||
|
%16 = load i32*, i32** %codewords, align 8
|
||
|
%17 = load i8, i8* %tmpbyte, align 1
|
||
|
%idxprom13 = zext i8 %17 to i64
|
||
|
%arrayidx14 = getelementptr inbounds i32, i32* %16, i64 %idxprom13
|
||
|
%18 = load i32, i32* %arrayidx14, align 4
|
||
|
store i32 %18, i32* %tmpcw32, align 4
|
||
|
%19 = load i32*, i32** %codewordlens, align 8
|
||
|
%20 = load i8, i8* %tmpbyte, align 1
|
||
|
%idxprom15 = zext i8 %20 to i64
|
||
|
%arrayidx16 = getelementptr inbounds i32, i32* %19, i64 %idxprom15
|
||
|
%21 = load i32, i32* %arrayidx16, align 4
|
||
|
%conv17 = trunc i32 %21 to i8
|
||
|
store i8 %conv17, i8* %tmpcwlen, align 1
|
||
|
%22 = load i64, i64* %cw64, align 8
|
||
|
%23 = load i8, i8* %tmpcwlen, align 1
|
||
|
%conv18 = zext i8 %23 to i32
|
||
|
%sh_prom = zext i32 %conv18 to i64
|
||
|
%shl = shl i64 %22, %sh_prom
|
||
|
%24 = load i32, i32* %tmpcw32, align 4
|
||
|
%conv19 = zext i32 %24 to i64
|
||
|
%or = or i64 %shl, %conv19
|
||
|
store i64 %or, i64* %cw64, align 8
|
||
|
%25 = load i8, i8* %tmpcwlen, align 1
|
||
|
%conv20 = zext i8 %25 to i32
|
||
|
%26 = load i32, i32* %codewordlen, align 4
|
||
|
%add21 = add i32 %26, %conv20
|
||
|
store i32 %add21, i32* %codewordlen, align 4
|
||
|
br label %for.inc
|
||
|
|
||
|
for.inc: ; preds = %for.body
|
||
|
%27 = load i32, i32* %i, align 4
|
||
|
%inc = add i32 %27, 1
|
||
|
store i32 %inc, i32* %i, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.end: ; preds = %for.cond
|
||
|
%28 = load i32, i32* %codewordlen, align 4
|
||
|
%29 = load i32*, i32** %as, align 8
|
||
|
%30 = load i32, i32* %k, align 4
|
||
|
%idxprom22 = zext i32 %30 to i64
|
||
|
%arrayidx23 = getelementptr inbounds i32, i32* %29, i64 %idxprom22
|
||
|
store i32 %28, i32* %arrayidx23, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
store i32 1, i32* %offset, align 4
|
||
|
%call24 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shr25 = lshr i32 %call24, 1
|
||
|
store i32 %shr25, i32* %d, align 4
|
||
|
br label %for.cond26
|
||
|
|
||
|
for.cond26: ; preds = %for.inc46, %for.end
|
||
|
%31 = load i32, i32* %d, align 4
|
||
|
%cmp27 = icmp ugt i32 %31, 0
|
||
|
br i1 %cmp27, label %for.body28, label %for.end48
|
||
|
|
||
|
for.body28: ; preds = %for.cond26
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%32 = load i32, i32* %k, align 4
|
||
|
%33 = load i32, i32* %d, align 4
|
||
|
%cmp29 = icmp ult i32 %32, %33
|
||
|
br i1 %cmp29, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %for.body28
|
||
|
%34 = load i32, i32* %offset, align 4
|
||
|
%35 = load i32, i32* %k, align 4
|
||
|
%mul30 = mul i32 2, %35
|
||
|
%add31 = add i32 %mul30, 1
|
||
|
%mul32 = mul i32 %34, %add31
|
||
|
%sub33 = sub i32 %mul32, 1
|
||
|
%conv34 = trunc i32 %sub33 to i8
|
||
|
store i8 %conv34, i8* %ai, align 1
|
||
|
%36 = load i32, i32* %offset, align 4
|
||
|
%37 = load i32, i32* %k, align 4
|
||
|
%mul35 = mul i32 2, %37
|
||
|
%add36 = add i32 %mul35, 2
|
||
|
%mul37 = mul i32 %36, %add36
|
||
|
%sub38 = sub i32 %mul37, 1
|
||
|
%conv39 = trunc i32 %sub38 to i8
|
||
|
store i8 %conv39, i8* %bi, align 1
|
||
|
%38 = load i32*, i32** %as, align 8
|
||
|
%39 = load i8, i8* %ai, align 1
|
||
|
%idxprom40 = zext i8 %39 to i64
|
||
|
%arrayidx41 = getelementptr inbounds i32, i32* %38, i64 %idxprom40
|
||
|
%40 = load i32, i32* %arrayidx41, align 4
|
||
|
%41 = load i32*, i32** %as, align 8
|
||
|
%42 = load i8, i8* %bi, align 1
|
||
|
%idxprom42 = zext i8 %42 to i64
|
||
|
%arrayidx43 = getelementptr inbounds i32, i32* %41, i64 %idxprom42
|
||
|
%43 = load i32, i32* %arrayidx43, align 4
|
||
|
%add44 = add i32 %43, %40
|
||
|
store i32 %add44, i32* %arrayidx43, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %for.body28
|
||
|
%44 = load i32, i32* %offset, align 4
|
||
|
%mul45 = mul i32 %44, 2
|
||
|
store i32 %mul45, i32* %offset, align 4
|
||
|
br label %for.inc46
|
||
|
|
||
|
for.inc46: ; preds = %if.end
|
||
|
%45 = load i32, i32* %d, align 4
|
||
|
%shr47 = lshr i32 %45, 1
|
||
|
store i32 %shr47, i32* %d, align 4
|
||
|
br label %for.cond26
|
||
|
|
||
|
for.end48: ; preds = %for.cond26
|
||
|
%46 = load i32, i32* %k, align 4
|
||
|
%cmp49 = icmp eq i32 %46, 0
|
||
|
br i1 %cmp49, label %if.then50, label %if.end55
|
||
|
|
||
|
if.then50: ; preds = %for.end48
|
||
|
%47 = load i32*, i32** %as, align 8
|
||
|
%call51 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%sub52 = sub i32 %call51, 1
|
||
|
%idxprom53 = zext i32 %sub52 to i64
|
||
|
%arrayidx54 = getelementptr inbounds i32, i32* %47, i64 %idxprom53
|
||
|
store i32 0, i32* %arrayidx54, align 4
|
||
|
br label %if.end55
|
||
|
|
||
|
if.end55: ; preds = %if.then50, %for.end48
|
||
|
store i32 1, i32* %d56, align 4
|
||
|
br label %for.cond57
|
||
|
|
||
|
for.cond57: ; preds = %for.inc86, %if.end55
|
||
|
%48 = load i32, i32* %d56, align 4
|
||
|
%call58 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%cmp59 = icmp ult i32 %48, %call58
|
||
|
br i1 %cmp59, label %for.body60, label %for.end88
|
||
|
|
||
|
for.body60: ; preds = %for.cond57
|
||
|
%49 = load i32, i32* %offset, align 4
|
||
|
%shr61 = lshr i32 %49, 1
|
||
|
store i32 %shr61, i32* %offset, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%50 = load i32, i32* %k, align 4
|
||
|
%51 = load i32, i32* %d56, align 4
|
||
|
%cmp62 = icmp ult i32 %50, %51
|
||
|
br i1 %cmp62, label %if.then63, label %if.end85
|
||
|
|
||
|
if.then63: ; preds = %for.body60
|
||
|
%52 = load i32, i32* %offset, align 4
|
||
|
%53 = load i32, i32* %k, align 4
|
||
|
%mul65 = mul i32 2, %53
|
||
|
%add66 = add i32 %mul65, 1
|
||
|
%mul67 = mul i32 %52, %add66
|
||
|
%sub68 = sub i32 %mul67, 1
|
||
|
%conv69 = trunc i32 %sub68 to i8
|
||
|
store i8 %conv69, i8* %ai64, align 1
|
||
|
%54 = load i32, i32* %offset, align 4
|
||
|
%55 = load i32, i32* %k, align 4
|
||
|
%mul71 = mul i32 2, %55
|
||
|
%add72 = add i32 %mul71, 2
|
||
|
%mul73 = mul i32 %54, %add72
|
||
|
%sub74 = sub i32 %mul73, 1
|
||
|
%conv75 = trunc i32 %sub74 to i8
|
||
|
store i8 %conv75, i8* %bi70, align 1
|
||
|
%56 = load i32*, i32** %as, align 8
|
||
|
%57 = load i8, i8* %ai64, align 1
|
||
|
%idxprom76 = zext i8 %57 to i64
|
||
|
%arrayidx77 = getelementptr inbounds i32, i32* %56, i64 %idxprom76
|
||
|
%58 = load i32, i32* %arrayidx77, align 4
|
||
|
store i32 %58, i32* %t, align 4
|
||
|
%59 = load i32*, i32** %as, align 8
|
||
|
%60 = load i8, i8* %bi70, align 1
|
||
|
%idxprom78 = zext i8 %60 to i64
|
||
|
%arrayidx79 = getelementptr inbounds i32, i32* %59, i64 %idxprom78
|
||
|
%61 = load i32, i32* %arrayidx79, align 4
|
||
|
%62 = load i32*, i32** %as, align 8
|
||
|
%63 = load i8, i8* %ai64, align 1
|
||
|
%idxprom80 = zext i8 %63 to i64
|
||
|
%arrayidx81 = getelementptr inbounds i32, i32* %62, i64 %idxprom80
|
||
|
store i32 %61, i32* %arrayidx81, align 4
|
||
|
%64 = load i32, i32* %t, align 4
|
||
|
%65 = load i32*, i32** %as, align 8
|
||
|
%66 = load i8, i8* %bi70, align 1
|
||
|
%idxprom82 = zext i8 %66 to i64
|
||
|
%arrayidx83 = getelementptr inbounds i32, i32* %65, i64 %idxprom82
|
||
|
%67 = load i32, i32* %arrayidx83, align 4
|
||
|
%add84 = add i32 %67, %64
|
||
|
store i32 %add84, i32* %arrayidx83, align 4
|
||
|
br label %if.end85
|
||
|
|
||
|
if.end85: ; preds = %if.then63, %for.body60
|
||
|
br label %for.inc86
|
||
|
|
||
|
for.inc86: ; preds = %if.end85
|
||
|
%68 = load i32, i32* %d56, align 4
|
||
|
%mul87 = mul i32 %68, 2
|
||
|
store i32 %mul87, i32* %d56, align 4
|
||
|
br label %for.cond57
|
||
|
|
||
|
for.end88: ; preds = %for.cond57
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%69 = load i32, i32* %k, align 4
|
||
|
%call89 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%sub90 = sub i32 %call89, 1
|
||
|
%cmp91 = icmp eq i32 %69, %sub90
|
||
|
br i1 %cmp91, label %if.then92, label %if.end102
|
||
|
|
||
|
if.then92: ; preds = %for.end88
|
||
|
%70 = load i32*, i32** %as, align 8
|
||
|
%71 = load i32, i32* %k, align 4
|
||
|
%idxprom93 = zext i32 %71 to i64
|
||
|
%arrayidx94 = getelementptr inbounds i32, i32* %70, i64 %idxprom93
|
||
|
%72 = load i32, i32* %arrayidx94, align 4
|
||
|
%73 = load i32, i32* %codewordlen, align 4
|
||
|
%add95 = add i32 %72, %73
|
||
|
%74 = load i32*, i32** %outidx.addr, align 8
|
||
|
%call96 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%idxprom97 = zext i32 %call96 to i64
|
||
|
%arrayidx98 = getelementptr inbounds i32, i32* %74, i64 %idxprom97
|
||
|
store i32 %add95, i32* %arrayidx98, align 4
|
||
|
%75 = load i32*, i32** %as, align 8
|
||
|
%76 = load i32, i32* %k, align 4
|
||
|
%idxprom99 = zext i32 %76 to i64
|
||
|
%arrayidx100 = getelementptr inbounds i32, i32* %75, i64 %idxprom99
|
||
|
%77 = load i32, i32* %arrayidx100, align 4
|
||
|
%78 = load i32, i32* %codewordlen, align 4
|
||
|
%add101 = add i32 %77, %78
|
||
|
%div = udiv i32 %add101, 32
|
||
|
store i32 %div, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4
|
||
|
br label %if.end102
|
||
|
|
||
|
if.end102: ; preds = %if.then92, %for.end88
|
||
|
%79 = load i32*, i32** %as, align 8
|
||
|
%80 = load i32, i32* %k, align 4
|
||
|
%idxprom103 = zext i32 %80 to i64
|
||
|
%arrayidx104 = getelementptr inbounds i32, i32* %79, i64 %idxprom103
|
||
|
%81 = load i32, i32* %arrayidx104, align 4
|
||
|
%div105 = udiv i32 %81, 32
|
||
|
store i32 %div105, i32* %kc, align 4
|
||
|
%82 = load i32*, i32** %as, align 8
|
||
|
%83 = load i32, i32* %k, align 4
|
||
|
%idxprom106 = zext i32 %83 to i64
|
||
|
%arrayidx107 = getelementptr inbounds i32, i32* %82, i64 %idxprom106
|
||
|
%84 = load i32, i32* %arrayidx107, align 4
|
||
|
%rem = urem i32 %84, 32
|
||
|
store i32 %rem, i32* %startbit, align 4
|
||
|
%85 = load i32*, i32** %as, align 8
|
||
|
%86 = load i32, i32* %k, align 4
|
||
|
%idxprom108 = zext i32 %86 to i64
|
||
|
%arrayidx109 = getelementptr inbounds i32, i32* %85, i64 %idxprom108
|
||
|
store i32 0, i32* %arrayidx109, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%87 = load i32, i32* %codewordlen, align 4
|
||
|
%88 = load i32, i32* %startbit, align 4
|
||
|
%sub110 = sub i32 32, %88
|
||
|
%cmp111 = icmp ugt i32 %87, %sub110
|
||
|
br i1 %cmp111, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %if.end102
|
||
|
%89 = load i32, i32* %startbit, align 4
|
||
|
%sub112 = sub i32 32, %89
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %if.end102
|
||
|
%90 = load i32, i32* %codewordlen, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %sub112, %cond.true ], [ %90, %cond.false ]
|
||
|
store i32 %cond, i32* %wrbits, align 4
|
||
|
%91 = load i64, i64* %cw64, align 8
|
||
|
%92 = load i32, i32* %codewordlen, align 4
|
||
|
%93 = load i32, i32* %wrbits, align 4
|
||
|
%sub113 = sub i32 %92, %93
|
||
|
%sh_prom114 = zext i32 %sub113 to i64
|
||
|
%shr115 = lshr i64 %91, %sh_prom114
|
||
|
%conv116 = trunc i64 %shr115 to i32
|
||
|
store i32 %conv116, i32* %tmpcw32, align 4
|
||
|
%94 = load i32*, i32** %as, align 8
|
||
|
%95 = load i32, i32* %kc, align 4
|
||
|
%idxprom117 = zext i32 %95 to i64
|
||
|
%arrayidx118 = getelementptr inbounds i32, i32* %94, i64 %idxprom117
|
||
|
%96 = load i32, i32* %tmpcw32, align 4
|
||
|
%97 = load i32, i32* %startbit, align 4
|
||
|
%sub119 = sub i32 32, %97
|
||
|
%98 = load i32, i32* %wrbits, align 4
|
||
|
%sub120 = sub i32 %sub119, %98
|
||
|
%shl121 = shl i32 %96, %sub120
|
||
|
%call122 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx118, i32 %shl121) #2
|
||
|
%99 = load i32, i32* %wrbits, align 4
|
||
|
%100 = load i32, i32* %codewordlen, align 4
|
||
|
%sub123 = sub i32 %100, %99
|
||
|
store i32 %sub123, i32* %codewordlen, align 4
|
||
|
%101 = load i32, i32* %codewordlen, align 4
|
||
|
%tobool = icmp ne i32 %101, 0
|
||
|
br i1 %tobool, label %if.then124, label %if.end143
|
||
|
|
||
|
if.then124: ; preds = %cond.end
|
||
|
%102 = load i32, i32* %codewordlen, align 4
|
||
|
%cmp125 = icmp ugt i32 %102, 32
|
||
|
br i1 %cmp125, label %cond.true126, label %cond.false127
|
||
|
|
||
|
cond.true126: ; preds = %if.then124
|
||
|
br label %cond.end128
|
||
|
|
||
|
cond.false127: ; preds = %if.then124
|
||
|
%103 = load i32, i32* %codewordlen, align 4
|
||
|
br label %cond.end128
|
||
|
|
||
|
cond.end128: ; preds = %cond.false127, %cond.true126
|
||
|
%cond129 = phi i32 [ 32, %cond.true126 ], [ %103, %cond.false127 ]
|
||
|
store i32 %cond129, i32* %wrbits, align 4
|
||
|
%104 = load i64, i64* %cw64, align 8
|
||
|
%105 = load i32, i32* %codewordlen, align 4
|
||
|
%106 = load i32, i32* %wrbits, align 4
|
||
|
%sub130 = sub i32 %105, %106
|
||
|
%sh_prom131 = zext i32 %sub130 to i64
|
||
|
%shr132 = lshr i64 %104, %sh_prom131
|
||
|
%conv133 = trunc i64 %shr132 to i32
|
||
|
%107 = load i32, i32* %wrbits, align 4
|
||
|
%shl134 = shl i32 1, %107
|
||
|
%sub135 = sub nsw i32 %shl134, 1
|
||
|
%and = and i32 %conv133, %sub135
|
||
|
store i32 %and, i32* %tmpcw32, align 4
|
||
|
%108 = load i32*, i32** %as, align 8
|
||
|
%109 = load i32, i32* %kc, align 4
|
||
|
%add136 = add i32 %109, 1
|
||
|
%idxprom137 = zext i32 %add136 to i64
|
||
|
%arrayidx138 = getelementptr inbounds i32, i32* %108, i64 %idxprom137
|
||
|
%110 = load i32, i32* %tmpcw32, align 4
|
||
|
%111 = load i32, i32* %wrbits, align 4
|
||
|
%sub139 = sub i32 32, %111
|
||
|
%shl140 = shl i32 %110, %sub139
|
||
|
%call141 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx138, i32 %shl140) #2
|
||
|
%112 = load i32, i32* %wrbits, align 4
|
||
|
%113 = load i32, i32* %codewordlen, align 4
|
||
|
%sub142 = sub i32 %113, %112
|
||
|
store i32 %sub142, i32* %codewordlen, align 4
|
||
|
br label %if.end143
|
||
|
|
||
|
if.end143: ; preds = %cond.end128, %cond.end
|
||
|
%114 = load i32, i32* %codewordlen, align 4
|
||
|
%tobool144 = icmp ne i32 %114, 0
|
||
|
br i1 %tobool144, label %if.then145, label %if.end157
|
||
|
|
||
|
if.then145: ; preds = %if.end143
|
||
|
%115 = load i64, i64* %cw64, align 8
|
||
|
%116 = load i32, i32* %codewordlen, align 4
|
||
|
%shl146 = shl i32 1, %116
|
||
|
%sub147 = sub nsw i32 %shl146, 1
|
||
|
%conv148 = sext i32 %sub147 to i64
|
||
|
%and149 = and i64 %115, %conv148
|
||
|
%conv150 = trunc i64 %and149 to i32
|
||
|
store i32 %conv150, i32* %tmpcw32, align 4
|
||
|
%117 = load i32*, i32** %as, align 8
|
||
|
%118 = load i32, i32* %kc, align 4
|
||
|
%add151 = add i32 %118, 2
|
||
|
%idxprom152 = zext i32 %add151 to i64
|
||
|
%arrayidx153 = getelementptr inbounds i32, i32* %117, i64 %idxprom152
|
||
|
%119 = load i32, i32* %tmpcw32, align 4
|
||
|
%120 = load i32, i32* %codewordlen, align 4
|
||
|
%sub154 = sub i32 32, %120
|
||
|
%shl155 = shl i32 %119, %sub154
|
||
|
%call156 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx153, i32 %shl155) #2
|
||
|
br label %if.end157
|
||
|
|
||
|
if.end157: ; preds = %if.then145, %if.end143
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%121 = load i32, i32* %k, align 4
|
||
|
%122 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4
|
||
|
%cmp158 = icmp ule i32 %121, %122
|
||
|
br i1 %cmp158, label %if.then159, label %if.end164
|
||
|
|
||
|
if.then159: ; preds = %if.end157
|
||
|
%123 = load i32*, i32** %as, align 8
|
||
|
%124 = load i32, i32* %k, align 4
|
||
|
%idxprom160 = zext i32 %124 to i64
|
||
|
%arrayidx161 = getelementptr inbounds i32, i32* %123, i64 %idxprom160
|
||
|
%125 = load i32, i32* %arrayidx161, align 4
|
||
|
%126 = load i32*, i32** %out.addr, align 8
|
||
|
%127 = load i32, i32* %kn, align 4
|
||
|
%idxprom162 = zext i32 %127 to i64
|
||
|
%arrayidx163 = getelementptr inbounds i32, i32* %126, i64 %idxprom162
|
||
|
store i32 %125, i32* %arrayidx163, align 4
|
||
|
br label %if.end164
|
||
|
|
||
|
if.end164: ; preds = %if.then159, %if.end157
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal i32 @_ZL8atomicOrPjj(i32* %address, i32 %val) #0 {
|
||
|
entry:
|
||
|
%address.addr = alloca i32*, align 8
|
||
|
%val.addr = alloca i32, align 4
|
||
|
store i32* %address, i32** %address.addr, align 8
|
||
|
store i32 %val, i32* %val.addr, align 4
|
||
|
%0 = load i32*, i32** %address.addr, align 8
|
||
|
%1 = load i32, i32* %val.addr, align 4
|
||
|
%call = call i32 @_ZL11__uAtomicOrPjj(i32* %0, i32 %1) #2
|
||
|
ret i32 %call
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL5pack2PjS_S_S_j(i32* %srcData, i32* %cindex, i32* %cindex2, i32* %dstData, i32 %original_num_block_elements) #0 {
|
||
|
entry:
|
||
|
%srcData.addr = alloca i32*, align 8
|
||
|
%cindex.addr = alloca i32*, align 8
|
||
|
%cindex2.addr = alloca i32*, align 8
|
||
|
%dstData.addr = alloca i32*, align 8
|
||
|
%original_num_block_elements.addr = alloca i32, align 4
|
||
|
%tid = alloca i32, align 4
|
||
|
%offset = alloca i32, align 4
|
||
|
%bitsize = alloca i32, align 4
|
||
|
%pos = alloca i32, align 4
|
||
|
%dword = alloca i32, align 4
|
||
|
%bit = alloca i32, align 4
|
||
|
%i = alloca i32, align 4
|
||
|
%dw = alloca i32, align 4
|
||
|
%tmp = alloca i32, align 4
|
||
|
store i32* %srcData, i32** %srcData.addr, align 8
|
||
|
store i32* %cindex, i32** %cindex.addr, align 8
|
||
|
store i32* %cindex2, i32** %cindex2.addr, align 8
|
||
|
store i32* %dstData, i32** %dstData.addr, align 8
|
||
|
store i32 %original_num_block_elements, i32* %original_num_block_elements.addr, align 4
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%mul = mul i32 %call, %call1
|
||
|
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%add = add i32 %mul, %call2
|
||
|
store i32 %add, i32* %tid, align 4
|
||
|
%0 = load i32, i32* %tid, align 4
|
||
|
%1 = load i32, i32* %original_num_block_elements.addr, align 4
|
||
|
%mul3 = mul i32 %0, %1
|
||
|
store i32 %mul3, i32* %offset, align 4
|
||
|
%2 = load i32*, i32** %cindex.addr, align 8
|
||
|
%3 = load i32, i32* %tid, align 4
|
||
|
%idxprom = zext i32 %3 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||
|
%4 = load i32, i32* %arrayidx, align 4
|
||
|
store i32 %4, i32* %bitsize, align 4
|
||
|
%5 = load i32*, i32** %cindex2.addr, align 8
|
||
|
%6 = load i32, i32* %tid, align 4
|
||
|
%idxprom4 = zext i32 %6 to i64
|
||
|
%arrayidx5 = getelementptr inbounds i32, i32* %5, i64 %idxprom4
|
||
|
%7 = load i32, i32* %arrayidx5, align 4
|
||
|
store i32 %7, i32* %pos, align 4
|
||
|
%8 = load i32, i32* %pos, align 4
|
||
|
%div = udiv i32 %8, 32
|
||
|
store i32 %div, i32* %dword, align 4
|
||
|
%9 = load i32, i32* %pos, align 4
|
||
|
%rem = urem i32 %9, 32
|
||
|
store i32 %rem, i32* %bit, align 4
|
||
|
%10 = load i32*, i32** %srcData.addr, align 8
|
||
|
%11 = load i32, i32* %offset, align 4
|
||
|
%idxprom6 = zext i32 %11 to i64
|
||
|
%arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6
|
||
|
%12 = load i32, i32* %arrayidx7, align 4
|
||
|
store i32 %12, i32* %dw, align 4
|
||
|
%13 = load i32, i32* %dw, align 4
|
||
|
%14 = load i32, i32* %bit, align 4
|
||
|
%shr = lshr i32 %13, %14
|
||
|
store i32 %shr, i32* %tmp, align 4
|
||
|
%15 = load i32*, i32** %dstData.addr, align 8
|
||
|
%16 = load i32, i32* %dword, align 4
|
||
|
%idxprom8 = zext i32 %16 to i64
|
||
|
%arrayidx9 = getelementptr inbounds i32, i32* %15, i64 %idxprom8
|
||
|
%17 = load i32, i32* %tmp, align 4
|
||
|
%call10 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx9, i32 %17) #2
|
||
|
%18 = load i32, i32* %bit, align 4
|
||
|
%cmp = icmp eq i32 %18, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%19 = load i32, i32* %dw, align 4
|
||
|
%20 = load i32, i32* %bit, align 4
|
||
|
%sub = sub i32 32, %20
|
||
|
%shl = shl i32 %19, %sub
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ 0, %cond.true ], [ %shl, %cond.false ]
|
||
|
store i32 %cond, i32* %tmp, align 4
|
||
|
store i32 1, i32* %i, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.cond: ; preds = %for.inc, %cond.end
|
||
|
%21 = load i32, i32* %i, align 4
|
||
|
%22 = load i32, i32* %bitsize, align 4
|
||
|
%div11 = udiv i32 %22, 32
|
||
|
%cmp12 = icmp ult i32 %21, %div11
|
||
|
br i1 %cmp12, label %for.body, label %for.end
|
||
|
|
||
|
for.body: ; preds = %for.cond
|
||
|
%23 = load i32*, i32** %srcData.addr, align 8
|
||
|
%24 = load i32, i32* %offset, align 4
|
||
|
%25 = load i32, i32* %i, align 4
|
||
|
%add13 = add i32 %24, %25
|
||
|
%idxprom14 = zext i32 %add13 to i64
|
||
|
%arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14
|
||
|
%26 = load i32, i32* %arrayidx15, align 4
|
||
|
store i32 %26, i32* %dw, align 4
|
||
|
%27 = load i32, i32* %dw, align 4
|
||
|
%28 = load i32, i32* %bit, align 4
|
||
|
%shr16 = lshr i32 %27, %28
|
||
|
%29 = load i32, i32* %tmp, align 4
|
||
|
%or = or i32 %29, %shr16
|
||
|
store i32 %or, i32* %tmp, align 4
|
||
|
%30 = load i32, i32* %tmp, align 4
|
||
|
%31 = load i32*, i32** %dstData.addr, align 8
|
||
|
%32 = load i32, i32* %dword, align 4
|
||
|
%33 = load i32, i32* %i, align 4
|
||
|
%add17 = add i32 %32, %33
|
||
|
%idxprom18 = zext i32 %add17 to i64
|
||
|
%arrayidx19 = getelementptr inbounds i32, i32* %31, i64 %idxprom18
|
||
|
store i32 %30, i32* %arrayidx19, align 4
|
||
|
%34 = load i32, i32* %bit, align 4
|
||
|
%cmp20 = icmp eq i32 %34, 0
|
||
|
br i1 %cmp20, label %cond.true21, label %cond.false22
|
||
|
|
||
|
cond.true21: ; preds = %for.body
|
||
|
br label %cond.end25
|
||
|
|
||
|
cond.false22: ; preds = %for.body
|
||
|
%35 = load i32, i32* %dw, align 4
|
||
|
%36 = load i32, i32* %bit, align 4
|
||
|
%sub23 = sub i32 32, %36
|
||
|
%shl24 = shl i32 %35, %sub23
|
||
|
br label %cond.end25
|
||
|
|
||
|
cond.end25: ; preds = %cond.false22, %cond.true21
|
||
|
%cond26 = phi i32 [ 0, %cond.true21 ], [ %shl24, %cond.false22 ]
|
||
|
store i32 %cond26, i32* %tmp, align 4
|
||
|
br label %for.inc
|
||
|
|
||
|
for.inc: ; preds = %cond.end25
|
||
|
%37 = load i32, i32* %i, align 4
|
||
|
%inc = add i32 %37, 1
|
||
|
store i32 %inc, i32* %i, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.end: ; preds = %for.cond
|
||
|
%38 = load i32, i32* %bit, align 4
|
||
|
%cmp27 = icmp ne i32 %38, 0
|
||
|
br i1 %cmp27, label %if.then, label %lor.lhs.false
|
||
|
|
||
|
lor.lhs.false: ; preds = %for.end
|
||
|
%39 = load i32, i32* %bitsize, align 4
|
||
|
%rem28 = urem i32 %39, 32
|
||
|
%cmp29 = icmp ne i32 %rem28, 0
|
||
|
br i1 %cmp29, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %lor.lhs.false, %for.end
|
||
|
%40 = load i32*, i32** %dstData.addr, align 8
|
||
|
%41 = load i32, i32* %dword, align 4
|
||
|
%42 = load i32, i32* %i, align 4
|
||
|
%add30 = add i32 %41, %42
|
||
|
%idxprom31 = zext i32 %add30 to i64
|
||
|
%arrayidx32 = getelementptr inbounds i32, i32* %40, i64 %idxprom31
|
||
|
%43 = load i32, i32* %tmp, align 4
|
||
|
%call33 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx32, i32 %43) #2
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %lor.lhs.false
|
||
|
%44 = load i32, i32* %bitsize, align 4
|
||
|
%rem34 = urem i32 %44, 32
|
||
|
%cmp35 = icmp ne i32 %rem34, 0
|
||
|
br i1 %cmp35, label %if.then36, label %if.end57
|
||
|
|
||
|
if.then36: ; preds = %if.end
|
||
|
%45 = load i32*, i32** %srcData.addr, align 8
|
||
|
%46 = load i32, i32* %offset, align 4
|
||
|
%47 = load i32, i32* %i, align 4
|
||
|
%add37 = add i32 %46, %47
|
||
|
%idxprom38 = zext i32 %add37 to i64
|
||
|
%arrayidx39 = getelementptr inbounds i32, i32* %45, i64 %idxprom38
|
||
|
%48 = load i32, i32* %arrayidx39, align 4
|
||
|
store i32 %48, i32* %dw, align 4
|
||
|
%49 = load i32*, i32** %dstData.addr, align 8
|
||
|
%50 = load i32, i32* %dword, align 4
|
||
|
%51 = load i32, i32* %i, align 4
|
||
|
%add40 = add i32 %50, %51
|
||
|
%idxprom41 = zext i32 %add40 to i64
|
||
|
%arrayidx42 = getelementptr inbounds i32, i32* %49, i64 %idxprom41
|
||
|
%52 = load i32, i32* %dw, align 4
|
||
|
%53 = load i32, i32* %bit, align 4
|
||
|
%shr43 = lshr i32 %52, %53
|
||
|
%call44 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx42, i32 %shr43) #2
|
||
|
%54 = load i32*, i32** %dstData.addr, align 8
|
||
|
%55 = load i32, i32* %dword, align 4
|
||
|
%56 = load i32, i32* %i, align 4
|
||
|
%add45 = add i32 %55, %56
|
||
|
%add46 = add i32 %add45, 1
|
||
|
%idxprom47 = zext i32 %add46 to i64
|
||
|
%arrayidx48 = getelementptr inbounds i32, i32* %54, i64 %idxprom47
|
||
|
%57 = load i32, i32* %bit, align 4
|
||
|
%cmp49 = icmp eq i32 %57, 0
|
||
|
br i1 %cmp49, label %cond.true50, label %cond.false51
|
||
|
|
||
|
cond.true50: ; preds = %if.then36
|
||
|
br label %cond.end54
|
||
|
|
||
|
cond.false51: ; preds = %if.then36
|
||
|
%58 = load i32, i32* %dw, align 4
|
||
|
%59 = load i32, i32* %bit, align 4
|
||
|
%sub52 = sub i32 32, %59
|
||
|
%shl53 = shl i32 %58, %sub52
|
||
|
br label %cond.end54
|
||
|
|
||
|
cond.end54: ; preds = %cond.false51, %cond.true50
|
||
|
%cond55 = phi i32 [ 0, %cond.true50 ], [ %shl53, %cond.false51 ]
|
||
|
%call56 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx48, i32 %cond55) #2
|
||
|
br label %if.end57
|
||
|
|
||
|
if.end57: ; preds = %cond.end54, %if.end
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL10uniformAddPjS_iii(i32* %g_data, i32* %uniforms, i32 %n, i32 %blockOffset, i32 %baseIndex) #0 {
|
||
|
entry:
|
||
|
%g_data.addr = alloca i32*, align 8
|
||
|
%uniforms.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%blockOffset.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%address = alloca i32, align 4
|
||
|
store i32* %g_data, i32** %g_data.addr, align 8
|
||
|
store i32* %uniforms, i32** %uniforms.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %blockOffset, i32* %blockOffset.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%cmp = icmp eq i32 %call, 0
|
||
|
br i1 %cmp, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %entry
|
||
|
%0 = load i32*, i32** %uniforms.addr, align 8
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%1 = load i32, i32* %blockOffset.addr, align 4
|
||
|
%add = add i32 %call1, %1
|
||
|
%idxprom = zext i32 %add to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
|
||
|
%2 = load i32, i32* %arrayidx, align 4
|
||
|
store i32 %2, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %entry
|
||
|
%call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call3, 1
|
||
|
%call4 = call i32 @_ZL7__mul24ii(i32 %call2, i32 %shl) #2
|
||
|
%3 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%add5 = add nsw i32 %call4, %3
|
||
|
%call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%add7 = add i32 %add5, %call6
|
||
|
store i32 %add7, i32* %address, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%4 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
|
||
|
%5 = load i32*, i32** %g_data.addr, align 8
|
||
|
%6 = load i32, i32* %address, align 4
|
||
|
%idxprom8 = zext i32 %6 to i64
|
||
|
%arrayidx9 = getelementptr inbounds i32, i32* %5, i64 %idxprom8
|
||
|
%7 = load i32, i32* %arrayidx9, align 4
|
||
|
%add10 = add i32 %7, %4
|
||
|
store i32 %add10, i32* %arrayidx9, align 4
|
||
|
%call11 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call12 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add13 = add i32 %call11, %call12
|
||
|
%8 = load i32, i32* %n.addr, align 4
|
||
|
%cmp14 = icmp ult i32 %add13, %8
|
||
|
%conv = zext i1 %cmp14 to i32
|
||
|
%9 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
|
||
|
%mul = mul i32 %conv, %9
|
||
|
%10 = load i32*, i32** %g_data.addr, align 8
|
||
|
%11 = load i32, i32* %address, align 4
|
||
|
%call15 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add16 = add i32 %11, %call15
|
||
|
%idxprom17 = zext i32 %add16 to i64
|
||
|
%arrayidx18 = getelementptr inbounds i32, i32* %10, i64 %idxprom17
|
||
|
%12 = load i32, i32* %arrayidx18, align 4
|
||
|
%add19 = add i32 %12, %mul
|
||
|
store i32 %add19, i32* %arrayidx18, align 4
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define internal i32 @_ZL7__mul24ii(i32 %__a, i32 %__b) #1 {
|
||
|
entry:
|
||
|
%__a.addr = alloca i32, align 4
|
||
|
%__b.addr = alloca i32, align 4
|
||
|
store i32 %__a, i32* %__a.addr, align 4
|
||
|
store i32 %__b, i32* %__b.addr, align 4
|
||
|
%0 = load i32, i32* %__a.addr, align 4
|
||
|
%1 = load i32, i32* %__b.addr, align 4
|
||
|
%call = call i32 @__nv_mul24(i32 %0, i32 %1) #2
|
||
|
ret i32 %call
|
||
|
}
|
||
|
|
||
|
; Function Attrs: nounwind readnone
|
||
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||
|
|
||
|
; Function Attrs: nounwind readnone
|
||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||
|
|
||
|
; Function Attrs: nounwind readnone
|
||
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3
|
||
|
|
||
|
; Function Attrs: nounwind readnone
|
||
|
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define internal i32 @_ZL12__uAtomicAddPjj(i32* %__p, i32 %__v) #1 {
|
||
|
entry:
|
||
|
%__p.addr = alloca i32*, align 8
|
||
|
%__v.addr = alloca i32, align 4
|
||
|
store i32* %__p, i32** %__p.addr, align 8
|
||
|
store i32 %__v, i32* %__v.addr, align 4
|
||
|
%0 = load i32*, i32** %__p.addr, align 8
|
||
|
%1 = load i32, i32* %__v.addr, align 4
|
||
|
%2 = atomicrmw add i32* %0, i32 %1 seq_cst
|
||
|
ret i32 %2
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent nounwind
|
||
|
define internal i32 @_ZL11__uAtomicOrPjj(i32* %__p, i32 %__v) #1 {
|
||
|
entry:
|
||
|
%__p.addr = alloca i32*, align 8
|
||
|
%__v.addr = alloca i32, align 4
|
||
|
store i32* %__p, i32** %__p.addr, align 8
|
||
|
store i32 %__v, i32* %__v.addr, align 4
|
||
|
%0 = load i32*, i32** %__p.addr, align 8
|
||
|
%1 = load i32, i32* %__v.addr, align 4
|
||
|
%2 = atomicrmw or i32* %0, i32 %1 seq_cst
|
||
|
ret i32 %2
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL7prescanILb1ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
%mem_ai = alloca i32, align 4
|
||
|
%mem_bi = alloca i32, align 4
|
||
|
%bankOffsetA = alloca i32, align 4
|
||
|
%bankOffsetB = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
%0 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%1 = load i32, i32* %n.addr, align 4
|
||
|
%2 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %2, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%3 = load i32, i32* %baseIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
|
||
|
call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%5 = load i32*, i32** %g_blockSums.addr, align 8
|
||
|
call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
|
||
|
%6 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%7 = load i32, i32* %n.addr, align 4
|
||
|
%8 = load i32, i32* %ai, align 4
|
||
|
%9 = load i32, i32* %bi, align 4
|
||
|
%10 = load i32, i32* %mem_ai, align 4
|
||
|
%11 = load i32, i32* %mem_bi, align 4
|
||
|
%12 = load i32, i32* %bankOffsetA, align 4
|
||
|
%13 = load i32, i32* %bankOffsetB, align 4
|
||
|
call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai.addr = alloca i32*, align 8
|
||
|
%bi.addr = alloca i32*, align 8
|
||
|
%mem_ai.addr = alloca i32*, align 8
|
||
|
%mem_bi.addr = alloca i32*, align 8
|
||
|
%bankOffsetA.addr = alloca i32*, align 8
|
||
|
%bankOffsetB.addr = alloca i32*, align 8
|
||
|
%thid = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
store i32* %ai, i32** %ai.addr, align 8
|
||
|
store i32* %bi, i32** %bi.addr, align 8
|
||
|
store i32* %mem_ai, i32** %mem_ai.addr, align 8
|
||
|
store i32* %mem_bi, i32** %mem_bi.addr, align 8
|
||
|
store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8
|
||
|
store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call, i32* %thid, align 4
|
||
|
%0 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%add = add i32 %0, %call1
|
||
|
%1 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
store i32 %add, i32* %1, align 4
|
||
|
%2 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
%3 = load i32, i32* %2, align 4
|
||
|
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add3 = add i32 %3, %call2
|
||
|
%4 = load i32*, i32** %mem_bi.addr, align 8
|
||
|
store i32 %add3, i32* %4, align 4
|
||
|
%5 = load i32, i32* %thid, align 4
|
||
|
%6 = load i32*, i32** %ai.addr, align 8
|
||
|
store i32 %5, i32* %6, align 4
|
||
|
%7 = load i32, i32* %thid, align 4
|
||
|
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add5 = add i32 %7, %call4
|
||
|
%8 = load i32*, i32** %bi.addr, align 8
|
||
|
store i32 %add5, i32* %8, align 4
|
||
|
%9 = load i32*, i32** %ai.addr, align 8
|
||
|
%10 = load i32, i32* %9, align 4
|
||
|
%shr = ashr i32 %10, 4
|
||
|
%11 = load i32*, i32** %bankOffsetA.addr, align 8
|
||
|
store i32 %shr, i32* %11, align 4
|
||
|
%12 = load i32*, i32** %bi.addr, align 8
|
||
|
%13 = load i32, i32* %12, align 4
|
||
|
%shr6 = ashr i32 %13, 4
|
||
|
%14 = load i32*, i32** %bankOffsetB.addr, align 8
|
||
|
store i32 %shr6, i32* %14, align 4
|
||
|
%15 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%16 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
%17 = load i32, i32* %16, align 4
|
||
|
%idxprom = sext i32 %17 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom
|
||
|
%18 = load i32, i32* %arrayidx, align 4
|
||
|
%19 = load i32*, i32** %s_data.addr, align 8
|
||
|
%20 = load i32*, i32** %ai.addr, align 8
|
||
|
%21 = load i32, i32* %20, align 4
|
||
|
%22 = load i32*, i32** %bankOffsetA.addr, align 8
|
||
|
%23 = load i32, i32* %22, align 4
|
||
|
%add7 = add nsw i32 %21, %23
|
||
|
%idxprom8 = sext i32 %add7 to i64
|
||
|
%arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8
|
||
|
store i32 %18, i32* %arrayidx9, align 4
|
||
|
%24 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%25 = load i32*, i32** %mem_bi.addr, align 8
|
||
|
%26 = load i32, i32* %25, align 4
|
||
|
%idxprom10 = sext i32 %26 to i64
|
||
|
%arrayidx11 = getelementptr inbounds i32, i32* %24, i64 %idxprom10
|
||
|
%27 = load i32, i32* %arrayidx11, align 4
|
||
|
%28 = load i32*, i32** %s_data.addr, align 8
|
||
|
%29 = load i32*, i32** %bi.addr, align 8
|
||
|
%30 = load i32, i32* %29, align 4
|
||
|
%31 = load i32*, i32** %bankOffsetB.addr, align 8
|
||
|
%32 = load i32, i32* %31, align 4
|
||
|
%add12 = add nsw i32 %30, %32
|
||
|
%idxprom13 = sext i32 %add12 to i64
|
||
|
%arrayidx14 = getelementptr inbounds i32, i32* %28, i64 %idxprom13
|
||
|
store i32 %27, i32* %arrayidx14, align 4
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL12prescanBlockILb1EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 {
|
||
|
entry:
|
||
|
%data.addr = alloca i32*, align 8
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%blockSums.addr = alloca i32*, align 8
|
||
|
%stride = alloca i32, align 4
|
||
|
store i32* %data, i32** %data.addr, align 8
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32* %blockSums, i32** %blockSums.addr, align 8
|
||
|
%0 = load i32*, i32** %data.addr, align 8
|
||
|
%call = call i32 @_ZL8buildSumPj(i32* %0) #2
|
||
|
store i32 %call, i32* %stride, align 4
|
||
|
%1 = load i32*, i32** %data.addr, align 8
|
||
|
%2 = load i32*, i32** %blockSums.addr, align 8
|
||
|
%3 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %3, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ]
|
||
|
call void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2
|
||
|
%5 = load i32*, i32** %data.addr, align 8
|
||
|
%6 = load i32, i32* %stride, align 4
|
||
|
call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%ai.addr = alloca i32, align 4
|
||
|
%bi.addr = alloca i32, align 4
|
||
|
%mem_ai.addr = alloca i32, align 4
|
||
|
%mem_bi.addr = alloca i32, align 4
|
||
|
%bankOffsetA.addr = alloca i32, align 4
|
||
|
%bankOffsetB.addr = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %ai, i32* %ai.addr, align 4
|
||
|
store i32 %bi, i32* %bi.addr, align 4
|
||
|
store i32 %mem_ai, i32* %mem_ai.addr, align 4
|
||
|
store i32 %mem_bi, i32* %mem_bi.addr, align 4
|
||
|
store i32 %bankOffsetA, i32* %bankOffsetA.addr, align 4
|
||
|
store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%0 = load i32*, i32** %s_data.addr, align 8
|
||
|
%1 = load i32, i32* %ai.addr, align 4
|
||
|
%2 = load i32, i32* %bankOffsetA.addr, align 4
|
||
|
%add = add nsw i32 %1, %2
|
||
|
%idxprom = sext i32 %add to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
|
||
|
%3 = load i32, i32* %arrayidx, align 4
|
||
|
%4 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%5 = load i32, i32* %mem_ai.addr, align 4
|
||
|
%idxprom1 = sext i32 %5 to i64
|
||
|
%arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1
|
||
|
store i32 %3, i32* %arrayidx2, align 4
|
||
|
%6 = load i32*, i32** %s_data.addr, align 8
|
||
|
%7 = load i32, i32* %bi.addr, align 4
|
||
|
%8 = load i32, i32* %bankOffsetB.addr, align 4
|
||
|
%add3 = add nsw i32 %7, %8
|
||
|
%idxprom4 = sext i32 %add3 to i64
|
||
|
%arrayidx5 = getelementptr inbounds i32, i32* %6, i64 %idxprom4
|
||
|
%9 = load i32, i32* %arrayidx5, align 4
|
||
|
%10 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%11 = load i32, i32* %mem_bi.addr, align 4
|
||
|
%idxprom6 = sext i32 %11 to i64
|
||
|
%arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6
|
||
|
store i32 %9, i32* %arrayidx7, align 4
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal i32 @_ZL8buildSumPj(i32* %s_data) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%thid = alloca i32, align 4
|
||
|
%stride = alloca i32, align 4
|
||
|
%d = alloca i32, align 4
|
||
|
%i = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call, i32* %thid, align 4
|
||
|
store i32 1, i32* %stride, align 4
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call1, i32* %d, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.cond: ; preds = %for.inc, %entry
|
||
|
%0 = load i32, i32* %d, align 4
|
||
|
%cmp = icmp sgt i32 %0, 0
|
||
|
br i1 %cmp, label %for.body, label %for.end
|
||
|
|
||
|
for.body: ; preds = %for.cond
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%1 = load i32, i32* %thid, align 4
|
||
|
%2 = load i32, i32* %d, align 4
|
||
|
%cmp2 = icmp ult i32 %1, %2
|
||
|
br i1 %cmp2, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %for.body
|
||
|
%3 = load i32, i32* %stride, align 4
|
||
|
%call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %3) #2
|
||
|
%4 = load i32, i32* %thid, align 4
|
||
|
%call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %4) #2
|
||
|
store i32 %call4, i32* %i, align 4
|
||
|
%5 = load i32, i32* %i, align 4
|
||
|
%6 = load i32, i32* %stride, align 4
|
||
|
%add = add i32 %5, %6
|
||
|
%sub = sub i32 %add, 1
|
||
|
store i32 %sub, i32* %ai, align 4
|
||
|
%7 = load i32, i32* %ai, align 4
|
||
|
%8 = load i32, i32* %stride, align 4
|
||
|
%add5 = add i32 %7, %8
|
||
|
store i32 %add5, i32* %bi, align 4
|
||
|
%9 = load i32, i32* %ai, align 4
|
||
|
%shr = ashr i32 %9, 4
|
||
|
%10 = load i32, i32* %ai, align 4
|
||
|
%add6 = add nsw i32 %10, %shr
|
||
|
store i32 %add6, i32* %ai, align 4
|
||
|
%11 = load i32, i32* %bi, align 4
|
||
|
%shr7 = ashr i32 %11, 4
|
||
|
%12 = load i32, i32* %bi, align 4
|
||
|
%add8 = add nsw i32 %12, %shr7
|
||
|
store i32 %add8, i32* %bi, align 4
|
||
|
%13 = load i32*, i32** %s_data.addr, align 8
|
||
|
%14 = load i32, i32* %ai, align 4
|
||
|
%idxprom = sext i32 %14 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom
|
||
|
%15 = load i32, i32* %arrayidx, align 4
|
||
|
%16 = load i32*, i32** %s_data.addr, align 8
|
||
|
%17 = load i32, i32* %bi, align 4
|
||
|
%idxprom9 = sext i32 %17 to i64
|
||
|
%arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
|
||
|
%18 = load i32, i32* %arrayidx10, align 4
|
||
|
%add11 = add i32 %18, %15
|
||
|
store i32 %add11, i32* %arrayidx10, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %for.body
|
||
|
%19 = load i32, i32* %stride, align 4
|
||
|
%mul = mul i32 %19, 2
|
||
|
store i32 %mul, i32* %stride, align 4
|
||
|
br label %for.inc
|
||
|
|
||
|
for.inc: ; preds = %if.end
|
||
|
%20 = load i32, i32* %d, align 4
|
||
|
%shr12 = ashr i32 %20, 1
|
||
|
store i32 %shr12, i32* %d, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.end: ; preds = %for.cond
|
||
|
%21 = load i32, i32* %stride, align 4
|
||
|
ret i32 %21
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %s_data, i32* %g_blockSums, i32 %blockIndex) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%index = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%cmp = icmp eq i32 %call, 0
|
||
|
br i1 %cmp, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %entry
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%sub = sub i32 %shl, 1
|
||
|
store i32 %sub, i32* %index, align 4
|
||
|
%0 = load i32, i32* %index, align 4
|
||
|
%shr = ashr i32 %0, 4
|
||
|
%1 = load i32, i32* %index, align 4
|
||
|
%add = add nsw i32 %1, %shr
|
||
|
store i32 %add, i32* %index, align 4
|
||
|
%2 = load i32*, i32** %s_data.addr, align 8
|
||
|
%3 = load i32, i32* %index, align 4
|
||
|
%idxprom = sext i32 %3 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||
|
%4 = load i32, i32* %arrayidx, align 4
|
||
|
%5 = load i32*, i32** %g_blockSums.addr, align 8
|
||
|
%6 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%idxprom2 = sext i32 %6 to i64
|
||
|
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
|
||
|
store i32 %4, i32* %arrayidx3, align 4
|
||
|
%7 = load i32*, i32** %s_data.addr, align 8
|
||
|
%8 = load i32, i32* %index, align 4
|
||
|
%idxprom4 = sext i32 %8 to i64
|
||
|
%arrayidx5 = getelementptr inbounds i32, i32* %7, i64 %idxprom4
|
||
|
store i32 0, i32* %arrayidx5, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %entry
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL16scanRootToLeavesPjj(i32* %s_data, i32 %stride) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%stride.addr = alloca i32, align 4
|
||
|
%thid = alloca i32, align 4
|
||
|
%d = alloca i32, align 4
|
||
|
%i = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
%t = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32 %stride, i32* %stride.addr, align 4
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call, i32* %thid, align 4
|
||
|
store i32 1, i32* %d, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.cond: ; preds = %for.inc, %entry
|
||
|
%0 = load i32, i32* %d, align 4
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%cmp = icmp ule i32 %0, %call1
|
||
|
br i1 %cmp, label %for.body, label %for.end
|
||
|
|
||
|
for.body: ; preds = %for.cond
|
||
|
%1 = load i32, i32* %stride.addr, align 4
|
||
|
%shr = lshr i32 %1, 1
|
||
|
store i32 %shr, i32* %stride.addr, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%2 = load i32, i32* %thid, align 4
|
||
|
%3 = load i32, i32* %d, align 4
|
||
|
%cmp2 = icmp ult i32 %2, %3
|
||
|
br i1 %cmp2, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %for.body
|
||
|
%4 = load i32, i32* %stride.addr, align 4
|
||
|
%call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %4) #2
|
||
|
%5 = load i32, i32* %thid, align 4
|
||
|
%call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %5) #2
|
||
|
store i32 %call4, i32* %i, align 4
|
||
|
%6 = load i32, i32* %i, align 4
|
||
|
%7 = load i32, i32* %stride.addr, align 4
|
||
|
%add = add i32 %6, %7
|
||
|
%sub = sub i32 %add, 1
|
||
|
store i32 %sub, i32* %ai, align 4
|
||
|
%8 = load i32, i32* %ai, align 4
|
||
|
%9 = load i32, i32* %stride.addr, align 4
|
||
|
%add5 = add i32 %8, %9
|
||
|
store i32 %add5, i32* %bi, align 4
|
||
|
%10 = load i32, i32* %ai, align 4
|
||
|
%shr6 = ashr i32 %10, 4
|
||
|
%11 = load i32, i32* %ai, align 4
|
||
|
%add7 = add nsw i32 %11, %shr6
|
||
|
store i32 %add7, i32* %ai, align 4
|
||
|
%12 = load i32, i32* %bi, align 4
|
||
|
%shr8 = ashr i32 %12, 4
|
||
|
%13 = load i32, i32* %bi, align 4
|
||
|
%add9 = add nsw i32 %13, %shr8
|
||
|
store i32 %add9, i32* %bi, align 4
|
||
|
%14 = load i32*, i32** %s_data.addr, align 8
|
||
|
%15 = load i32, i32* %ai, align 4
|
||
|
%idxprom = sext i32 %15 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %14, i64 %idxprom
|
||
|
%16 = load i32, i32* %arrayidx, align 4
|
||
|
store i32 %16, i32* %t, align 4
|
||
|
%17 = load i32*, i32** %s_data.addr, align 8
|
||
|
%18 = load i32, i32* %bi, align 4
|
||
|
%idxprom10 = sext i32 %18 to i64
|
||
|
%arrayidx11 = getelementptr inbounds i32, i32* %17, i64 %idxprom10
|
||
|
%19 = load i32, i32* %arrayidx11, align 4
|
||
|
%20 = load i32*, i32** %s_data.addr, align 8
|
||
|
%21 = load i32, i32* %ai, align 4
|
||
|
%idxprom12 = sext i32 %21 to i64
|
||
|
%arrayidx13 = getelementptr inbounds i32, i32* %20, i64 %idxprom12
|
||
|
store i32 %19, i32* %arrayidx13, align 4
|
||
|
%22 = load i32, i32* %t, align 4
|
||
|
%23 = load i32*, i32** %s_data.addr, align 8
|
||
|
%24 = load i32, i32* %bi, align 4
|
||
|
%idxprom14 = sext i32 %24 to i64
|
||
|
%arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14
|
||
|
%25 = load i32, i32* %arrayidx15, align 4
|
||
|
%add16 = add i32 %25, %22
|
||
|
store i32 %add16, i32* %arrayidx15, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %for.body
|
||
|
br label %for.inc
|
||
|
|
||
|
for.inc: ; preds = %if.end
|
||
|
%26 = load i32, i32* %d, align 4
|
||
|
%mul = mul nsw i32 %26, 2
|
||
|
store i32 %mul, i32* %d, align 4
|
||
|
br label %for.cond
|
||
|
|
||
|
for.end: ; preds = %for.cond
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL7prescanILb1ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
%mem_ai = alloca i32, align 4
|
||
|
%mem_bi = alloca i32, align 4
|
||
|
%bankOffsetA = alloca i32, align 4
|
||
|
%bankOffsetB = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
%0 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%1 = load i32, i32* %n.addr, align 4
|
||
|
%2 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %2, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%3 = load i32, i32* %baseIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
|
||
|
call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%5 = load i32*, i32** %g_blockSums.addr, align 8
|
||
|
call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
|
||
|
%6 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%7 = load i32, i32* %n.addr, align 4
|
||
|
%8 = load i32, i32* %ai, align 4
|
||
|
%9 = load i32, i32* %bi, align 4
|
||
|
%10 = load i32, i32* %mem_ai, align 4
|
||
|
%11 = load i32, i32* %mem_bi, align 4
|
||
|
%12 = load i32, i32* %bankOffsetA, align 4
|
||
|
%13 = load i32, i32* %bankOffsetB, align 4
|
||
|
call void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai.addr = alloca i32*, align 8
|
||
|
%bi.addr = alloca i32*, align 8
|
||
|
%mem_ai.addr = alloca i32*, align 8
|
||
|
%mem_bi.addr = alloca i32*, align 8
|
||
|
%bankOffsetA.addr = alloca i32*, align 8
|
||
|
%bankOffsetB.addr = alloca i32*, align 8
|
||
|
%thid = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
store i32* %ai, i32** %ai.addr, align 8
|
||
|
store i32* %bi, i32** %bi.addr, align 8
|
||
|
store i32* %mem_ai, i32** %mem_ai.addr, align 8
|
||
|
store i32* %mem_bi, i32** %mem_bi.addr, align 8
|
||
|
store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8
|
||
|
store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
store i32 %call, i32* %thid, align 4
|
||
|
%0 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%add = add i32 %0, %call1
|
||
|
%1 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
store i32 %add, i32* %1, align 4
|
||
|
%2 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
%3 = load i32, i32* %2, align 4
|
||
|
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add3 = add i32 %3, %call2
|
||
|
%4 = load i32*, i32** %mem_bi.addr, align 8
|
||
|
store i32 %add3, i32* %4, align 4
|
||
|
%5 = load i32, i32* %thid, align 4
|
||
|
%6 = load i32*, i32** %ai.addr, align 8
|
||
|
store i32 %5, i32* %6, align 4
|
||
|
%7 = load i32, i32* %thid, align 4
|
||
|
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%add5 = add i32 %7, %call4
|
||
|
%8 = load i32*, i32** %bi.addr, align 8
|
||
|
store i32 %add5, i32* %8, align 4
|
||
|
%9 = load i32*, i32** %ai.addr, align 8
|
||
|
%10 = load i32, i32* %9, align 4
|
||
|
%shr = ashr i32 %10, 4
|
||
|
%11 = load i32*, i32** %bankOffsetA.addr, align 8
|
||
|
store i32 %shr, i32* %11, align 4
|
||
|
%12 = load i32*, i32** %bi.addr, align 8
|
||
|
%13 = load i32, i32* %12, align 4
|
||
|
%shr6 = ashr i32 %13, 4
|
||
|
%14 = load i32*, i32** %bankOffsetB.addr, align 8
|
||
|
store i32 %shr6, i32* %14, align 4
|
||
|
%15 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%16 = load i32*, i32** %mem_ai.addr, align 8
|
||
|
%17 = load i32, i32* %16, align 4
|
||
|
%idxprom = sext i32 %17 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom
|
||
|
%18 = load i32, i32* %arrayidx, align 4
|
||
|
%19 = load i32*, i32** %s_data.addr, align 8
|
||
|
%20 = load i32*, i32** %ai.addr, align 8
|
||
|
%21 = load i32, i32* %20, align 4
|
||
|
%22 = load i32*, i32** %bankOffsetA.addr, align 8
|
||
|
%23 = load i32, i32* %22, align 4
|
||
|
%add7 = add nsw i32 %21, %23
|
||
|
%idxprom8 = sext i32 %add7 to i64
|
||
|
%arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8
|
||
|
store i32 %18, i32* %arrayidx9, align 4
|
||
|
%24 = load i32*, i32** %bi.addr, align 8
|
||
|
%25 = load i32, i32* %24, align 4
|
||
|
%26 = load i32, i32* %n.addr, align 4
|
||
|
%cmp = icmp slt i32 %25, %26
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%27 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%28 = load i32*, i32** %mem_bi.addr, align 8
|
||
|
%29 = load i32, i32* %28, align 4
|
||
|
%idxprom10 = sext i32 %29 to i64
|
||
|
%arrayidx11 = getelementptr inbounds i32, i32* %27, i64 %idxprom10
|
||
|
%30 = load i32, i32* %arrayidx11, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %30, %cond.true ], [ 0, %cond.false ]
|
||
|
%31 = load i32*, i32** %s_data.addr, align 8
|
||
|
%32 = load i32*, i32** %bi.addr, align 8
|
||
|
%33 = load i32, i32* %32, align 4
|
||
|
%34 = load i32*, i32** %bankOffsetB.addr, align 8
|
||
|
%35 = load i32, i32* %34, align 4
|
||
|
%add12 = add nsw i32 %33, %35
|
||
|
%idxprom13 = sext i32 %add12 to i64
|
||
|
%arrayidx14 = getelementptr inbounds i32, i32* %31, i64 %idxprom13
|
||
|
store i32 %cond, i32* %arrayidx14, align 4
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%ai.addr = alloca i32, align 4
|
||
|
%bi.addr = alloca i32, align 4
|
||
|
%mem_ai.addr = alloca i32, align 4
|
||
|
%mem_bi.addr = alloca i32, align 4
|
||
|
%bankOffsetA.addr = alloca i32, align 4
|
||
|
%bankOffsetB.addr = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %ai, i32* %ai.addr, align 4
|
||
|
store i32 %bi, i32* %bi.addr, align 4
|
||
|
store i32 %mem_ai, i32* %mem_ai.addr, align 4
|
||
|
store i32 %mem_bi, i32* %mem_bi.addr, align 4
|
||
|
store i32 %bankOffsetA, i32* %bankOffsetA.addr, align 4
|
||
|
store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4
|
||
|
call void @llvm.nvvm.barrier0()
|
||
|
%0 = load i32*, i32** %s_data.addr, align 8
|
||
|
%1 = load i32, i32* %ai.addr, align 4
|
||
|
%2 = load i32, i32* %bankOffsetA.addr, align 4
|
||
|
%add = add nsw i32 %1, %2
|
||
|
%idxprom = sext i32 %add to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
|
||
|
%3 = load i32, i32* %arrayidx, align 4
|
||
|
%4 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%5 = load i32, i32* %mem_ai.addr, align 4
|
||
|
%idxprom1 = sext i32 %5 to i64
|
||
|
%arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1
|
||
|
store i32 %3, i32* %arrayidx2, align 4
|
||
|
%6 = load i32, i32* %bi.addr, align 4
|
||
|
%7 = load i32, i32* %n.addr, align 4
|
||
|
%cmp = icmp slt i32 %6, %7
|
||
|
br i1 %cmp, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %entry
|
||
|
%8 = load i32*, i32** %s_data.addr, align 8
|
||
|
%9 = load i32, i32* %bi.addr, align 4
|
||
|
%10 = load i32, i32* %bankOffsetB.addr, align 4
|
||
|
%add3 = add nsw i32 %9, %10
|
||
|
%idxprom4 = sext i32 %add3 to i64
|
||
|
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
|
||
|
%11 = load i32, i32* %arrayidx5, align 4
|
||
|
%12 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%13 = load i32, i32* %mem_bi.addr, align 4
|
||
|
%idxprom6 = sext i32 %13 to i64
|
||
|
%arrayidx7 = getelementptr inbounds i32, i32* %12, i64 %idxprom6
|
||
|
store i32 %11, i32* %arrayidx7, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %entry
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL7prescanILb0ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
%mem_ai = alloca i32, align 4
|
||
|
%mem_bi = alloca i32, align 4
|
||
|
%bankOffsetA = alloca i32, align 4
|
||
|
%bankOffsetB = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
%0 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%1 = load i32, i32* %n.addr, align 4
|
||
|
%2 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %2, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%3 = load i32, i32* %baseIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
|
||
|
call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%5 = load i32*, i32** %g_blockSums.addr, align 8
|
||
|
call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
|
||
|
%6 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%7 = load i32, i32* %n.addr, align 4
|
||
|
%8 = load i32, i32* %ai, align 4
|
||
|
%9 = load i32, i32* %bi, align 4
|
||
|
%10 = load i32, i32* %mem_ai, align 4
|
||
|
%11 = load i32, i32* %mem_bi, align 4
|
||
|
%12 = load i32, i32* %bankOffsetA, align 4
|
||
|
%13 = load i32, i32* %bankOffsetB, align 4
|
||
|
call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL12prescanBlockILb0EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 {
|
||
|
entry:
|
||
|
%data.addr = alloca i32*, align 8
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%blockSums.addr = alloca i32*, align 8
|
||
|
%stride = alloca i32, align 4
|
||
|
store i32* %data, i32** %data.addr, align 8
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32* %blockSums, i32** %blockSums.addr, align 8
|
||
|
%0 = load i32*, i32** %data.addr, align 8
|
||
|
%call = call i32 @_ZL8buildSumPj(i32* %0) #2
|
||
|
store i32 %call, i32* %stride, align 4
|
||
|
%1 = load i32*, i32** %data.addr, align 8
|
||
|
%2 = load i32*, i32** %blockSums.addr, align 8
|
||
|
%3 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %3, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ]
|
||
|
call void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2
|
||
|
%5 = load i32*, i32** %data.addr, align 8
|
||
|
%6 = load i32, i32* %stride, align 4
|
||
|
call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define internal void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %s_data, i32* %g_blockSums, i32 %blockIndex) #0 {
|
||
|
entry:
|
||
|
%s_data.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%index = alloca i32, align 4
|
||
|
store i32* %s_data, i32** %s_data.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||
|
%cmp = icmp eq i32 %call, 0
|
||
|
br i1 %cmp, label %if.then, label %if.end
|
||
|
|
||
|
if.then: ; preds = %entry
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%sub = sub i32 %shl, 1
|
||
|
store i32 %sub, i32* %index, align 4
|
||
|
%0 = load i32, i32* %index, align 4
|
||
|
%shr = ashr i32 %0, 4
|
||
|
%1 = load i32, i32* %index, align 4
|
||
|
%add = add nsw i32 %1, %shr
|
||
|
store i32 %add, i32* %index, align 4
|
||
|
%2 = load i32*, i32** %s_data.addr, align 8
|
||
|
%3 = load i32, i32* %index, align 4
|
||
|
%idxprom = sext i32 %3 to i64
|
||
|
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
|
||
|
store i32 0, i32* %arrayidx, align 4
|
||
|
br label %if.end
|
||
|
|
||
|
if.end: ; preds = %if.then, %entry
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: convergent noinline nounwind optnone
|
||
|
define dso_local void @_ZL7prescanILb0ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
|
||
|
entry:
|
||
|
%g_odata.addr = alloca i32*, align 8
|
||
|
%g_idata.addr = alloca i32*, align 8
|
||
|
%g_blockSums.addr = alloca i32*, align 8
|
||
|
%n.addr = alloca i32, align 4
|
||
|
%blockIndex.addr = alloca i32, align 4
|
||
|
%baseIndex.addr = alloca i32, align 4
|
||
|
%ai = alloca i32, align 4
|
||
|
%bi = alloca i32, align 4
|
||
|
%mem_ai = alloca i32, align 4
|
||
|
%mem_bi = alloca i32, align 4
|
||
|
%bankOffsetA = alloca i32, align 4
|
||
|
%bankOffsetB = alloca i32, align 4
|
||
|
store i32* %g_odata, i32** %g_odata.addr, align 8
|
||
|
store i32* %g_idata, i32** %g_idata.addr, align 8
|
||
|
store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
|
||
|
store i32 %n, i32* %n.addr, align 4
|
||
|
store i32 %blockIndex, i32* %blockIndex.addr, align 4
|
||
|
store i32 %baseIndex, i32* %baseIndex.addr, align 4
|
||
|
%0 = load i32*, i32** %g_idata.addr, align 8
|
||
|
%1 = load i32, i32* %n.addr, align 4
|
||
|
%2 = load i32, i32* %baseIndex.addr, align 4
|
||
|
%cmp = icmp eq i32 %2, 0
|
||
|
br i1 %cmp, label %cond.true, label %cond.false
|
||
|
|
||
|
cond.true: ; preds = %entry
|
||
|
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||
|
%call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
|
||
|
%shl = shl i32 %call1, 1
|
||
|
%call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.false: ; preds = %entry
|
||
|
%3 = load i32, i32* %baseIndex.addr, align 4
|
||
|
br label %cond.end
|
||
|
|
||
|
cond.end: ; preds = %cond.false, %cond.true
|
||
|
%cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
|
||
|
call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
|
||
|
%4 = load i32, i32* %blockIndex.addr, align 4
|
||
|
%5 = load i32*, i32** %g_blockSums.addr, align 8
|
||
|
call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
|
||
|
%6 = load i32*, i32** %g_odata.addr, align 8
|
||
|
%7 = load i32, i32* %n.addr, align 4
|
||
|
%8 = load i32, i32* %ai, align 4
|
||
|
%9 = load i32, i32* %bi, align 4
|
||
|
%10 = load i32, i32* %mem_ai, align 4
|
||
|
%11 = load i32, i32* %mem_bi, align 4
|
||
|
%12 = load i32, i32* %bankOffsetA, align 4
|
||
|
%13 = load i32, i32* %bankOffsetB, align 4
|
||
|
call void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; Function Attrs: alwaysinline convergent inlinehint nounwind
|
||
|
define internal i32 @__nv_mul24(i32 %x, i32 %y) #4 {
|
||
|
%1 = call i32 @llvm.nvvm.mul24.i(i32 %x, i32 %y)
|
||
|
ret i32 %1
|
||
|
}
|
||
|
|
||
|
; Function Attrs: nounwind readnone
|
||
|
declare i32 @llvm.nvvm.mul24.i(i32, i32) #3
|
||
|
|
||
|
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||
|
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||
|
attributes #2 = { convergent nounwind }
|
||
|
attributes #3 = { nounwind readnone }
|
||
|
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||
|
|
||
|
!llvm.module.flags = !{!0, !1, !2}
|
||
|
!nvvm.annotations = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !11, !13, !13, !13, !13, !14, !14, !13}
|
||
|
!llvm.ident = !{!15}
|
||
|
!nvvmir.version = !{!16}
|
||
|
|
||
|
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||
|
!1 = !{i32 1, !"wchar_size", i32 4}
|
||
|
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||
|
!3 = !{void (i8*, i64, i32*)* @_Z12histo_kernelPhlPj, !"kernel", i32 1}
|
||
|
!4 = !{void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, i32*)* @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_, !"kernel", i32 1}
|
||
|
!5 = !{void (i32*, i32*, i32*, i32*, i32)* @_ZL5pack2PjS_S_S_j, !"kernel", i32 1}
|
||
|
!6 = !{void (i32*, i32*, i32, i32, i32)* @_ZL10uniformAddPjS_iii, !"kernel", i32 1}
|
||
|
!7 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb0EEvPjPKjS0_iii, !"kernel", i32 1}
|
||
|
!8 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb1EEvPjPKjS0_iii, !"kernel", i32 1}
|
||
|
!9 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb0EEvPjPKjS0_iii, !"kernel", i32 1}
|
||
|
!10 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb1EEvPjPKjS0_iii, !"kernel", i32 1}
|
||
|
!11 = !{null, !"align", i32 8}
|
||
|
!12 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||
|
!13 = !{null, !"align", i32 16}
|
||
|
!14 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||
|
!15 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||
|
!16 = !{i32 1, i32 4}
|