; ModuleID = 'main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "main_test_cu.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.__cuda_builtin_gridDim_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }

$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv = comdat any

@_ZZ12histo_kernelPhlPjE4temp = internal addrspace(3) global [256 x i32] undef, align 4
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@gridDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm = internal addrspace(3) global [3072 x i32] undef, align 4
@_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax = internal addrspace(3) global i32 undef, align 4
@_ZZL10uniformAddPjS_iiiE3uni = internal addrspace(3) global i32 undef, align 4
@_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
@_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
@_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4
@_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data = internal addrspace(3) global [3072 x i32] undef, align 4

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
}
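; The CUDA runtime entry points above and the two occupancy queries below are
; weak definitions that only spill their arguments and return 999 (which
; matches cudaErrorUnknown in the CUDA runtime numbering); they read as
; device-side placeholder stubs.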
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
}

; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
}
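; Kernel _Z12histo_kernelPhlPj, i.e. histo_kernel(unsigned char*, long,
; unsigned int*): a 256-bin byte histogram. Each block zeroes a shared
; temp[256], walks the buffer with a grid-stride loop while atomically
; incrementing temp, then atomically merges temp into the global histogram.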
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z12histo_kernelPhlPj(i8* %buffer, i64 %size, i32* %histo) #0 {
entry:
  %buffer.addr = alloca i8*, align 8
  %size.addr = alloca i64, align 8
  %histo.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  %offset = alloca i32, align 4
  store i8* %buffer, i8** %buffer.addr, align 8
  store i64 %size, i64* %size.addr, align 8
  store i32* %histo, i32** %histo.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %idxprom = zext i32 %call to i64
  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom
  store i32 0, i32* %arrayidx, align 4
  call void @llvm.nvvm.barrier0()
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %mul = mul i32 %call2, %call3
  %add = add i32 %call1, %mul
  store i32 %add, i32* %i, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %call5 = call i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #2
  %mul6 = mul i32 %call4, %call5
  store i32 %mul6, i32* %offset, align 4
  br label %while.cond

while.cond:                                       ; preds = %while.body, %entry
  %0 = load i32, i32* %i, align 4
  %conv = sext i32 %0 to i64
  %1 = load i64, i64* %size.addr, align 8
  %cmp = icmp slt i64 %conv, %1
  br i1 %cmp, label %while.body, label %while.end

while.body:                                       ; preds = %while.cond
  %2 = load i8*, i8** %buffer.addr, align 8
  %3 = load i32, i32* %i, align 4
  %idxprom7 = sext i32 %3 to i64
  %arrayidx8 = getelementptr inbounds i8, i8* %2, i64 %idxprom7
  %4 = load i8, i8* %arrayidx8, align 1
  %idxprom9 = zext i8 %4 to i64
  %arrayidx10 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom9
  %call11 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx10, i32 1) #2
  %5 = load i32, i32* %offset, align 4
  %6 = load i32, i32* %i, align 4
  %add12 = add nsw i32 %6, %5
  store i32 %add12, i32* %i, align 4
  br label %while.cond

while.end:                                        ; preds = %while.cond
  call void @llvm.nvvm.barrier0()
  %7 = load i32*, i32** %histo.addr, align 8
  %call13 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %idxprom14 = zext i32 %call13 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %7, i64 %idxprom14
  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %idxprom17 = zext i32 %call16 to i64
  %arrayidx18 = getelementptr inbounds [256 x i32], [256 x i32]* addrspacecast ([256 x i32] addrspace(3)* @_ZZ12histo_kernelPhlPjE4temp to [256 x i32]*), i64 0, i64 %idxprom17
  %8 = load i32, i32* %arrayidx18, align 4
  %call19 = call i32 @_ZL9atomicAddPjj(i32* %arrayidx15, i32 %8) #2
  ret void
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
}

; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
}

; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN24__cuda_builtin_gridDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  ret i32 %0
}

; Function Attrs: convergent noinline nounwind optnone
define internal i32 @_ZL9atomicAddPjj(i32* %address, i32 %val) #0 {
entry:
  %address.addr = alloca i32*, align 8
  %val.addr = alloca i32, align 4
  store i32* %address, i32** %address.addr, align 8
  store i32 %val, i32* %val.addr, align 4
  %0 = load i32*, i32** %address.addr, align 8
  %1 = load i32, i32* %val.addr, align 4
  %call = call i32 @_ZL12__uAtomicAddPjj(i32* %0, i32 %1) #2
  ret i32 %call
}
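; Kernel vlc_encode_kernel_sm64huff: a variable-length (Huffman-style, per the
; name) encoder. The shared block sm[3072] is split into codewords[0..255],
; codewordlens[256..511], and a scan workspace as[] starting at offset 512.
; Each thread encodes the 4 bytes of one input word into a 64-bit buffer, a
; shared-memory scan of the codeword lengths yields each thread's output bit
; offset, and the packed bits are emitted with atomicOr.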
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_(i32* %data, i32* %gm_codewords, i32* %gm_codewordlens, i32* %cw32, i32* %cw32len, i32* %cw32idx, i32* %out, i32* %outidx) #0 {
entry:
  %data.addr = alloca i32*, align 8
  %gm_codewords.addr = alloca i32*, align 8
  %gm_codewordlens.addr = alloca i32*, align 8
  %cw32.addr = alloca i32*, align 8
  %cw32len.addr = alloca i32*, align 8
  %cw32idx.addr = alloca i32*, align 8
  %out.addr = alloca i32*, align 8
  %outidx.addr = alloca i32*, align 8
  %kn = alloca i32, align 4
  %k = alloca i32, align 4
  %kc = alloca i32, align 4
  %startbit = alloca i32, align 4
  %wrbits = alloca i32, align 4
  %cw64 = alloca i64, align 8
  %val32 = alloca i32, align 4
  %codewordlen = alloca i32, align 4
  %tmpbyte = alloca i8, align 1
  %tmpcwlen = alloca i8, align 1
  %tmpcw32 = alloca i32, align 4
  %codewords = alloca i32*, align 8
  %codewordlens = alloca i32*, align 8
  %as = alloca i32*, align 8
  %i = alloca i32, align 4
  %offset = alloca i32, align 4
  %d = alloca i32, align 4
  %ai = alloca i8, align 1
  %bi = alloca i8, align 1
  %d56 = alloca i32, align 4
  %ai64 = alloca i8, align 1
  %bi70 = alloca i8, align 1
  %t = alloca i32, align 4
  store i32* %data, i32** %data.addr, align 8
  store i32* %gm_codewords, i32** %gm_codewords.addr, align 8
  store i32* %gm_codewordlens, i32** %gm_codewordlens.addr, align 8
  store i32* %cw32, i32** %cw32.addr, align 8
  store i32* %cw32len, i32** %cw32len.addr, align 8
  store i32* %cw32idx, i32** %cw32idx.addr, align 8
  store i32* %out, i32** %out.addr, align 8
  store i32* %outidx, i32** %outidx.addr, align 8
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %mul = mul i32 %call, %call1
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %add = add i32 %mul, %call2
  store i32 %add, i32* %kn, align 4
  %call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call3, i32* %k, align 4
  store i64 0, i64* %cw64, align 8
  store i32 0, i32* %codewordlen, align 4
  store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 0), i32** %codewords, align 8
  store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 256), i32** %codewordlens, align 8
  store i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E2sm to [3072 x i32]*), i64 0, i64 512), i32** %as, align 8
  %0 = load i32*, i32** %gm_codewords.addr, align 8
  %1 = load i32, i32* %k, align 4
  %idxprom = zext i32 %1 to i64
  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
  %2 = load i32, i32* %arrayidx, align 4
  %3 = load i32*, i32** %codewords, align 8
  %4 = load i32, i32* %k, align 4
  %idxprom4 = zext i32 %4 to i64
  %arrayidx5 = getelementptr inbounds i32, i32* %3, i64 %idxprom4
  store i32 %2, i32* %arrayidx5, align 4
  %5 = load i32*, i32** %gm_codewordlens.addr, align 8
  %6 = load i32, i32* %k, align 4
  %idxprom6 = zext i32 %6 to i64
  %arrayidx7 = getelementptr inbounds i32, i32* %5, i64 %idxprom6
  %7 = load i32, i32* %arrayidx7, align 4
  %8 = load i32*, i32** %codewordlens, align 8
  %9 = load i32, i32* %k, align 4
  %idxprom8 = zext i32 %9 to i64
  %arrayidx9 = getelementptr inbounds i32, i32* %8, i64 %idxprom8
  store i32 %7, i32* %arrayidx9, align 4
  %10 = load i32*, i32** %data.addr, align 8
  %11 = load i32, i32* %kn, align 4
  %idxprom10 = zext i32 %11 to i64
  %arrayidx11 = getelementptr inbounds i32, i32* %10, i64 %idxprom10
  %12 = load i32, i32* %arrayidx11, align 4
  store i32 %12, i32* %val32, align 4
  call void @llvm.nvvm.barrier0()
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %13 = load i32, i32* %i, align 4
  %cmp = icmp ult i32 %13, 4
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %14 = load i32, i32* %val32, align 4
  %15 = load i32, i32* %i, align 4
  %sub = sub i32 3, %15
  %mul12 = mul i32 %sub, 8
  %shr = lshr i32 %14, %mul12
  %conv = trunc i32 %shr to i8
  store i8 %conv, i8* %tmpbyte, align 1
  %16 = load i32*, i32** %codewords, align 8
  %17 = load i8, i8* %tmpbyte, align 1
  %idxprom13 = zext i8 %17 to i64
  %arrayidx14 = getelementptr inbounds i32, i32* %16, i64 %idxprom13
  %18 = load i32, i32* %arrayidx14, align 4
  store i32 %18, i32* %tmpcw32, align 4
  %19 = load i32*, i32** %codewordlens, align 8
  %20 = load i8, i8* %tmpbyte, align 1
  %idxprom15 = zext i8 %20 to i64
  %arrayidx16 = getelementptr inbounds i32, i32* %19, i64 %idxprom15
  %21 = load i32, i32* %arrayidx16, align 4
  %conv17 = trunc i32 %21 to i8
  store i8 %conv17, i8* %tmpcwlen, align 1
  %22 = load i64, i64* %cw64, align 8
  %23 = load i8, i8* %tmpcwlen, align 1
  %conv18 = zext i8 %23 to i32
  %sh_prom = zext i32 %conv18 to i64
  %shl = shl i64 %22, %sh_prom
  %24 = load i32, i32* %tmpcw32, align 4
  %conv19 = zext i32 %24 to i64
  %or = or i64 %shl, %conv19
  store i64 %or, i64* %cw64, align 8
  %25 = load i8, i8* %tmpcwlen, align 1
  %conv20 = zext i8 %25 to i32
  %26 = load i32, i32* %codewordlen, align 4
  %add21 = add i32 %26, %conv20
  store i32 %add21, i32* %codewordlen, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %27 = load i32, i32* %i, align 4
  %inc = add i32 %27, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %28 = load i32, i32* %codewordlen, align 4
  %29 = load i32*, i32** %as, align 8
  %30 = load i32, i32* %k, align 4
  %idxprom22 = zext i32 %30 to i64
  %arrayidx23 = getelementptr inbounds i32, i32* %29, i64 %idxprom22
  store i32 %28, i32* %arrayidx23, align 4
  call void @llvm.nvvm.barrier0()
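  ; Up-sweep (reduce) phase of a work-efficient scan over the per-thread
  ; codeword lengths stored in as[].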
  store i32 1, i32* %offset, align 4
  %call24 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shr25 = lshr i32 %call24, 1
  store i32 %shr25, i32* %d, align 4
  br label %for.cond26

for.cond26:                                       ; preds = %for.inc46, %for.end
  %31 = load i32, i32* %d, align 4
  %cmp27 = icmp ugt i32 %31, 0
  br i1 %cmp27, label %for.body28, label %for.end48

for.body28:                                       ; preds = %for.cond26
  call void @llvm.nvvm.barrier0()
  %32 = load i32, i32* %k, align 4
  %33 = load i32, i32* %d, align 4
  %cmp29 = icmp ult i32 %32, %33
  br i1 %cmp29, label %if.then, label %if.end

if.then:                                          ; preds = %for.body28
  %34 = load i32, i32* %offset, align 4
  %35 = load i32, i32* %k, align 4
  %mul30 = mul i32 2, %35
  %add31 = add i32 %mul30, 1
  %mul32 = mul i32 %34, %add31
  %sub33 = sub i32 %mul32, 1
  %conv34 = trunc i32 %sub33 to i8
  store i8 %conv34, i8* %ai, align 1
  %36 = load i32, i32* %offset, align 4
  %37 = load i32, i32* %k, align 4
  %mul35 = mul i32 2, %37
  %add36 = add i32 %mul35, 2
  %mul37 = mul i32 %36, %add36
  %sub38 = sub i32 %mul37, 1
  %conv39 = trunc i32 %sub38 to i8
  store i8 %conv39, i8* %bi, align 1
  %38 = load i32*, i32** %as, align 8
  %39 = load i8, i8* %ai, align 1
  %idxprom40 = zext i8 %39 to i64
  %arrayidx41 = getelementptr inbounds i32, i32* %38, i64 %idxprom40
  %40 = load i32, i32* %arrayidx41, align 4
  %41 = load i32*, i32** %as, align 8
  %42 = load i8, i8* %bi, align 1
  %idxprom42 = zext i8 %42 to i64
  %arrayidx43 = getelementptr inbounds i32, i32* %41, i64 %idxprom42
  %43 = load i32, i32* %arrayidx43, align 4
  %add44 = add i32 %43, %40
  store i32 %add44, i32* %arrayidx43, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body28
  %44 = load i32, i32* %offset, align 4
  %mul45 = mul i32 %44, 2
  store i32 %mul45, i32* %offset, align 4
  br label %for.inc46

for.inc46:                                        ; preds = %if.end
  %45 = load i32, i32* %d, align 4
  %shr47 = lshr i32 %45, 1
  store i32 %shr47, i32* %d, align 4
  br label %for.cond26
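  ; Thread 0 zeroes the last element, then the down-sweep phase turns the
  ; partial sums in as[] into an exclusive prefix sum of bit lengths.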
for.end48:                                        ; preds = %for.cond26
  %46 = load i32, i32* %k, align 4
  %cmp49 = icmp eq i32 %46, 0
  br i1 %cmp49, label %if.then50, label %if.end55

if.then50:                                        ; preds = %for.end48
  %47 = load i32*, i32** %as, align 8
  %call51 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %sub52 = sub i32 %call51, 1
  %idxprom53 = zext i32 %sub52 to i64
  %arrayidx54 = getelementptr inbounds i32, i32* %47, i64 %idxprom53
  store i32 0, i32* %arrayidx54, align 4
  br label %if.end55

if.end55:                                         ; preds = %if.then50, %for.end48
  store i32 1, i32* %d56, align 4
  br label %for.cond57

for.cond57:                                       ; preds = %for.inc86, %if.end55
  %48 = load i32, i32* %d56, align 4
  %call58 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %cmp59 = icmp ult i32 %48, %call58
  br i1 %cmp59, label %for.body60, label %for.end88

for.body60:                                       ; preds = %for.cond57
  %49 = load i32, i32* %offset, align 4
  %shr61 = lshr i32 %49, 1
  store i32 %shr61, i32* %offset, align 4
  call void @llvm.nvvm.barrier0()
  %50 = load i32, i32* %k, align 4
  %51 = load i32, i32* %d56, align 4
  %cmp62 = icmp ult i32 %50, %51
  br i1 %cmp62, label %if.then63, label %if.end85

if.then63:                                        ; preds = %for.body60
  %52 = load i32, i32* %offset, align 4
  %53 = load i32, i32* %k, align 4
  %mul65 = mul i32 2, %53
  %add66 = add i32 %mul65, 1
  %mul67 = mul i32 %52, %add66
  %sub68 = sub i32 %mul67, 1
  %conv69 = trunc i32 %sub68 to i8
  store i8 %conv69, i8* %ai64, align 1
  %54 = load i32, i32* %offset, align 4
  %55 = load i32, i32* %k, align 4
  %mul71 = mul i32 2, %55
  %add72 = add i32 %mul71, 2
  %mul73 = mul i32 %54, %add72
  %sub74 = sub i32 %mul73, 1
  %conv75 = trunc i32 %sub74 to i8
  store i8 %conv75, i8* %bi70, align 1
  %56 = load i32*, i32** %as, align 8
  %57 = load i8, i8* %ai64, align 1
  %idxprom76 = zext i8 %57 to i64
  %arrayidx77 = getelementptr inbounds i32, i32* %56, i64 %idxprom76
  %58 = load i32, i32* %arrayidx77, align 4
  store i32 %58, i32* %t, align 4
  %59 = load i32*, i32** %as, align 8
  %60 = load i8, i8* %bi70, align 1
  %idxprom78 = zext i8 %60 to i64
  %arrayidx79 = getelementptr inbounds i32, i32* %59, i64 %idxprom78
  %61 = load i32, i32* %arrayidx79, align 4
  %62 = load i32*, i32** %as, align 8
  %63 = load i8, i8* %ai64, align 1
  %idxprom80 = zext i8 %63 to i64
  %arrayidx81 = getelementptr inbounds i32, i32* %62, i64 %idxprom80
  store i32 %61, i32* %arrayidx81, align 4
  %64 = load i32, i32* %t, align 4
  %65 = load i32*, i32** %as, align 8
  %66 = load i8, i8* %bi70, align 1
  %idxprom82 = zext i8 %66 to i64
  %arrayidx83 = getelementptr inbounds i32, i32* %65, i64 %idxprom82
  %67 = load i32, i32* %arrayidx83, align 4
  %add84 = add i32 %67, %64
  store i32 %add84, i32* %arrayidx83, align 4
  br label %if.end85

if.end85:                                         ; preds = %if.then63, %for.body60
  br label %for.inc86

for.inc86:                                        ; preds = %if.end85
  %68 = load i32, i32* %d56, align 4
  %mul87 = mul i32 %68, 2
  store i32 %mul87, i32* %d56, align 4
  br label %for.cond57

for.end88:                                        ; preds = %for.cond57
  call void @llvm.nvvm.barrier0()
  %69 = load i32, i32* %k, align 4
  %call89 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %sub90 = sub i32 %call89, 1
  %cmp91 = icmp eq i32 %69, %sub90
  br i1 %cmp91, label %if.then92, label %if.end102

if.then92:                                        ; preds = %for.end88
  %70 = load i32*, i32** %as, align 8
  %71 = load i32, i32* %k, align 4
  %idxprom93 = zext i32 %71 to i64
  %arrayidx94 = getelementptr inbounds i32, i32* %70, i64 %idxprom93
  %72 = load i32, i32* %arrayidx94, align 4
  %73 = load i32, i32* %codewordlen, align 4
  %add95 = add i32 %72, %73
  %74 = load i32*, i32** %outidx.addr, align 8
  %call96 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %idxprom97 = zext i32 %call96 to i64
  %arrayidx98 = getelementptr inbounds i32, i32* %74, i64 %idxprom97
  store i32 %add95, i32* %arrayidx98, align 4
  %75 = load i32*, i32** %as, align 8
  %76 = load i32, i32* %k, align 4
  %idxprom99 = zext i32 %76 to i64
  %arrayidx100 = getelementptr inbounds i32, i32* %75, i64 %idxprom99
  %77 = load i32, i32* %arrayidx100, align 4
  %78 = load i32, i32* %codewordlen, align 4
  %add101 = add i32 %77, %78
  %div = udiv i32 %add101, 32
  store i32 %div, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4
  br label %if.end102
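  ; Each thread converts its scanned bit offset into a word index (kc) and a
  ; bit position (startbit), then writes its codeword as up to three
  ; atomicOr'ed 32-bit chunks; thread blockDim.x-1 has just published the
  ; block's total bit count (outidx) and last word index (kcmax) above.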
if.end102:                                        ; preds = %if.then92, %for.end88
  %79 = load i32*, i32** %as, align 8
  %80 = load i32, i32* %k, align 4
  %idxprom103 = zext i32 %80 to i64
  %arrayidx104 = getelementptr inbounds i32, i32* %79, i64 %idxprom103
  %81 = load i32, i32* %arrayidx104, align 4
  %div105 = udiv i32 %81, 32
  store i32 %div105, i32* %kc, align 4
  %82 = load i32*, i32** %as, align 8
  %83 = load i32, i32* %k, align 4
  %idxprom106 = zext i32 %83 to i64
  %arrayidx107 = getelementptr inbounds i32, i32* %82, i64 %idxprom106
  %84 = load i32, i32* %arrayidx107, align 4
  %rem = urem i32 %84, 32
  store i32 %rem, i32* %startbit, align 4
  %85 = load i32*, i32** %as, align 8
  %86 = load i32, i32* %k, align 4
  %idxprom108 = zext i32 %86 to i64
  %arrayidx109 = getelementptr inbounds i32, i32* %85, i64 %idxprom108
  store i32 0, i32* %arrayidx109, align 4
  call void @llvm.nvvm.barrier0()
  %87 = load i32, i32* %codewordlen, align 4
  %88 = load i32, i32* %startbit, align 4
  %sub110 = sub i32 32, %88
  %cmp111 = icmp ugt i32 %87, %sub110
  br i1 %cmp111, label %cond.true, label %cond.false

cond.true:                                        ; preds = %if.end102
  %89 = load i32, i32* %startbit, align 4
  %sub112 = sub i32 32, %89
  br label %cond.end

cond.false:                                       ; preds = %if.end102
  %90 = load i32, i32* %codewordlen, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %sub112, %cond.true ], [ %90, %cond.false ]
  store i32 %cond, i32* %wrbits, align 4
  %91 = load i64, i64* %cw64, align 8
  %92 = load i32, i32* %codewordlen, align 4
  %93 = load i32, i32* %wrbits, align 4
  %sub113 = sub i32 %92, %93
  %sh_prom114 = zext i32 %sub113 to i64
  %shr115 = lshr i64 %91, %sh_prom114
  %conv116 = trunc i64 %shr115 to i32
  store i32 %conv116, i32* %tmpcw32, align 4
  %94 = load i32*, i32** %as, align 8
  %95 = load i32, i32* %kc, align 4
  %idxprom117 = zext i32 %95 to i64
  %arrayidx118 = getelementptr inbounds i32, i32* %94, i64 %idxprom117
  %96 = load i32, i32* %tmpcw32, align 4
  %97 = load i32, i32* %startbit, align 4
  %sub119 = sub i32 32, %97
  %98 = load i32, i32* %wrbits, align 4
  %sub120 = sub i32 %sub119, %98
  %shl121 = shl i32 %96, %sub120
  %call122 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx118, i32 %shl121) #2
  %99 = load i32, i32* %wrbits, align 4
  %100 = load i32, i32* %codewordlen, align 4
  %sub123 = sub i32 %100, %99
  store i32 %sub123, i32* %codewordlen, align 4
  %101 = load i32, i32* %codewordlen, align 4
  %tobool = icmp ne i32 %101, 0
  br i1 %tobool, label %if.then124, label %if.end143

if.then124:                                       ; preds = %cond.end
  %102 = load i32, i32* %codewordlen, align 4
  %cmp125 = icmp ugt i32 %102, 32
  br i1 %cmp125, label %cond.true126, label %cond.false127

cond.true126:                                     ; preds = %if.then124
  br label %cond.end128

cond.false127:                                    ; preds = %if.then124
  %103 = load i32, i32* %codewordlen, align 4
  br label %cond.end128

cond.end128:                                      ; preds = %cond.false127, %cond.true126
  %cond129 = phi i32 [ 32, %cond.true126 ], [ %103, %cond.false127 ]
  store i32 %cond129, i32* %wrbits, align 4
  %104 = load i64, i64* %cw64, align 8
  %105 = load i32, i32* %codewordlen, align 4
  %106 = load i32, i32* %wrbits, align 4
  %sub130 = sub i32 %105, %106
  %sh_prom131 = zext i32 %sub130 to i64
  %shr132 = lshr i64 %104, %sh_prom131
  %conv133 = trunc i64 %shr132 to i32
  %107 = load i32, i32* %wrbits, align 4
  %shl134 = shl i32 1, %107
  %sub135 = sub nsw i32 %shl134, 1
  %and = and i32 %conv133, %sub135
  store i32 %and, i32* %tmpcw32, align 4
  %108 = load i32*, i32** %as, align 8
  %109 = load i32, i32* %kc, align 4
  %add136 = add i32 %109, 1
  %idxprom137 = zext i32 %add136 to i64
  %arrayidx138 = getelementptr inbounds i32, i32* %108, i64 %idxprom137
  %110 = load i32, i32* %tmpcw32, align 4
  %111 = load i32, i32* %wrbits, align 4
  %sub139 = sub i32 32, %111
  %shl140 = shl i32 %110, %sub139
  %call141 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx138, i32 %shl140) #2
  %112 = load i32, i32* %wrbits, align 4
  %113 = load i32, i32* %codewordlen, align 4
  %sub142 = sub i32 %113, %112
  store i32 %sub142, i32* %codewordlen, align 4
  br label %if.end143

if.end143:                                        ; preds = %cond.end128, %cond.end
  %114 = load i32, i32* %codewordlen, align 4
  %tobool144 = icmp ne i32 %114, 0
  br i1 %tobool144, label %if.then145, label %if.end157

if.then145:                                       ; preds = %if.end143
  %115 = load i64, i64* %cw64, align 8
  %116 = load i32, i32* %codewordlen, align 4
  %shl146 = shl i32 1, %116
  %sub147 = sub nsw i32 %shl146, 1
  %conv148 = sext i32 %sub147 to i64
  %and149 = and i64 %115, %conv148
  %conv150 = trunc i64 %and149 to i32
  store i32 %conv150, i32* %tmpcw32, align 4
  %117 = load i32*, i32** %as, align 8
  %118 = load i32, i32* %kc, align 4
  %add151 = add i32 %118, 2
  %idxprom152 = zext i32 %add151 to i64
  %arrayidx153 = getelementptr inbounds i32, i32* %117, i64 %idxprom152
  %119 = load i32, i32* %tmpcw32, align 4
  %120 = load i32, i32* %codewordlen, align 4
  %sub154 = sub i32 32, %120
  %shl155 = shl i32 %119, %sub154
  %call156 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx153, i32 %shl155) #2
  br label %if.end157

if.end157:                                        ; preds = %if.then145, %if.end143
  call void @llvm.nvvm.barrier0()
  %121 = load i32, i32* %k, align 4
  %122 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_E5kcmax to i32*), align 4
  %cmp158 = icmp ule i32 %121, %122
  br i1 %cmp158, label %if.then159, label %if.end164

if.then159:                                       ; preds = %if.end157
  %123 = load i32*, i32** %as, align 8
  %124 = load i32, i32* %k, align 4
  %idxprom160 = zext i32 %124 to i64
  %arrayidx161 = getelementptr inbounds i32, i32* %123, i64 %idxprom160
  %125 = load i32, i32* %arrayidx161, align 4
  %126 = load i32*, i32** %out.addr, align 8
  %127 = load i32, i32* %kn, align 4
  %idxprom162 = zext i32 %127 to i64
  %arrayidx163 = getelementptr inbounds i32, i32* %126, i64 %idxprom162
  store i32 %125, i32* %arrayidx163, align 4
  br label %if.end164

if.end164:                                        ; preds = %if.then159, %if.end157
  ret void
}

; Function Attrs: convergent noinline nounwind optnone
define internal i32 @_ZL8atomicOrPjj(i32* %address, i32 %val) #0 {
entry:
  %address.addr = alloca i32*, align 8
  %val.addr = alloca i32, align 4
  store i32* %address, i32** %address.addr, align 8
  store i32 %val, i32* %val.addr, align 4
  %0 = load i32*, i32** %address.addr, align 8
  %1 = load i32, i32* %val.addr, align 4
  %call = call i32 @_ZL11__uAtomicOrPjj(i32* %0, i32 %1) #2
  ret i32 %call
}
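; Kernel pack2: bit-level compaction. Thread tid copies a bitsize-bit record
; (bitsize = cindex[tid]) from a word-aligned offset in srcData to global bit
; position cindex2[tid] in dstData, shifting each 32-bit word across the
; destination word boundary and combining with atomicOr.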
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL5pack2PjS_S_S_j(i32* %srcData, i32* %cindex, i32* %cindex2, i32* %dstData, i32 %original_num_block_elements) #0 {
entry:
  %srcData.addr = alloca i32*, align 8
  %cindex.addr = alloca i32*, align 8
  %cindex2.addr = alloca i32*, align 8
  %dstData.addr = alloca i32*, align 8
  %original_num_block_elements.addr = alloca i32, align 4
  %tid = alloca i32, align 4
  %offset = alloca i32, align 4
  %bitsize = alloca i32, align 4
  %pos = alloca i32, align 4
  %dword = alloca i32, align 4
  %bit = alloca i32, align 4
  %i = alloca i32, align 4
  %dw = alloca i32, align 4
  %tmp = alloca i32, align 4
  store i32* %srcData, i32** %srcData.addr, align 8
  store i32* %cindex, i32** %cindex.addr, align 8
  store i32* %cindex2, i32** %cindex2.addr, align 8
  store i32* %dstData, i32** %dstData.addr, align 8
  store i32 %original_num_block_elements, i32* %original_num_block_elements.addr, align 4
  %call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %mul = mul i32 %call, %call1
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %add = add i32 %mul, %call2
  store i32 %add, i32* %tid, align 4
  %0 = load i32, i32* %tid, align 4
  %1 = load i32, i32* %original_num_block_elements.addr, align 4
  %mul3 = mul i32 %0, %1
  store i32 %mul3, i32* %offset, align 4
  %2 = load i32*, i32** %cindex.addr, align 8
  %3 = load i32, i32* %tid, align 4
  %idxprom = zext i32 %3 to i64
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %4 = load i32, i32* %arrayidx, align 4
  store i32 %4, i32* %bitsize, align 4
  %5 = load i32*, i32** %cindex2.addr, align 8
  %6 = load i32, i32* %tid, align 4
  %idxprom4 = zext i32 %6 to i64
  %arrayidx5 = getelementptr inbounds i32, i32* %5, i64 %idxprom4
  %7 = load i32, i32* %arrayidx5, align 4
  store i32 %7, i32* %pos, align 4
  %8 = load i32, i32* %pos, align 4
  %div = udiv i32 %8, 32
  store i32 %div, i32* %dword, align 4
  %9 = load i32, i32* %pos, align 4
  %rem = urem i32 %9, 32
  store i32 %rem, i32* %bit, align 4
  %10 = load i32*, i32** %srcData.addr, align 8
  %11 = load i32, i32* %offset, align 4
  %idxprom6 = zext i32 %11 to i64
  %arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6
  %12 = load i32, i32* %arrayidx7, align 4
  store i32 %12, i32* %dw, align 4
  %13 = load i32, i32* %dw, align 4
  %14 = load i32, i32* %bit, align 4
  %shr = lshr i32 %13, %14
  store i32 %shr, i32* %tmp, align 4
  %15 = load i32*, i32** %dstData.addr, align 8
  %16 = load i32, i32* %dword, align 4
  %idxprom8 = zext i32 %16 to i64
  %arrayidx9 = getelementptr inbounds i32, i32* %15, i64 %idxprom8
  %17 = load i32, i32* %tmp, align 4
  %call10 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx9, i32 %17) #2
  %18 = load i32, i32* %bit, align 4
  %cmp = icmp eq i32 %18, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  br label %cond.end

cond.false:                                       ; preds = %entry
  %19 = load i32, i32* %dw, align 4
  %20 = load i32, i32* %bit, align 4
  %sub = sub i32 32, %20
  %shl = shl i32 %19, %sub
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ 0, %cond.true ], [ %shl, %cond.false ]
  store i32 %cond, i32* %tmp, align 4
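  ; Main packing loop: each subsequent source word is split across two
  ; destination words at bit offset 'bit'; 'tmp' carries the spilled-over
  ; bits into the next iteration.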
  store i32 1, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %cond.end
  %21 = load i32, i32* %i, align 4
  %22 = load i32, i32* %bitsize, align 4
  %div11 = udiv i32 %22, 32
  %cmp12 = icmp ult i32 %21, %div11
  br i1 %cmp12, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %23 = load i32*, i32** %srcData.addr, align 8
  %24 = load i32, i32* %offset, align 4
  %25 = load i32, i32* %i, align 4
  %add13 = add i32 %24, %25
  %idxprom14 = zext i32 %add13 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14
  %26 = load i32, i32* %arrayidx15, align 4
  store i32 %26, i32* %dw, align 4
  %27 = load i32, i32* %dw, align 4
  %28 = load i32, i32* %bit, align 4
  %shr16 = lshr i32 %27, %28
  %29 = load i32, i32* %tmp, align 4
  %or = or i32 %29, %shr16
  store i32 %or, i32* %tmp, align 4
  %30 = load i32, i32* %tmp, align 4
  %31 = load i32*, i32** %dstData.addr, align 8
  %32 = load i32, i32* %dword, align 4
  %33 = load i32, i32* %i, align 4
  %add17 = add i32 %32, %33
  %idxprom18 = zext i32 %add17 to i64
  %arrayidx19 = getelementptr inbounds i32, i32* %31, i64 %idxprom18
  store i32 %30, i32* %arrayidx19, align 4
  %34 = load i32, i32* %bit, align 4
  %cmp20 = icmp eq i32 %34, 0
  br i1 %cmp20, label %cond.true21, label %cond.false22

cond.true21:                                      ; preds = %for.body
  br label %cond.end25

cond.false22:                                     ; preds = %for.body
  %35 = load i32, i32* %dw, align 4
  %36 = load i32, i32* %bit, align 4
  %sub23 = sub i32 32, %36
  %shl24 = shl i32 %35, %sub23
  br label %cond.end25

cond.end25:                                       ; preds = %cond.false22, %cond.true21
  %cond26 = phi i32 [ 0, %cond.true21 ], [ %shl24, %cond.false22 ]
  store i32 %cond26, i32* %tmp, align 4
  br label %for.inc

for.inc:                                          ; preds = %cond.end25
  %37 = load i32, i32* %i, align 4
  %inc = add i32 %37, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond
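  ; Tail handling: flush the carried bits, then pack the final partial word
  ; when bitsize is not a multiple of 32.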
for.end:                                          ; preds = %for.cond
  %38 = load i32, i32* %bit, align 4
  %cmp27 = icmp ne i32 %38, 0
  br i1 %cmp27, label %if.then, label %lor.lhs.false

lor.lhs.false:                                    ; preds = %for.end
  %39 = load i32, i32* %bitsize, align 4
  %rem28 = urem i32 %39, 32
  %cmp29 = icmp ne i32 %rem28, 0
  br i1 %cmp29, label %if.then, label %if.end

if.then:                                          ; preds = %lor.lhs.false, %for.end
  %40 = load i32*, i32** %dstData.addr, align 8
  %41 = load i32, i32* %dword, align 4
  %42 = load i32, i32* %i, align 4
  %add30 = add i32 %41, %42
  %idxprom31 = zext i32 %add30 to i64
  %arrayidx32 = getelementptr inbounds i32, i32* %40, i64 %idxprom31
  %43 = load i32, i32* %tmp, align 4
  %call33 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx32, i32 %43) #2
  br label %if.end

if.end:                                           ; preds = %if.then, %lor.lhs.false
  %44 = load i32, i32* %bitsize, align 4
  %rem34 = urem i32 %44, 32
  %cmp35 = icmp ne i32 %rem34, 0
  br i1 %cmp35, label %if.then36, label %if.end57

if.then36:                                        ; preds = %if.end
  %45 = load i32*, i32** %srcData.addr, align 8
  %46 = load i32, i32* %offset, align 4
  %47 = load i32, i32* %i, align 4
  %add37 = add i32 %46, %47
  %idxprom38 = zext i32 %add37 to i64
  %arrayidx39 = getelementptr inbounds i32, i32* %45, i64 %idxprom38
  %48 = load i32, i32* %arrayidx39, align 4
  store i32 %48, i32* %dw, align 4
  %49 = load i32*, i32** %dstData.addr, align 8
  %50 = load i32, i32* %dword, align 4
  %51 = load i32, i32* %i, align 4
  %add40 = add i32 %50, %51
  %idxprom41 = zext i32 %add40 to i64
  %arrayidx42 = getelementptr inbounds i32, i32* %49, i64 %idxprom41
  %52 = load i32, i32* %dw, align 4
  %53 = load i32, i32* %bit, align 4
  %shr43 = lshr i32 %52, %53
  %call44 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx42, i32 %shr43) #2
  %54 = load i32*, i32** %dstData.addr, align 8
  %55 = load i32, i32* %dword, align 4
  %56 = load i32, i32* %i, align 4
  %add45 = add i32 %55, %56
  %add46 = add i32 %add45, 1
  %idxprom47 = zext i32 %add46 to i64
  %arrayidx48 = getelementptr inbounds i32, i32* %54, i64 %idxprom47
  %57 = load i32, i32* %bit, align 4
  %cmp49 = icmp eq i32 %57, 0
  br i1 %cmp49, label %cond.true50, label %cond.false51

cond.true50:                                      ; preds = %if.then36
  br label %cond.end54

cond.false51:                                     ; preds = %if.then36
  %58 = load i32, i32* %dw, align 4
  %59 = load i32, i32* %bit, align 4
  %sub52 = sub i32 32, %59
  %shl53 = shl i32 %58, %sub52
  br label %cond.end54

cond.end54:                                       ; preds = %cond.false51, %cond.true50
  %cond55 = phi i32 [ 0, %cond.true50 ], [ %shl53, %cond.false51 ]
  %call56 = call i32 @_ZL8atomicOrPjj(i32* %arrayidx48, i32 %cond55) #2
  br label %if.end57

if.end57:                                         ; preds = %cond.end54, %if.end
  ret void
}
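; Kernel uniformAdd: second phase of a multi-block scan, in the style of the
; CUDPP/GPU Gems scan. Thread 0 caches the block's scanned sum in shared
; 'uni'; every thread then adds it to the two elements it owns, with the
; second add masked out when the index would exceed n.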
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL10uniformAddPjS_iii(i32* %g_data, i32* %uniforms, i32 %n, i32 %blockOffset, i32 %baseIndex) #0 {
entry:
  %g_data.addr = alloca i32*, align 8
  %uniforms.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %blockOffset.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %address = alloca i32, align 4
  store i32* %g_data, i32** %g_data.addr, align 8
  store i32* %uniforms, i32** %uniforms.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %blockOffset, i32* %blockOffset.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %cmp = icmp eq i32 %call, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %0 = load i32*, i32** %uniforms.addr, align 8
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %1 = load i32, i32* %blockOffset.addr, align 4
  %add = add i32 %call1, %1
  %idxprom = zext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
  %2 = load i32, i32* %arrayidx, align 4
  store i32 %2, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  %call2 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call3, 1
  %call4 = call i32 @_ZL7__mul24ii(i32 %call2, i32 %shl) #2
  %3 = load i32, i32* %baseIndex.addr, align 4
  %add5 = add nsw i32 %call4, %3
  %call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %add7 = add i32 %add5, %call6
  store i32 %add7, i32* %address, align 4
  call void @llvm.nvvm.barrier0()
  %4 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
  %5 = load i32*, i32** %g_data.addr, align 8
  %6 = load i32, i32* %address, align 4
  %idxprom8 = zext i32 %6 to i64
  %arrayidx9 = getelementptr inbounds i32, i32* %5, i64 %idxprom8
  %7 = load i32, i32* %arrayidx9, align 4
  %add10 = add i32 %7, %4
  store i32 %add10, i32* %arrayidx9, align 4
  %call11 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %call12 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add13 = add i32 %call11, %call12
  %8 = load i32, i32* %n.addr, align 4
  %cmp14 = icmp ult i32 %add13, %8
  %conv = zext i1 %cmp14 to i32
  %9 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZL10uniformAddPjS_iiiE3uni to i32*), align 4
  %mul = mul i32 %conv, %9
  %10 = load i32*, i32** %g_data.addr, align 8
  %11 = load i32, i32* %address, align 4
  %call15 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add16 = add i32 %11, %call15
  %idxprom17 = zext i32 %add16 to i64
  %arrayidx18 = getelementptr inbounds i32, i32* %10, i64 %idxprom17
  %12 = load i32, i32* %arrayidx18, align 4
  %add19 = add i32 %12, %mul
  store i32 %add19, i32* %arrayidx18, align 4
  ret void
}

; Function Attrs: alwaysinline convergent nounwind
define internal i32 @_ZL7__mul24ii(i32 %__a, i32 %__b) #1 {
entry:
  %__a.addr = alloca i32, align 4
  %__b.addr = alloca i32, align 4
  store i32 %__a, i32* %__a.addr, align 4
  store i32 %__b, i32* %__b.addr, align 4
  %0 = load i32, i32* %__a.addr, align 4
  %1 = load i32, i32* %__b.addr, align 4
  %call = call i32 @__nv_mul24(i32 %0, i32 %1) #2
  ret i32 %call
}

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #3

; Function Attrs: alwaysinline convergent nounwind
define internal i32 @_ZL12__uAtomicAddPjj(i32* %__p, i32 %__v) #1 {
entry:
  %__p.addr = alloca i32*, align 8
  %__v.addr = alloca i32, align 4
  store i32* %__p, i32** %__p.addr, align 8
  store i32 %__v, i32* %__v.addr, align 4
  %0 = load i32*, i32** %__p.addr, align 8
  %1 = load i32, i32* %__v.addr, align 4
  %2 = atomicrmw add i32* %0, i32 %1 seq_cst
  ret i32 %2
}

; Function Attrs: alwaysinline convergent nounwind
define internal i32 @_ZL11__uAtomicOrPjj(i32* %__p, i32 %__v) #1 {
entry:
  %__p.addr = alloca i32*, align 8
  %__v.addr = alloca i32, align 4
  store i32* %__p, i32** %__p.addr, align 8
  store i32 %__v, i32* %__v.addr, align 4
  %0 = load i32*, i32** %__p.addr, align 8
  %1 = load i32, i32* %__v.addr, align 4
  %2 = atomicrmw or i32* %0, i32 %1 seq_cst
  ret i32 %2
}
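; The prescan instantiations below implement a work-efficient block-wise
; exclusive scan: load a chunk into shared memory, scan it in place
; (prescanBlock), and store it back. The mangled template arguments
; (ILb1ELb0E etc.) likely correspond to <storeSum, isNP2> as in the CUDPP
; scan kernels; this first instance is prescan<true, false>.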
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL7prescanILb1ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %blockIndex.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  %mem_ai = alloca i32, align 4
  %mem_bi = alloca i32, align 4
  %bankOffsetA = alloca i32, align 4
  %bankOffsetB = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  %0 = load i32*, i32** %g_idata.addr, align 8
  %1 = load i32, i32* %n.addr, align 4
  %2 = load i32, i32* %baseIndex.addr, align 4
  %cmp = icmp eq i32 %2, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %3 = load i32, i32* %baseIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
  call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
  %4 = load i32, i32* %blockIndex.addr, align 4
  %5 = load i32*, i32** %g_blockSums.addr, align 8
  call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
  %6 = load i32*, i32** %g_odata.addr, align 8
  %7 = load i32, i32* %n.addr, align 4
  %8 = load i32, i32* %ai, align 4
  %9 = load i32, i32* %bi, align 4
  %10 = load i32, i32* %mem_ai, align 4
  %11 = load i32, i32* %mem_bi, align 4
  %12 = load i32, i32* %bankOffsetA, align 4
  %13 = load i32, i32* %bankOffsetB, align 4
  call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
  ret void
}
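; loadSharedChunkFromMem: each thread loads two elements into shared memory,
; padding indices by (index >> 4) to avoid shared-memory bank conflicts.
; This variant performs both loads unconditionally (full power-of-two chunk).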
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai.addr = alloca i32*, align 8
  %bi.addr = alloca i32*, align 8
  %mem_ai.addr = alloca i32*, align 8
  %mem_bi.addr = alloca i32*, align 8
  %bankOffsetA.addr = alloca i32*, align 8
  %bankOffsetB.addr = alloca i32*, align 8
  %thid = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  store i32* %ai, i32** %ai.addr, align 8
  store i32* %bi, i32** %bi.addr, align 8
  store i32* %mem_ai, i32** %mem_ai.addr, align 8
  store i32* %mem_bi, i32** %mem_bi.addr, align 8
  store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8
  store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %0 = load i32, i32* %baseIndex.addr, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %add = add i32 %0, %call1
  %1 = load i32*, i32** %mem_ai.addr, align 8
  store i32 %add, i32* %1, align 4
  %2 = load i32*, i32** %mem_ai.addr, align 8
  %3 = load i32, i32* %2, align 4
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add3 = add i32 %3, %call2
  %4 = load i32*, i32** %mem_bi.addr, align 8
  store i32 %add3, i32* %4, align 4
  %5 = load i32, i32* %thid, align 4
  %6 = load i32*, i32** %ai.addr, align 8
  store i32 %5, i32* %6, align 4
  %7 = load i32, i32* %thid, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add5 = add i32 %7, %call4
  %8 = load i32*, i32** %bi.addr, align 8
  store i32 %add5, i32* %8, align 4
  %9 = load i32*, i32** %ai.addr, align 8
  %10 = load i32, i32* %9, align 4
  %shr = ashr i32 %10, 4
  %11 = load i32*, i32** %bankOffsetA.addr, align 8
  store i32 %shr, i32* %11, align 4
  %12 = load i32*, i32** %bi.addr, align 8
  %13 = load i32, i32* %12, align 4
  %shr6 = ashr i32 %13, 4
  %14 = load i32*, i32** %bankOffsetB.addr, align 8
  store i32 %shr6, i32* %14, align 4
  %15 = load i32*, i32** %g_idata.addr, align 8
  %16 = load i32*, i32** %mem_ai.addr, align 8
  %17 = load i32, i32* %16, align 4
  %idxprom = sext i32 %17 to i64
  %arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom
  %18 = load i32, i32* %arrayidx, align 4
  %19 = load i32*, i32** %s_data.addr, align 8
  %20 = load i32*, i32** %ai.addr, align 8
  %21 = load i32, i32* %20, align 4
  %22 = load i32*, i32** %bankOffsetA.addr, align 8
  %23 = load i32, i32* %22, align 4
  %add7 = add nsw i32 %21, %23
  %idxprom8 = sext i32 %add7 to i64
  %arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8
  store i32 %18, i32* %arrayidx9, align 4
  %24 = load i32*, i32** %g_idata.addr, align 8
  %25 = load i32*, i32** %mem_bi.addr, align 8
  %26 = load i32, i32* %25, align 4
  %idxprom10 = sext i32 %26 to i64
  %arrayidx11 = getelementptr inbounds i32, i32* %24, i64 %idxprom10
  %27 = load i32, i32* %arrayidx11, align 4
  %28 = load i32*, i32** %s_data.addr, align 8
  %29 = load i32*, i32** %bi.addr, align 8
  %30 = load i32, i32* %29, align 4
  %31 = load i32*, i32** %bankOffsetB.addr, align 8
  %32 = load i32, i32* %31, align 4
  %add12 = add nsw i32 %30, %32
  %idxprom13 = sext i32 %add12 to i64
  %arrayidx14 = getelementptr inbounds i32, i32* %28, i64 %idxprom13
  store i32 %27, i32* %arrayidx14, align 4
  ret void
}
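; prescanBlock: buildSum up-sweep, clearLastElement (save the block total and
; zero the root), then scanRootToLeaves down-sweep.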
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL12prescanBlockILb1EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 {
entry:
  %data.addr = alloca i32*, align 8
  %blockIndex.addr = alloca i32, align 4
  %blockSums.addr = alloca i32*, align 8
  %stride = alloca i32, align 4
  store i32* %data, i32** %data.addr, align 8
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32* %blockSums, i32** %blockSums.addr, align 8
  %0 = load i32*, i32** %data.addr, align 8
  %call = call i32 @_ZL8buildSumPj(i32* %0) #2
  store i32 %call, i32* %stride, align 4
  %1 = load i32*, i32** %data.addr, align 8
  %2 = load i32*, i32** %blockSums.addr, align 8
  %3 = load i32, i32* %blockIndex.addr, align 4
  %cmp = icmp eq i32 %3, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %4 = load i32, i32* %blockIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ]
  call void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2
  %5 = load i32*, i32** %data.addr, align 8
  %6 = load i32, i32* %stride, align 4
  call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2
  ret void
}

; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %s_data.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %ai.addr = alloca i32, align 4
  %bi.addr = alloca i32, align 4
  %mem_ai.addr = alloca i32, align 4
  %mem_bi.addr = alloca i32, align 4
  %bankOffsetA.addr = alloca i32, align 4
  %bankOffsetB.addr = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %ai, i32* %ai.addr, align 4
  store i32 %bi, i32* %bi.addr, align 4
  store i32 %mem_ai, i32* %mem_ai.addr, align 4
  store i32 %mem_bi, i32* %mem_bi.addr, align 4
  store i32 %bankOffsetA, i32* %bankOffsetA.addr, align 4
  store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4
  call void @llvm.nvvm.barrier0()
  %0 = load i32*, i32** %s_data.addr, align 8
  %1 = load i32, i32* %ai.addr, align 4
  %2 = load i32, i32* %bankOffsetA.addr, align 4
  %add = add nsw i32 %1, %2
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %4 = load i32*, i32** %g_odata.addr, align 8
  %5 = load i32, i32* %mem_ai.addr, align 4
  %idxprom1 = sext i32 %5 to i64
  %arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1
  store i32 %3, i32* %arrayidx2, align 4
  %6 = load i32*, i32** %s_data.addr, align 8
  %7 = load i32, i32* %bi.addr, align 4
  %8 = load i32, i32* %bankOffsetB.addr, align 4
  %add3 = add nsw i32 %7, %8
  %idxprom4 = sext i32 %add3 to i64
  %arrayidx5 = getelementptr inbounds i32, i32* %6, i64 %idxprom4
  %9 = load i32, i32* %arrayidx5, align 4
  %10 = load i32*, i32** %g_odata.addr, align 8
  %11 = load i32, i32* %mem_bi.addr, align 4
  %idxprom6 = sext i32 %11 to i64
  %arrayidx7 = getelementptr inbounds i32, i32* %10, i64 %idxprom6
  store i32 %9, i32* %arrayidx7, align 4
  ret void
}
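; buildSum: up-sweep reduction over the shared chunk (pairwise sums at
; doubling strides, with bank-conflict padding); returns the final stride,
; which seeds the down-sweep in scanRootToLeaves.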
; Function Attrs: convergent noinline nounwind optnone
define internal i32 @_ZL8buildSumPj(i32* %s_data) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %thid = alloca i32, align 4
  %stride = alloca i32, align 4
  %d = alloca i32, align 4
  %i = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  store i32 1, i32* %stride, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %d, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %d, align 4
  %cmp = icmp sgt i32 %0, 0
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  call void @llvm.nvvm.barrier0()
  %1 = load i32, i32* %thid, align 4
  %2 = load i32, i32* %d, align 4
  %cmp2 = icmp ult i32 %1, %2
  br i1 %cmp2, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  %3 = load i32, i32* %stride, align 4
  %call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %3) #2
  %4 = load i32, i32* %thid, align 4
  %call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %4) #2
  store i32 %call4, i32* %i, align 4
  %5 = load i32, i32* %i, align 4
  %6 = load i32, i32* %stride, align 4
  %add = add i32 %5, %6
  %sub = sub i32 %add, 1
  store i32 %sub, i32* %ai, align 4
  %7 = load i32, i32* %ai, align 4
  %8 = load i32, i32* %stride, align 4
  %add5 = add i32 %7, %8
  store i32 %add5, i32* %bi, align 4
  %9 = load i32, i32* %ai, align 4
  %shr = ashr i32 %9, 4
  %10 = load i32, i32* %ai, align 4
  %add6 = add nsw i32 %10, %shr
  store i32 %add6, i32* %ai, align 4
  %11 = load i32, i32* %bi, align 4
  %shr7 = ashr i32 %11, 4
  %12 = load i32, i32* %bi, align 4
  %add8 = add nsw i32 %12, %shr7
  store i32 %add8, i32* %bi, align 4
  %13 = load i32*, i32** %s_data.addr, align 8
  %14 = load i32, i32* %ai, align 4
  %idxprom = sext i32 %14 to i64
  %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom
  %15 = load i32, i32* %arrayidx, align 4
  %16 = load i32*, i32** %s_data.addr, align 8
  %17 = load i32, i32* %bi, align 4
  %idxprom9 = sext i32 %17 to i64
  %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
  %18 = load i32, i32* %arrayidx10, align 4
  %add11 = add i32 %18, %15
  store i32 %add11, i32* %arrayidx10, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  %19 = load i32, i32* %stride, align 4
  %mul = mul i32 %19, 2
  store i32 %mul, i32* %stride, align 4
  br label %for.inc

for.inc:                                          ; preds = %if.end
  %20 = load i32, i32* %d, align 4
  %shr12 = ashr i32 %20, 1
  store i32 %shr12, i32* %d, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %21 = load i32, i32* %stride, align 4
  ret i32 %21
}
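; clearLastElement: thread 0 stores the reduction total (the last padded
; shared element) into g_blockSums[blockIndex] and zeroes it in shared memory
; before the down-sweep.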
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL16clearLastElementILb1EEvPjS0_i(i32* %s_data, i32* %g_blockSums, i32 %blockIndex) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %blockIndex.addr = alloca i32, align 4
  %index = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %cmp = icmp eq i32 %call, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %sub = sub i32 %shl, 1
  store i32 %sub, i32* %index, align 4
  %0 = load i32, i32* %index, align 4
  %shr = ashr i32 %0, 4
  %1 = load i32, i32* %index, align 4
  %add = add nsw i32 %1, %shr
  store i32 %add, i32* %index, align 4
  %2 = load i32*, i32** %s_data.addr, align 8
  %3 = load i32, i32* %index, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %4 = load i32, i32* %arrayidx, align 4
  %5 = load i32*, i32** %g_blockSums.addr, align 8
  %6 = load i32, i32* %blockIndex.addr, align 4
  %idxprom2 = sext i32 %6 to i64
  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
  store i32 %4, i32* %arrayidx3, align 4
  %7 = load i32*, i32** %s_data.addr, align 8
  %8 = load i32, i32* %index, align 4
  %idxprom4 = sext i32 %8 to i64
  %arrayidx5 = getelementptr inbounds i32, i32* %7, i64 %idxprom4
  store i32 0, i32* %arrayidx5, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
}
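; scanRootToLeaves: down-sweep of the balanced-tree scan; each step swaps the
; left partial sum into place and accumulates it into the right element,
; yielding an exclusive scan in shared memory.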
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL16scanRootToLeavesPjj(i32* %s_data, i32 %stride) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %stride.addr = alloca i32, align 4
  %thid = alloca i32, align 4
  %d = alloca i32, align 4
  %i = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  %t = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32 %stride, i32* %stride.addr, align 4
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  store i32 1, i32* %d, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %0 = load i32, i32* %d, align 4
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %cmp = icmp ule i32 %0, %call1
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %1 = load i32, i32* %stride.addr, align 4
  %shr = lshr i32 %1, 1
  store i32 %shr, i32* %stride.addr, align 4
  call void @llvm.nvvm.barrier0()
  %2 = load i32, i32* %thid, align 4
  %3 = load i32, i32* %d, align 4
  %cmp2 = icmp ult i32 %2, %3
  br i1 %cmp2, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  %4 = load i32, i32* %stride.addr, align 4
  %call3 = call i32 @_ZL7__mul24ii(i32 2, i32 %4) #2
  %5 = load i32, i32* %thid, align 4
  %call4 = call i32 @_ZL7__mul24ii(i32 %call3, i32 %5) #2
  store i32 %call4, i32* %i, align 4
  %6 = load i32, i32* %i, align 4
  %7 = load i32, i32* %stride.addr, align 4
  %add = add i32 %6, %7
  %sub = sub i32 %add, 1
  store i32 %sub, i32* %ai, align 4
  %8 = load i32, i32* %ai, align 4
  %9 = load i32, i32* %stride.addr, align 4
  %add5 = add i32 %8, %9
  store i32 %add5, i32* %bi, align 4
  %10 = load i32, i32* %ai, align 4
  %shr6 = ashr i32 %10, 4
  %11 = load i32, i32* %ai, align 4
  %add7 = add nsw i32 %11, %shr6
  store i32 %add7, i32* %ai, align 4
  %12 = load i32, i32* %bi, align 4
  %shr8 = ashr i32 %12, 4
  %13 = load i32, i32* %bi, align 4
  %add9 = add nsw i32 %13, %shr8
  store i32 %add9, i32* %bi, align 4
  %14 = load i32*, i32** %s_data.addr, align 8
  %15 = load i32, i32* %ai, align 4
  %idxprom = sext i32 %15 to i64
  %arrayidx = getelementptr inbounds i32, i32* %14, i64 %idxprom
  %16 = load i32, i32* %arrayidx, align 4
  store i32 %16, i32* %t, align 4
  %17 = load i32*, i32** %s_data.addr, align 8
  %18 = load i32, i32* %bi, align 4
  %idxprom10 = sext i32 %18 to i64
  %arrayidx11 = getelementptr inbounds i32, i32* %17, i64 %idxprom10
  %19 = load i32, i32* %arrayidx11, align 4
  %20 = load i32*, i32** %s_data.addr, align 8
  %21 = load i32, i32* %ai, align 4
  %idxprom12 = sext i32 %21 to i64
  %arrayidx13 = getelementptr inbounds i32, i32* %20, i64 %idxprom12
  store i32 %19, i32* %arrayidx13, align 4
  %22 = load i32, i32* %t, align 4
  %23 = load i32*, i32** %s_data.addr, align 8
  %24 = load i32, i32* %bi, align 4
  %idxprom14 = sext i32 %24 to i64
  %arrayidx15 = getelementptr inbounds i32, i32* %23, i64 %idxprom14
  %25 = load i32, i32* %arrayidx15, align 4
  %add16 = add i32 %25, %22
  store i32 %add16, i32* %arrayidx15, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  %26 = load i32, i32* %d, align 4
  %mul = mul nsw i32 %26, 2
  store i32 %mul, i32* %d, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL7prescanILb1ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %blockIndex.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  %mem_ai = alloca i32, align 4
  %mem_bi = alloca i32, align 4
  %bankOffsetA = alloca i32, align 4
  %bankOffsetB = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  %0 = load i32*, i32** %g_idata.addr, align 8
  %1 = load i32, i32* %n.addr, align 4
  %2 = load i32, i32* %baseIndex.addr, align 4
  %cmp = icmp eq i32 %2, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %3 = load i32, i32* %baseIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
  call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
  %4 = load i32, i32* %blockIndex.addr, align 4
  %5 = load i32*, i32** %g_blockSums.addr, align 8
  call void @_ZL12prescanBlockILb1EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
  %6 = load i32*, i32** %g_odata.addr, align 8
  %7 = load i32, i32* %n.addr, align 4
  %8 = load i32, i32* %ai, align 4
  %9 = load i32, i32* %bi, align 4
  %10 = load i32, i32* %mem_ai, align 4
  %11 = load i32, i32* %mem_bi, align 4
  %12 = load i32, i32* %bankOffsetA, align 4
  %13 = load i32, i32* %bankOffsetB, align 4
  call void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb1ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
  ret void
}
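; The non-power-of-two chunk loader: identical to the variant above except
; that the second load is guarded by bi < n, substituting 0 past the end of
; the input.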
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* %s_data, i32* %g_idata, i32 %n, i32 %baseIndex, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai.addr = alloca i32*, align 8
  %bi.addr = alloca i32*, align 8
  %mem_ai.addr = alloca i32*, align 8
  %mem_bi.addr = alloca i32*, align 8
  %bankOffsetA.addr = alloca i32*, align 8
  %bankOffsetB.addr = alloca i32*, align 8
  %thid = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  store i32* %ai, i32** %ai.addr, align 8
  store i32* %bi, i32** %bi.addr, align 8
  store i32* %mem_ai, i32** %mem_ai.addr, align 8
  store i32* %mem_bi, i32** %mem_bi.addr, align 8
  store i32* %bankOffsetA, i32** %bankOffsetA.addr, align 8
  store i32* %bankOffsetB, i32** %bankOffsetB.addr, align 8
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call, i32* %thid, align 4
  %0 = load i32, i32* %baseIndex.addr, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %add = add i32 %0, %call1
  %1 = load i32*, i32** %mem_ai.addr, align 8
  store i32 %add, i32* %1, align 4
  %2 = load i32*, i32** %mem_ai.addr, align 8
  %3 = load i32, i32* %2, align 4
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add3 = add i32 %3, %call2
  %4 = load i32*, i32** %mem_bi.addr, align 8
  store i32 %add3, i32* %4, align 4
  %5 = load i32, i32* %thid, align 4
  %6 = load i32*, i32** %ai.addr, align 8
  store i32 %5, i32* %6, align 4
  %7 = load i32, i32* %thid, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %add5 = add i32 %7, %call4
  %8 = load i32*, i32** %bi.addr, align 8
  store i32 %add5, i32* %8, align 4
  %9 = load i32*, i32** %ai.addr, align 8
  %10 = load i32, i32* %9, align 4
  %shr = ashr i32 %10, 4
  %11 = load i32*, i32** %bankOffsetA.addr, align 8
  store i32 %shr, i32* %11, align 4
  %12 = load i32*, i32** %bi.addr, align 8
  %13 = load i32, i32* %12, align 4
  %shr6 = ashr i32 %13, 4
  %14 = load i32*, i32** %bankOffsetB.addr, align 8
  store i32 %shr6, i32* %14, align 4
  %15 = load i32*, i32** %g_idata.addr, align 8
  %16 = load i32*, i32** %mem_ai.addr, align 8
  %17 = load i32, i32* %16, align 4
  %idxprom = sext i32 %17 to i64
  %arrayidx = getelementptr inbounds i32, i32* %15, i64 %idxprom
  %18 = load i32, i32* %arrayidx, align 4
  %19 = load i32*, i32** %s_data.addr, align 8
  %20 = load i32*, i32** %ai.addr, align 8
  %21 = load i32, i32* %20, align 4
  %22 = load i32*, i32** %bankOffsetA.addr, align 8
  %23 = load i32, i32* %22, align 4
  %add7 = add nsw i32 %21, %23
  %idxprom8 = sext i32 %add7 to i64
  %arrayidx9 = getelementptr inbounds i32, i32* %19, i64 %idxprom8
  store i32 %18, i32* %arrayidx9, align 4
  %24 = load i32*, i32** %bi.addr, align 8
  %25 = load i32, i32* %24, align 4
  %26 = load i32, i32* %n.addr, align 4
  %cmp = icmp slt i32 %25, %26
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %27 = load i32*, i32** %g_idata.addr, align 8
  %28 = load i32*, i32** %mem_bi.addr, align 8
  %29 = load i32, i32* %28, align 4
  %idxprom10 = sext i32 %29 to i64
  %arrayidx11 = getelementptr inbounds i32, i32* %27, i64 %idxprom10
  %30 = load i32, i32* %arrayidx11, align 4
  br label %cond.end

cond.false:                                       ; preds = %entry
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %30, %cond.true ], [ 0, %cond.false ]
  %31 = load i32*, i32** %s_data.addr, align 8
  %32 = load i32*, i32** %bi.addr, align 8
  %33 = load i32, i32* %32, align 4
  %34 = load i32*, i32** %bankOffsetB.addr, align 8
  %35 = load i32, i32* %34, align 4
  %add12 = add nsw i32 %33, %35
  %idxprom13 = sext i32 %add12 to i64
  %arrayidx14 = getelementptr inbounds i32, i32* %31, i64 %idxprom13
  store i32 %cond, i32* %arrayidx14, align 4
  ret void
}
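; storeSharedChunkToMem<isNP2> is the mirror image: after a barrier it writes
; the two scanned elements back to global memory, bounds-checking the second.
; Plausible CUDA-level shape of the function defined below:
;
;   template <bool isNP2>
;   __device__ void storeSharedChunkToMem(unsigned *g_odata,
;       const unsigned *s_data, int n, int ai, int bi, int mem_ai, int mem_bi,
;       int bankOffsetA, int bankOffsetB) {
;     __syncthreads();                       // the llvm.nvvm.barrier0 below
;     g_odata[mem_ai] = s_data[ai + bankOffsetA];
;     if (bi < n)                            // non-power-of-2 guard
;       g_odata[mem_bi] = s_data[bi + bankOffsetB];
;   }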
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %g_odata, i32* %s_data, i32 %n, i32 %ai, i32 %bi, i32 %mem_ai, i32 %mem_bi, i32 %bankOffsetA, i32 %bankOffsetB) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %s_data.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %ai.addr = alloca i32, align 4
  %bi.addr = alloca i32, align 4
  %mem_ai.addr = alloca i32, align 4
  %mem_bi.addr = alloca i32, align 4
  %bankOffsetA.addr = alloca i32, align 4
  %bankOffsetB.addr = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %ai, i32* %ai.addr, align 4
  store i32 %bi, i32* %bi.addr, align 4
  store i32 %mem_ai, i32* %mem_ai.addr, align 4
  store i32 %mem_bi, i32* %mem_bi.addr, align 4
  store i32 %bankOffsetA, i32* %bankOffsetA.addr, align 4
  store i32 %bankOffsetB, i32* %bankOffsetB.addr, align 4
  call void @llvm.nvvm.barrier0()
  %0 = load i32*, i32** %s_data.addr, align 8
  %1 = load i32, i32* %ai.addr, align 4
  %2 = load i32, i32* %bankOffsetA.addr, align 4
  %add = add nsw i32 %1, %2
  %idxprom = sext i32 %add to i64
  %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %4 = load i32*, i32** %g_odata.addr, align 8
  %5 = load i32, i32* %mem_ai.addr, align 4
  %idxprom1 = sext i32 %5 to i64
  %arrayidx2 = getelementptr inbounds i32, i32* %4, i64 %idxprom1
  store i32 %3, i32* %arrayidx2, align 4
  %6 = load i32, i32* %bi.addr, align 4
  %7 = load i32, i32* %n.addr, align 4
  %cmp = icmp slt i32 %6, %7
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %8 = load i32*, i32** %s_data.addr, align 8
  %9 = load i32, i32* %bi.addr, align 4
  %10 = load i32, i32* %bankOffsetB.addr, align 4
  %add3 = add nsw i32 %9, %10
  %idxprom4 = sext i32 %add3 to i64
  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
  %11 = load i32, i32* %arrayidx5, align 4
  %12 = load i32*, i32** %g_odata.addr, align 8
  %13 = load i32, i32* %mem_bi.addr, align 4
  %idxprom6 = sext i32 %13 to i64
  %arrayidx7 = getelementptr inbounds i32, i32* %12, i64 %idxprom6
  store i32 %11, i32* %arrayidx7, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
}
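; prescan<0,0> below is the storeSum=false, power-of-2 instantiation; its body
; is identical to prescan<1,1> above except that it uses its own s_data array
; and the <Lb0> helper instantiations. It is followed by prescanBlock<0>, the
; per-block driver. A plausible CUDA-level shape of prescanBlock, inferred
; from the IR's call sequence (helper names come from the mangled symbols):
;
;   template <bool storeSum>
;   __device__ void prescanBlock(unsigned *data, int blockIndex,
;                                unsigned *blockSums) {
;     int stride = buildSum(data);                 // up-sweep over the shared array
;     clearLastElement<storeSum>(data, blockSums,
;         (blockIndex == 0) ? blockIdx.x : blockIndex);
;     scanRootToLeaves(data, stride);              // down-sweep, defined above
;   }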
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL7prescanILb0ELb0EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %blockIndex.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  %mem_ai = alloca i32, align 4
  %mem_bi = alloca i32, align 4
  %bankOffsetA = alloca i32, align 4
  %bankOffsetB = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  %0 = load i32*, i32** %g_idata.addr, align 8
  %1 = load i32, i32* %n.addr, align 4
  %2 = load i32, i32* %baseIndex.addr, align 4
  %cmp = icmp eq i32 %2, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %3 = load i32, i32* %baseIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
  call void @_ZL22loadSharedChunkFromMemILb0EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
  %4 = load i32, i32* %blockIndex.addr, align 4
  %5 = load i32*, i32** %g_blockSums.addr, align 8
  call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
  %6 = load i32*, i32** %g_odata.addr, align 8
  %7 = load i32, i32* %n.addr, align 4
  %8 = load i32, i32* %ai, align 4
  %9 = load i32, i32* %bi, align 4
  %10 = load i32, i32* %mem_ai, align 4
  %11 = load i32, i32* %mem_bi, align 4
  %12 = load i32, i32* %bankOffsetA, align 4
  %13 = load i32, i32* %bankOffsetB, align 4
  call void @_ZL21storeSharedChunkToMemILb0EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb0EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
  ret void
}

; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL12prescanBlockILb0EEvPjiS0_(i32* %data, i32 %blockIndex, i32* %blockSums) #0 {
entry:
  %data.addr = alloca i32*, align 8
  %blockIndex.addr = alloca i32, align 4
  %blockSums.addr = alloca i32*, align 8
  %stride = alloca i32, align 4
  store i32* %data, i32** %data.addr, align 8
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32* %blockSums, i32** %blockSums.addr, align 8
  %0 = load i32*, i32** %data.addr, align 8
  %call = call i32 @_ZL8buildSumPj(i32* %0) #2
  store i32 %call, i32* %stride, align 4
  %1 = load i32*, i32** %data.addr, align 8
  %2 = load i32*, i32** %blockSums.addr, align 8
  %3 = load i32, i32* %blockIndex.addr, align 4
  %cmp = icmp eq i32 %3, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %4 = load i32, i32* %blockIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call1, %cond.true ], [ %4, %cond.false ]
  call void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %1, i32* %2, i32 %cond) #2
  %5 = load i32*, i32** %data.addr, align 8
  %6 = load i32, i32* %stride, align 4
  call void @_ZL16scanRootToLeavesPjj(i32* %5, i32 %6) #2
  ret void
}
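; clearLastElement<storeSum> zeroes the (conflict-free-padded) last element of
; the shared array; the storeSum=true variant, whose tail opens this section,
; first saves that element as the block total. A plausible CUDA-level
; reconstruction consistent with both instantiations:
;
;   template <bool storeSum>
;   __device__ void clearLastElement(unsigned *s_data, unsigned *g_blockSums,
;                                    int blockIndex) {
;     if (threadIdx.x == 0) {
;       int index = (blockDim.x << 1) - 1;
;       index += index >> 4;                 // CONFLICT_FREE_OFFSET(index)
;       if (storeSum)                        // compiled away in the <Lb0> body below
;         g_blockSums[blockIndex] = s_data[index];
;       s_data[index] = 0;
;     }
;   }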
; Function Attrs: convergent noinline nounwind optnone
define internal void @_ZL16clearLastElementILb0EEvPjS0_i(i32* %s_data, i32* %g_blockSums, i32 %blockIndex) #0 {
entry:
  %s_data.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %blockIndex.addr = alloca i32, align 4
  %index = alloca i32, align 4
  store i32* %s_data, i32** %s_data.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  %cmp = icmp eq i32 %call, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %sub = sub i32 %shl, 1
  store i32 %sub, i32* %index, align 4
  %0 = load i32, i32* %index, align 4
  %shr = ashr i32 %0, 4
  %1 = load i32, i32* %index, align 4
  %add = add nsw i32 %1, %shr
  store i32 %add, i32* %index, align 4
  %2 = load i32*, i32** %s_data.addr, align 8
  %3 = load i32, i32* %index, align 4
  %idxprom = sext i32 %3 to i64
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  store i32 0, i32* %arrayidx, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
}

; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_ZL7prescanILb0ELb1EEvPjPKjS0_iii(i32* %g_odata, i32* %g_idata, i32* %g_blockSums, i32 %n, i32 %blockIndex, i32 %baseIndex) #0 {
entry:
  %g_odata.addr = alloca i32*, align 8
  %g_idata.addr = alloca i32*, align 8
  %g_blockSums.addr = alloca i32*, align 8
  %n.addr = alloca i32, align 4
  %blockIndex.addr = alloca i32, align 4
  %baseIndex.addr = alloca i32, align 4
  %ai = alloca i32, align 4
  %bi = alloca i32, align 4
  %mem_ai = alloca i32, align 4
  %mem_bi = alloca i32, align 4
  %bankOffsetA = alloca i32, align 4
  %bankOffsetB = alloca i32, align 4
  store i32* %g_odata, i32** %g_odata.addr, align 8
  store i32* %g_idata, i32** %g_idata.addr, align 8
  store i32* %g_blockSums, i32** %g_blockSums.addr, align 8
  store i32 %n, i32* %n.addr, align 4
  store i32 %blockIndex, i32* %blockIndex.addr, align 4
  store i32 %baseIndex, i32* %baseIndex.addr, align 4
  %0 = load i32*, i32** %g_idata.addr, align 8
  %1 = load i32, i32* %n.addr, align 4
  %2 = load i32, i32* %baseIndex.addr, align 4
  %cmp = icmp eq i32 %2, 0
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
  %call1 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #2
  %shl = shl i32 %call1, 1
  %call2 = call i32 @_ZL7__mul24ii(i32 %call, i32 %shl) #2
  br label %cond.end

cond.false:                                       ; preds = %entry
  %3 = load i32, i32* %baseIndex.addr, align 4
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ %call2, %cond.true ], [ %3, %cond.false ]
  call void @_ZL22loadSharedChunkFromMemILb1EEvPjPKjiiRiS3_S3_S3_S3_S3_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32* %0, i32 %1, i32 %cond, i32* dereferenceable(4) %ai, i32* dereferenceable(4) %bi, i32* dereferenceable(4) %mem_ai, i32* dereferenceable(4) %mem_bi, i32* dereferenceable(4) %bankOffsetA, i32* dereferenceable(4) %bankOffsetB) #2
  %4 = load i32, i32* %blockIndex.addr, align 4
  %5 = load i32*, i32** %g_blockSums.addr, align 8
  call void @_ZL12prescanBlockILb0EEvPjiS0_(i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %4, i32* %5) #2
  %6 = load i32*, i32** %g_odata.addr, align 8
  %7 = load i32, i32* %n.addr, align 4
  %8 = load i32, i32* %ai, align 4
  %9 = load i32, i32* %bi, align 4
  %10 = load i32, i32* %mem_ai, align 4
  %11 = load i32, i32* %mem_bi, align 4
  %12 = load i32, i32* %bankOffsetA, align 4
  %13 = load i32, i32* %bankOffsetB, align 4
  call void @_ZL21storeSharedChunkToMemILb1EEvPjPKjiiiiiii(i32* %6, i32* getelementptr inbounds ([3072 x i32], [3072 x i32]* addrspacecast ([3072 x i32] addrspace(3)* @_ZZL7prescanILb0ELb1EEvPjPKjS0_iiiE6s_data to [3072 x i32]*), i64 0, i64 0), i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #2
  ret void
}
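; __nv_mul24 below wraps the 24-bit multiply intrinsic; at the CUDA level the
; scan code calls __mul24(x, y), which lowers to llvm.nvvm.mul24.i (PTX
; mul24.lo). Illustrative source-level use, as in scanRootToLeaves above:
;
;   int i = __mul24(__mul24(2, stride), thid);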
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal i32 @__nv_mul24(i32 %x, i32 %y) #4 {
  %1 = call i32 @llvm.nvvm.mul24.i(i32 %x, i32 %y)
  ret i32 %1
}

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.mul24.i(i32, i32) #3

attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
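; Module-level metadata follows. The !nvvm.annotations entries tagged "kernel"
; mark the __global__ entry points (histo_kernel, vlc_encode_kernel_sm64huff,
; pack2, uniformAdd, and the four prescan instantiations); the !"align"
; entries appear to be clang-emitted kernel-argument alignment annotations.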
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !11, !13, !13, !13, !13, !14, !14, !13}
!llvm.ident = !{!15}
!nvvmir.version = !{!16}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i8*, i64, i32*)* @_Z12histo_kernelPhlPj, !"kernel", i32 1}
!4 = !{void (i32*, i32*, i32*, i32*, i32*, i32*, i32*, i32*)* @_ZL26vlc_encode_kernel_sm64huffPjPKjS1_S_S_S_S_S_, !"kernel", i32 1}
!5 = !{void (i32*, i32*, i32*, i32*, i32)* @_ZL5pack2PjS_S_S_j, !"kernel", i32 1}
!6 = !{void (i32*, i32*, i32, i32, i32)* @_ZL10uniformAddPjS_iii, !"kernel", i32 1}
!7 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb0EEvPjPKjS0_iii, !"kernel", i32 1}
!8 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb1ELb1EEvPjPKjS0_iii, !"kernel", i32 1}
!9 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb0EEvPjPKjS0_iii, !"kernel", i32 1}
!10 = !{void (i32*, i32*, i32*, i32, i32, i32)* @_ZL7prescanILb0ELb1EEvPjPKjS0_iii, !"kernel", i32 1}
!11 = !{null, !"align", i32 8}
!12 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!13 = !{null, !"align", i32 16}
!14 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!15 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!16 = !{i32 1, i32 4}