CuPBoP/examples/bfs/bfs-host-x86_64-unknown-lin...

; ModuleID = 'bfs-host-x86_64-unknown-linux-gnu.bc'
; Module-level header for the host half of the BFS example.
; The original translation unit is recorded as bfs.cu.
source_filename = "bfs.cu"
; Data layout string: "e" = little-endian, "m:e" = ELF name mangling,
; i64 aligned to 64 bits, f80 aligned to 128 bits, native integer widths
; 8/16/32/64, natural stack alignment 128 bits.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
; Code in this module is generated for 64-bit x86 Linux.
target triple = "x86_64-unknown-linux-gnu"

; Layout of the C stdio stream object as seen by this module; pointers to it
; are passed to fopen/fscanf/fclose and stored in @fp.
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
; Helper record referenced from %struct._IO_FILE (self-linked list node).
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
; Graph node record: field 0 is accessed as "starting" and field 1 as
; "no_of_edges" by the graph-loading code in this module. 8 bytes per node.
%struct.Node = type { i32, i32 }
; Three 32-bit launch dimensions (12 bytes); used for grid and block sizes.
%struct.dim3 = type { i32, i32, i32 }
; Stream handle type; only ever used through a pointer, hence opaque.
%struct.CUstream_st = type opaque

; COMDAT group for the dim3 constructor so duplicate definitions across
; modules can be merged by the linker ("any" = keep any one copy).
$_ZN4dim3C2Ejjj = comdat any

; Number of graph nodes; zeroed in @main and filled by fscanf from the input file.
@no_of_nodes = dso_local global i32 0, align 4
; Number of entries in the edge list; zeroed in @main and filled by fscanf.
@edge_list_size = dso_local global i32 0, align 4
; Stream for the input graph file; receives the result of fopen.
@fp = dso_local global %struct._IO_FILE* null, align 8
; External FILE* global named stderr (defined outside this module).
@stderr = external dso_local global %struct._IO_FILE*, align 8
; --- NUL-terminated format strings and messages used by the host code ---
; Usage message; takes the program name as %s.
@.str = private unnamed_addr constant [24 x i8] c"Usage: %s <input_file>\0A\00", align 1
@.str.1 = private unnamed_addr constant [14 x i8] c"Reading File\0A\00", align 1
; fopen mode used for the input graph file.
@.str.2 = private unnamed_addr constant [2 x i8] c"r\00", align 1
; Printed when fopen on the input file returns null.
@.str.3 = private unnamed_addr constant [26 x i8] c"Error Reading graph file\0A\00", align 1
; fscanf format for a single integer (node count, source, edge count, edge id, edge cost).
@.str.4 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
; fscanf format for a node record: start index and edge count.
@.str.5 = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1
@.str.6 = private unnamed_addr constant [11 x i8] c"Read File\0A\00", align 1
@.str.7 = private unnamed_addr constant [33 x i8] c"Copied Everything to GPU memory\0A\00", align 1
@.str.8 = private unnamed_addr constant [27 x i8] c"Start traversing the tree\0A\00", align 1
@.str.9 = private unnamed_addr constant [26 x i8] c"Kernel Executed %d times\0A\00", align 1
; Output file name and its fopen mode.
@.str.10 = private unnamed_addr constant [11 x i8] c"result.txt\00", align 1
@.str.11 = private unnamed_addr constant [2 x i8] c"w\00", align 1
@.str.12 = private unnamed_addr constant [13 x i8] c"%d) cost:%d\0A\00", align 1
@.str.13 = private unnamed_addr constant [29 x i8] c"Result stored in result.txt\0A\00", align 1
; Mangled names of the two kernel stubs defined in this module.
; NOTE(review): presumably consumed by the kernel-registration code reached
; from @__cuda_module_ctor, which is not visible in this excerpt -- confirm.
@0 = private unnamed_addr constant [30 x i8] c"_Z6KernelP4NodePiPbS2_S2_S1_i\00", align 1
@1 = private unnamed_addr constant [20 x i8] c"_Z7Kernel2PbS_S_S_i\00", align 1
@2 = private constant [15329 x i8] c"P\EDU\BA\01\00\10\00\D0;\00\00\00\00\00\00\02\00\01\01@\00\00\00H2\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\A01\00\00\00\00\00\00\A0.\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z7Kernel2PbS_S_S_i\00.nv.info._Z7Kernel2PbS_S_S_i\00.nv.shared._Z7Kernel2PbS_S_S_i\00.nv.global\00.nv.constant0._Z7Kernel2PbS_S_S_i\00.text._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.info._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.shared._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.constant0._Z6KernelP4NodePiPbS2_S2_S1_i\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z7Kernel2PbS_S_S_i\00.text._Z7Kernel2PbS_S_S_i\00.nv.info._Z7Kernel2PbS_S_S_i\00.nv.shared._Z7Kernel2PbS_S_S_i\00.nv.global\00blockIdx\00threadIdx\00.nv.constant0._Z7Kernel2PbS_S_S_i\00_param\00_Z6KernelP4NodePiPbS2_S2_S1_i\00.text._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.info._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.shared._Z6KernelP4NodePiPbS2_S2_S1_i\00.nv.constant0._Z6KernelP4NodePiPbS2_S2_S1_i\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00F\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\9C\00\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A7\00\00\00\01\00\0B\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\B0\00\00\00\01\00\0B\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\BA\00\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00u\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\0D\00\00\00\00\00\00\E3\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\18\00\00\00\00\00\00\04/\08\00\09\00\00\0
0\13\00\00\00\04#\08\00\09\00\00\00\00\00\00\00\04\12\08\00\09\00\00\00@\00\00\00\04\11\08\00\09\00\00\00@\00\00\00\04/\08\00\08\00\00\00\0F\00\00\00\04#\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00(\00\00\00\04\11\08\00\08\00\00\00(\00\00\00\010\00\00\01*\00\00\04\0A\08\00\05\00\00\00@\01$\00\03\19$\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\C8\04\00\00\04\1C\04\00H\0D\00\00\04\1E\04\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\07\00\00\00@\014\00\03\194\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0\11\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00\A8\06\00\00\04\1C\04\008\18\00\00\04\1E\04\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
; Descriptor for the embedded device binary, emitted into the
; ".nvFatBinSegment" section. Fields: magic number (1180844977 = 0x466243B1),
; version 1, pointer to the first byte of the 15329-byte blob @2, and a null
; trailing pointer.
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([15329 x i8], [15329 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
; Module-private handle slot, initially null.
; NOTE(review): presumably set when the fat binary is registered by
; @__cuda_module_ctor (defined outside this excerpt) -- confirm.
@__cuda_gpubin_handle = internal global i8** null, align 8
; Static-constructor table: runs @__cuda_module_ctor at startup with
; priority 65535 and no associated data pointer.
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]

; Function Attrs: noinline optnone uwtable
; Host-side launch stub for Kernel(Node*, int*, bool*, bool*, bool*, int*, int).
;
; The stub spills its seven arguments to the stack, builds a seven-entry table
; of pointers to those spill slots, obtains the launch configuration through
; __cudaPopCallConfiguration, and forwards everything to cudaLaunchKernel using
; the address of this stub as the kernel identifier. The status codes returned
; by both runtime calls are not inspected.
define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
entry:
  ; One spill slot per incoming argument.
  %nodes.slot = alloca %struct.Node*, align 8
  %edges.slot = alloca i32*, align 8
  %mask.slot = alloca i8*, align 8
  %updating_mask.slot = alloca i8*, align 8
  %visited.slot = alloca i8*, align 8
  %cost.slot = alloca i32*, align 8
  %node_count.slot = alloca i32, align 4
  ; Storage for the launch configuration written by __cudaPopCallConfiguration.
  %launch.grid = alloca %struct.dim3, align 8
  %launch.block = alloca %struct.dim3, align 8
  %launch.shmem = alloca i64, align 8
  %launch.stream = alloca i8*, align 8
  ; Scratch copies of the dim3 values reshaped as { i64, i32 } for the call.
  %grid.packed = alloca { i64, i32 }, align 8
  %block.packed = alloca { i64, i32 }, align 8
  store %struct.Node* %g_graph_nodes, %struct.Node** %nodes.slot, align 8
  store i32* %g_graph_edges, i32** %edges.slot, align 8
  store i8* %g_graph_mask, i8** %mask.slot, align 8
  store i8* %g_updating_graph_mask, i8** %updating_mask.slot, align 8
  store i8* %g_graph_visited, i8** %visited.slot, align 8
  store i32* %g_cost, i32** %cost.slot, align 8
  store i32 %no_of_nodes, i32* %node_count.slot, align 4
  ; Argument table: entry k holds the address of the k-th spill slot.
  %arg_table = alloca i8*, i64 7, align 16
  %nodes.ref = bitcast %struct.Node** %nodes.slot to i8*
  %arg0.ptr = getelementptr i8*, i8** %arg_table, i32 0
  store i8* %nodes.ref, i8** %arg0.ptr
  %edges.ref = bitcast i32** %edges.slot to i8*
  %arg1.ptr = getelementptr i8*, i8** %arg_table, i32 1
  store i8* %edges.ref, i8** %arg1.ptr
  %mask.ref = bitcast i8** %mask.slot to i8*
  %arg2.ptr = getelementptr i8*, i8** %arg_table, i32 2
  store i8* %mask.ref, i8** %arg2.ptr
  %updating_mask.ref = bitcast i8** %updating_mask.slot to i8*
  %arg3.ptr = getelementptr i8*, i8** %arg_table, i32 3
  store i8* %updating_mask.ref, i8** %arg3.ptr
  %visited.ref = bitcast i8** %visited.slot to i8*
  %arg4.ptr = getelementptr i8*, i8** %arg_table, i32 4
  store i8* %visited.ref, i8** %arg4.ptr
  %cost.ref = bitcast i32** %cost.slot to i8*
  %arg5.ptr = getelementptr i8*, i8** %arg_table, i32 5
  store i8* %cost.ref, i8** %arg5.ptr
  %node_count.ref = bitcast i32* %node_count.slot to i8*
  %arg6.ptr = getelementptr i8*, i8** %arg_table, i32 6
  store i8* %node_count.ref, i8** %arg6.ptr
  ; Fetch grid, block, shared-memory size and stream for this launch.
  %pop.status = call i32 @__cudaPopCallConfiguration(%struct.dim3* %launch.grid, %struct.dim3* %launch.block, i64* %launch.shmem, i8** %launch.stream)
  %shmem.bytes = load i64, i64* %launch.shmem, align 8
  %stream.raw = load i8*, i8** %launch.stream, align 8
  ; Copy the 12-byte grid dim3 and split it into an i64 (first 8 bytes)
  ; plus an i32 (last 4 bytes), matching cudaLaunchKernel's parameter list.
  %grid.packed.bytes = bitcast { i64, i32 }* %grid.packed to i8*
  %grid.bytes = bitcast %struct.dim3* %launch.grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %grid.packed.bytes, i8* align 8 %grid.bytes, i64 12, i1 false)
  %grid.lo.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 0
  %grid.lo = load i64, i64* %grid.lo.ptr, align 8
  %grid.hi.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 1
  %grid.hi = load i32, i32* %grid.hi.ptr, align 8
  ; Same reshaping for the block dim3.
  %block.packed.bytes = bitcast { i64, i32 }* %block.packed to i8*
  %block.bytes = bitcast %struct.dim3* %launch.block to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %block.packed.bytes, i8* align 8 %block.bytes, i64 12, i1 false)
  %block.lo.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 0
  %block.lo = load i64, i64* %block.lo.ptr, align 8
  %block.hi.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 1
  %block.hi = load i32, i32* %block.hi.ptr, align 8
  %stream.typed = bitcast i8* %stream.raw to %struct.CUstream_st*
  ; The stub's own address identifies the kernel to the runtime.
  %launch.status = call i32 @cudaLaunchKernel(i8* bitcast (void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i to i8*), i64 %grid.lo, i32 %grid.hi, i64 %block.lo, i32 %block.hi, i8** %arg_table, i64 %shmem.bytes, %struct.CUstream_st* %stream.typed)
  br label %done

done:                                             ; preds = %entry
  ret void
}

; Called by each kernel stub with pointers to a grid dim3, a block dim3, a
; shared-memory size and a stream slot; the stubs read all four afterwards.
; NOTE(review): presumably returns the configuration recorded by the matching
; __cudaPushCallConfiguration call made before the stub is invoked -- confirm.
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)

; Kernel launch entry point. As called from the stubs, the parameters are:
; kernel identifier, grid dims as (i64, i32), block dims as (i64, i32),
; table of pointers to the kernel arguments, shared-memory size, and stream.
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)

; Function Attrs: argmemonly nounwind willreturn
; LLVM memcpy intrinsic: (destination, source, byte count, is-volatile flag).
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

; Function Attrs: noinline optnone uwtable
; Host-side launch stub for Kernel2(bool*, bool*, bool*, bool*, int).
;
; The stub spills its five arguments to the stack, builds a five-entry table
; of pointers to those spill slots, obtains the launch configuration through
; __cudaPopCallConfiguration, and forwards everything to cudaLaunchKernel using
; the address of this stub as the kernel identifier. The status codes returned
; by both runtime calls are not inspected.
define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
entry:
  ; One spill slot per incoming argument.
  %mask.slot = alloca i8*, align 8
  %updating_mask.slot = alloca i8*, align 8
  %visited.slot = alloca i8*, align 8
  %over.slot = alloca i8*, align 8
  %node_count.slot = alloca i32, align 4
  ; Storage for the launch configuration written by __cudaPopCallConfiguration.
  %launch.grid = alloca %struct.dim3, align 8
  %launch.block = alloca %struct.dim3, align 8
  %launch.shmem = alloca i64, align 8
  %launch.stream = alloca i8*, align 8
  ; Scratch copies of the dim3 values reshaped as { i64, i32 } for the call.
  %grid.packed = alloca { i64, i32 }, align 8
  %block.packed = alloca { i64, i32 }, align 8
  store i8* %g_graph_mask, i8** %mask.slot, align 8
  store i8* %g_updating_graph_mask, i8** %updating_mask.slot, align 8
  store i8* %g_graph_visited, i8** %visited.slot, align 8
  store i8* %g_over, i8** %over.slot, align 8
  store i32 %no_of_nodes, i32* %node_count.slot, align 4
  ; Argument table: entry k holds the address of the k-th spill slot.
  %arg_table = alloca i8*, i64 5, align 16
  %mask.ref = bitcast i8** %mask.slot to i8*
  %arg0.ptr = getelementptr i8*, i8** %arg_table, i32 0
  store i8* %mask.ref, i8** %arg0.ptr
  %updating_mask.ref = bitcast i8** %updating_mask.slot to i8*
  %arg1.ptr = getelementptr i8*, i8** %arg_table, i32 1
  store i8* %updating_mask.ref, i8** %arg1.ptr
  %visited.ref = bitcast i8** %visited.slot to i8*
  %arg2.ptr = getelementptr i8*, i8** %arg_table, i32 2
  store i8* %visited.ref, i8** %arg2.ptr
  %over.ref = bitcast i8** %over.slot to i8*
  %arg3.ptr = getelementptr i8*, i8** %arg_table, i32 3
  store i8* %over.ref, i8** %arg3.ptr
  %node_count.ref = bitcast i32* %node_count.slot to i8*
  %arg4.ptr = getelementptr i8*, i8** %arg_table, i32 4
  store i8* %node_count.ref, i8** %arg4.ptr
  ; Fetch grid, block, shared-memory size and stream for this launch.
  %pop.status = call i32 @__cudaPopCallConfiguration(%struct.dim3* %launch.grid, %struct.dim3* %launch.block, i64* %launch.shmem, i8** %launch.stream)
  %shmem.bytes = load i64, i64* %launch.shmem, align 8
  %stream.raw = load i8*, i8** %launch.stream, align 8
  ; Copy the 12-byte grid dim3 and split it into an i64 (first 8 bytes)
  ; plus an i32 (last 4 bytes), matching cudaLaunchKernel's parameter list.
  %grid.packed.bytes = bitcast { i64, i32 }* %grid.packed to i8*
  %grid.bytes = bitcast %struct.dim3* %launch.grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %grid.packed.bytes, i8* align 8 %grid.bytes, i64 12, i1 false)
  %grid.lo.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 0
  %grid.lo = load i64, i64* %grid.lo.ptr, align 8
  %grid.hi.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid.packed, i32 0, i32 1
  %grid.hi = load i32, i32* %grid.hi.ptr, align 8
  ; Same reshaping for the block dim3.
  %block.packed.bytes = bitcast { i64, i32 }* %block.packed to i8*
  %block.bytes = bitcast %struct.dim3* %launch.block to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %block.packed.bytes, i8* align 8 %block.bytes, i64 12, i1 false)
  %block.lo.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 0
  %block.lo = load i64, i64* %block.lo.ptr, align 8
  %block.hi.ptr = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block.packed, i32 0, i32 1
  %block.hi = load i32, i32* %block.hi.ptr, align 8
  %stream.typed = bitcast i8* %stream.raw to %struct.CUstream_st*
  ; The stub's own address identifies the kernel to the runtime.
  %launch.status = call i32 @cudaLaunchKernel(i8* bitcast (void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i to i8*), i64 %grid.lo, i32 %grid.hi, i64 %block.lo, i32 %block.hi, i8** %arg_table, i64 %shmem.bytes, %struct.CUstream_st* %stream.typed)
  br label %done

done:                                             ; preds = %entry
  ret void
}

; Function Attrs: noinline norecurse optnone uwtable
; Program entry point. Selects device 0 (the returned status is not checked),
; resets the global node and edge counters to zero, then hands argc/argv to
; BFSGraph(int, char**). Always returns 0.
define dso_local i32 @main(i32 %argc, i8** %argv) #2 {
start:
  ; Spill the command-line arguments to the stack.
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  %set_device.status = call i32 @cudaSetDevice(i32 0)
  ; Clear the graph-size globals before the graph is loaded.
  store i32 0, i32* @no_of_nodes, align 4
  store i32 0, i32* @edge_list_size, align 4
  %argc.value = load i32, i32* %argc.slot, align 4
  %argv.value = load i8**, i8*** %argv.slot, align 8
  call void @_Z8BFSGraphiPPc(i32 %argc.value, i8** %argv.value)
  ret i32 0
}

; External runtime function taking a 32-bit device index and returning an
; i32 status; @main calls it with index 0 and ignores the result.
declare dso_local i32 @cudaSetDevice(i32) #3

; Function Attrs: noinline optnone uwtable
define dso_local void @_Z8BFSGraphiPPc(i32 %argc, i8** %argv) #0 {
entry:
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %input_f = alloca i8*, align 8
  %source = alloca i32, align 4
  %num_of_blocks = alloca i32, align 4
  %num_of_threads_per_block = alloca i32, align 4
  %h_graph_nodes = alloca %struct.Node*, align 8
  %h_graph_mask = alloca i8*, align 8
  %h_updating_graph_mask = alloca i8*, align 8
  %h_graph_visited = alloca i8*, align 8
  %start = alloca i32, align 4
  %edgeno = alloca i32, align 4
  %i = alloca i32, align 4
  %id = alloca i32, align 4
  %cost = alloca i32, align 4
  %h_graph_edges = alloca i32*, align 8
  %i41 = alloca i32, align 4
  %d_graph_nodes = alloca %struct.Node*, align 8
  %d_graph_edges = alloca i32*, align 8
  %d_graph_mask = alloca i8*, align 8
  %d_updating_graph_mask = alloca i8*, align 8
  %d_graph_visited = alloca i8*, align 8
  %h_cost = alloca i32*, align 8
  %i90 = alloca i32, align 4
  %d_cost = alloca i32*, align 8
  %d_over = alloca i8*, align 8
  %grid = alloca %struct.dim3, align 4
  %threads = alloca %struct.dim3, align 4
  %k = alloca i32, align 4
  %stop = alloca i8, align 1
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp111 = alloca %struct.dim3, align 4
  %agg.tmp.coerce = alloca { i64, i32 }, align 4
  %agg.tmp111.coerce = alloca { i64, i32 }, align 4
  %agg.tmp115 = alloca %struct.dim3, align 4
  %agg.tmp116 = alloca %struct.dim3, align 4
  %agg.tmp115.coerce = alloca { i64, i32 }, align 4
  %agg.tmp116.coerce = alloca { i64, i32 }, align 4
  %fpo = alloca %struct._IO_FILE*, align 8
  %i130 = alloca i32, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  %0 = load i32, i32* %argc.addr, align 4
  %cmp = icmp ne i32 %0, 2
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, i32* %argc.addr, align 4
  %2 = load i8**, i8*** %argv.addr, align 8
  call void @_Z5UsageiPPc(i32 %1, i8** %2)
  call void @exit(i32 0) #8
  unreachable

if.end:                                           ; preds = %entry
  %3 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %3, i64 1
  %4 = load i8*, i8** %arrayidx, align 8
  store i8* %4, i8** %input_f, align 8
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.1, i64 0, i64 0))
  %5 = load i8*, i8** %input_f, align 8
  %call1 = call %struct._IO_FILE* @fopen(i8* %5, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0))
  store %struct._IO_FILE* %call1, %struct._IO_FILE** @fp, align 8
  %6 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %tobool = icmp ne %struct._IO_FILE* %6, null
  br i1 %tobool, label %if.end4, label %if.then2

if.then2:                                         ; preds = %if.end
  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.3, i64 0, i64 0))
  br label %return

if.end4:                                          ; preds = %if.end
  store i32 0, i32* %source, align 4
  %7 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call5 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* @no_of_nodes)
  store i32 1, i32* %num_of_blocks, align 4
  %8 = load i32, i32* @no_of_nodes, align 4
  store i32 %8, i32* %num_of_threads_per_block, align 4
  %9 = load i32, i32* @no_of_nodes, align 4
  %cmp6 = icmp sgt i32 %9, 512
  br i1 %cmp6, label %if.then7, label %if.end9

if.then7:                                         ; preds = %if.end4
  %10 = load i32, i32* @no_of_nodes, align 4
  %conv = sitofp i32 %10 to double
  %div = fdiv double %conv, 5.120000e+02
  %11 = call double @llvm.ceil.f64(double %div)
  %conv8 = fptosi double %11 to i32
  store i32 %conv8, i32* %num_of_blocks, align 4
  store i32 512, i32* %num_of_threads_per_block, align 4
  br label %if.end9

if.end9:                                          ; preds = %if.then7, %if.end4
  %12 = load i32, i32* @no_of_nodes, align 4
  %conv10 = sext i32 %12 to i64
  %mul = mul i64 8, %conv10
  %call11 = call noalias i8* @malloc(i64 %mul) #9
  %13 = bitcast i8* %call11 to %struct.Node*
  store %struct.Node* %13, %struct.Node** %h_graph_nodes, align 8
  %14 = load i32, i32* @no_of_nodes, align 4
  %conv12 = sext i32 %14 to i64
  %mul13 = mul i64 1, %conv12
  %call14 = call noalias i8* @malloc(i64 %mul13) #9
  store i8* %call14, i8** %h_graph_mask, align 8
  %15 = load i32, i32* @no_of_nodes, align 4
  %conv15 = sext i32 %15 to i64
  %mul16 = mul i64 1, %conv15
  %call17 = call noalias i8* @malloc(i64 %mul16) #9
  store i8* %call17, i8** %h_updating_graph_mask, align 8
  %16 = load i32, i32* @no_of_nodes, align 4
  %conv18 = sext i32 %16 to i64
  %mul19 = mul i64 1, %conv18
  %call20 = call noalias i8* @malloc(i64 %mul19) #9
  store i8* %call20, i8** %h_graph_visited, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %if.end9
  %17 = load i32, i32* %i, align 4
  %18 = load i32, i32* @no_of_nodes, align 4
  %cmp21 = icmp ult i32 %17, %18
  br i1 %cmp21, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %19 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call22 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %19, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.5, i64 0, i64 0), i32* %start, i32* %edgeno)
  %20 = load i32, i32* %start, align 4
  %21 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8
  %22 = load i32, i32* %i, align 4
  %idxprom = zext i32 %22 to i64
  %arrayidx23 = getelementptr inbounds %struct.Node, %struct.Node* %21, i64 %idxprom
  %starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx23, i32 0, i32 0
  store i32 %20, i32* %starting, align 4
  %23 = load i32, i32* %edgeno, align 4
  %24 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8
  %25 = load i32, i32* %i, align 4
  %idxprom24 = zext i32 %25 to i64
  %arrayidx25 = getelementptr inbounds %struct.Node, %struct.Node* %24, i64 %idxprom24
  %no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx25, i32 0, i32 1
  store i32 %23, i32* %no_of_edges, align 4
  %26 = load i8*, i8** %h_graph_mask, align 8
  %27 = load i32, i32* %i, align 4
  %idxprom26 = zext i32 %27 to i64
  %arrayidx27 = getelementptr inbounds i8, i8* %26, i64 %idxprom26
  store i8 0, i8* %arrayidx27, align 1
  %28 = load i8*, i8** %h_updating_graph_mask, align 8
  %29 = load i32, i32* %i, align 4
  %idxprom28 = zext i32 %29 to i64
  %arrayidx29 = getelementptr inbounds i8, i8* %28, i64 %idxprom28
  store i8 0, i8* %arrayidx29, align 1
  %30 = load i8*, i8** %h_graph_visited, align 8
  %31 = load i32, i32* %i, align 4
  %idxprom30 = zext i32 %31 to i64
  %arrayidx31 = getelementptr inbounds i8, i8* %30, i64 %idxprom30
  store i8 0, i8* %arrayidx31, align 1
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %32 = load i32, i32* %i, align 4
  %inc = add i32 %32, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %33 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call32 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %33, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %source)
  store i32 0, i32* %source, align 4
  %34 = load i8*, i8** %h_graph_mask, align 8
  %35 = load i32, i32* %source, align 4
  %idxprom33 = sext i32 %35 to i64
  %arrayidx34 = getelementptr inbounds i8, i8* %34, i64 %idxprom33
  store i8 1, i8* %arrayidx34, align 1
  %36 = load i8*, i8** %h_graph_visited, align 8
  %37 = load i32, i32* %source, align 4
  %idxprom35 = sext i32 %37 to i64
  %arrayidx36 = getelementptr inbounds i8, i8* %36, i64 %idxprom35
  store i8 1, i8* %arrayidx36, align 1
  %38 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call37 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %38, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* @edge_list_size)
  %39 = load i32, i32* @edge_list_size, align 4
  %conv38 = sext i32 %39 to i64
  %mul39 = mul i64 4, %conv38
  %call40 = call noalias i8* @malloc(i64 %mul39) #9
  %40 = bitcast i8* %call40 to i32*
  store i32* %40, i32** %h_graph_edges, align 8
  store i32 0, i32* %i41, align 4
  br label %for.cond42

for.cond42:                                       ; preds = %for.inc49, %for.end
  %41 = load i32, i32* %i41, align 4
  %42 = load i32, i32* @edge_list_size, align 4
  %cmp43 = icmp slt i32 %41, %42
  br i1 %cmp43, label %for.body44, label %for.end51

for.body44:                                       ; preds = %for.cond42
  %43 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call45 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %43, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %id)
  %44 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call46 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* %44, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.4, i64 0, i64 0), i32* %cost)
  %45 = load i32, i32* %id, align 4
  %46 = load i32*, i32** %h_graph_edges, align 8
  %47 = load i32, i32* %i41, align 4
  %idxprom47 = sext i32 %47 to i64
  %arrayidx48 = getelementptr inbounds i32, i32* %46, i64 %idxprom47
  store i32 %45, i32* %arrayidx48, align 4
  br label %for.inc49

for.inc49:                                        ; preds = %for.body44
  %48 = load i32, i32* %i41, align 4
  %inc50 = add nsw i32 %48, 1
  store i32 %inc50, i32* %i41, align 4
  br label %for.cond42

for.end51:                                        ; preds = %for.cond42
  %49 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %tobool52 = icmp ne %struct._IO_FILE* %49, null
  br i1 %tobool52, label %if.then53, label %if.end55

if.then53:                                        ; preds = %for.end51
  %50 = load %struct._IO_FILE*, %struct._IO_FILE** @fp, align 8
  %call54 = call i32 @fclose(%struct._IO_FILE* %50)
  br label %if.end55

if.end55:                                         ; preds = %if.then53, %for.end51
  %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.6, i64 0, i64 0))
  %51 = bitcast %struct.Node** %d_graph_nodes to i8**
  %52 = load i32, i32* @no_of_nodes, align 4
  %conv57 = sext i32 %52 to i64
  %mul58 = mul i64 8, %conv57
  %call59 = call i32 @cudaMalloc(i8** %51, i64 %mul58)
  %53 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8
  %54 = bitcast %struct.Node* %53 to i8*
  %55 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8
  %56 = bitcast %struct.Node* %55 to i8*
  %57 = load i32, i32* @no_of_nodes, align 4
  %conv60 = sext i32 %57 to i64
  %mul61 = mul i64 8, %conv60
  %call62 = call i32 @cudaMemcpy(i8* %54, i8* %56, i64 %mul61, i32 1)
  %58 = bitcast i32** %d_graph_edges to i8**
  %59 = load i32, i32* @edge_list_size, align 4
  %conv63 = sext i32 %59 to i64
  %mul64 = mul i64 4, %conv63
  %call65 = call i32 @cudaMalloc(i8** %58, i64 %mul64)
  %60 = load i32*, i32** %d_graph_edges, align 8
  %61 = bitcast i32* %60 to i8*
  %62 = load i32*, i32** %h_graph_edges, align 8
  %63 = bitcast i32* %62 to i8*
  %64 = load i32, i32* @edge_list_size, align 4
  %conv66 = sext i32 %64 to i64
  %mul67 = mul i64 4, %conv66
  %call68 = call i32 @cudaMemcpy(i8* %61, i8* %63, i64 %mul67, i32 1)
  %65 = load i32, i32* @no_of_nodes, align 4
  %conv69 = sext i32 %65 to i64
  %mul70 = mul i64 1, %conv69
  %call71 = call i32 @cudaMalloc(i8** %d_graph_mask, i64 %mul70)
  %66 = load i8*, i8** %d_graph_mask, align 8
  %67 = load i8*, i8** %h_graph_mask, align 8
  %68 = load i32, i32* @no_of_nodes, align 4
  %conv72 = sext i32 %68 to i64
  %mul73 = mul i64 1, %conv72
  %call74 = call i32 @cudaMemcpy(i8* %66, i8* %67, i64 %mul73, i32 1)
  %69 = load i32, i32* @no_of_nodes, align 4
  %conv75 = sext i32 %69 to i64
  %mul76 = mul i64 1, %conv75
  %call77 = call i32 @cudaMalloc(i8** %d_updating_graph_mask, i64 %mul76)
  %70 = load i8*, i8** %d_updating_graph_mask, align 8
  %71 = load i8*, i8** %h_updating_graph_mask, align 8
  %72 = load i32, i32* @no_of_nodes, align 4
  %conv78 = sext i32 %72 to i64
  %mul79 = mul i64 1, %conv78
  %call80 = call i32 @cudaMemcpy(i8* %70, i8* %71, i64 %mul79, i32 1)
  %73 = load i32, i32* @no_of_nodes, align 4
  %conv81 = sext i32 %73 to i64
  %mul82 = mul i64 1, %conv81
  %call83 = call i32 @cudaMalloc(i8** %d_graph_visited, i64 %mul82)
  %74 = load i8*, i8** %d_graph_visited, align 8
  %75 = load i8*, i8** %h_graph_visited, align 8
  %76 = load i32, i32* @no_of_nodes, align 4
  %conv84 = sext i32 %76 to i64
  %mul85 = mul i64 1, %conv84
  %call86 = call i32 @cudaMemcpy(i8* %74, i8* %75, i64 %mul85, i32 1)
  %77 = load i32, i32* @no_of_nodes, align 4
  %conv87 = sext i32 %77 to i64
  %mul88 = mul i64 4, %conv87
  %call89 = call noalias i8* @malloc(i64 %mul88) #9
  %78 = bitcast i8* %call89 to i32*
  store i32* %78, i32** %h_cost, align 8
  store i32 0, i32* %i90, align 4
  br label %for.cond91

for.cond91:                                       ; preds = %for.inc96, %if.end55
  %79 = load i32, i32* %i90, align 4
  %80 = load i32, i32* @no_of_nodes, align 4
  %cmp92 = icmp slt i32 %79, %80
  br i1 %cmp92, label %for.body93, label %for.end98

for.body93:                                       ; preds = %for.cond91
  %81 = load i32*, i32** %h_cost, align 8
  %82 = load i32, i32* %i90, align 4
  %idxprom94 = sext i32 %82 to i64
  %arrayidx95 = getelementptr inbounds i32, i32* %81, i64 %idxprom94
  store i32 -1, i32* %arrayidx95, align 4
  br label %for.inc96

for.inc96:                                        ; preds = %for.body93
  %83 = load i32, i32* %i90, align 4
  %inc97 = add nsw i32 %83, 1
  store i32 %inc97, i32* %i90, align 4
  br label %for.cond91

for.end98:                                        ; preds = %for.cond91
  %84 = load i32*, i32** %h_cost, align 8
  %85 = load i32, i32* %source, align 4
  %idxprom99 = sext i32 %85 to i64
  %arrayidx100 = getelementptr inbounds i32, i32* %84, i64 %idxprom99
  store i32 0, i32* %arrayidx100, align 4
  %86 = bitcast i32** %d_cost to i8**
  %87 = load i32, i32* @no_of_nodes, align 4
  %conv101 = sext i32 %87 to i64
  %mul102 = mul i64 4, %conv101
  %call103 = call i32 @cudaMalloc(i8** %86, i64 %mul102)
  %88 = load i32*, i32** %d_cost, align 8
  %89 = bitcast i32* %88 to i8*
  %90 = load i32*, i32** %h_cost, align 8
  %91 = bitcast i32* %90 to i8*
  %92 = load i32, i32* @no_of_nodes, align 4
  %conv104 = sext i32 %92 to i64
  %mul105 = mul i64 4, %conv104
  %call106 = call i32 @cudaMemcpy(i8* %89, i8* %91, i64 %mul105, i32 1)
  %call107 = call i32 @cudaMalloc(i8** %d_over, i64 1)
  %call108 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.7, i64 0, i64 0))
  %93 = load i32, i32* %num_of_blocks, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid, i32 %93, i32 1, i32 1)
  %94 = load i32, i32* %num_of_threads_per_block, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %threads, i32 %94, i32 1, i32 1)
  store i32 0, i32* %k, align 4
  %call109 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.8, i64 0, i64 0))
  br label %do.body

do.body:                                          ; preds = %do.cond, %for.end98
  store i8 0, i8* %stop, align 1
  %95 = load i8*, i8** %d_over, align 8
  %call110 = call i32 @cudaMemcpy(i8* %95, i8* %stop, i64 1, i32 1)
  %96 = bitcast %struct.dim3* %agg.tmp to i8*
  %97 = bitcast %struct.dim3* %grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %96, i8* align 4 %97, i64 12, i1 false)
  %98 = bitcast %struct.dim3* %agg.tmp111 to i8*
  %99 = bitcast %struct.dim3* %threads to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %98, i8* align 4 %99, i64 12, i1 false)
  %100 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
  %101 = bitcast %struct.dim3* %agg.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %100, i8* align 4 %101, i64 12, i1 false)
  %102 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
  %103 = load i64, i64* %102, align 4
  %104 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
  %105 = load i32, i32* %104, align 4
  %106 = bitcast { i64, i32 }* %agg.tmp111.coerce to i8*
  %107 = bitcast %struct.dim3* %agg.tmp111 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %106, i8* align 4 %107, i64 12, i1 false)
  %108 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp111.coerce, i32 0, i32 0
  %109 = load i64, i64* %108, align 4
  %110 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp111.coerce, i32 0, i32 1
  %111 = load i32, i32* %110, align 4
  %call112 = call i32 @__cudaPushCallConfiguration(i64 %103, i32 %105, i64 %109, i32 %111, i64 0, i8* null)
  %tobool113 = icmp ne i32 %call112, 0
  br i1 %tobool113, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %do.body
  %112 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8
  %113 = load i32*, i32** %d_graph_edges, align 8
  %114 = load i8*, i8** %d_graph_mask, align 8
  %115 = load i8*, i8** %d_updating_graph_mask, align 8
  %116 = load i8*, i8** %d_graph_visited, align 8
  %117 = load i32*, i32** %d_cost, align 8
  %118 = load i32, i32* @no_of_nodes, align 4
  call void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %112, i32* %113, i8* %114, i8* %115, i8* %116, i32* %117, i32 %118)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %do.body
  %call114 = call i32 @cudaDeviceSynchronize()
  %119 = bitcast %struct.dim3* %agg.tmp115 to i8*
  %120 = bitcast %struct.dim3* %grid to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %119, i8* align 4 %120, i64 12, i1 false)
  %121 = bitcast %struct.dim3* %agg.tmp116 to i8*
  %122 = bitcast %struct.dim3* %threads to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %121, i8* align 4 %122, i64 12, i1 false)
  %123 = bitcast { i64, i32 }* %agg.tmp115.coerce to i8*
  %124 = bitcast %struct.dim3* %agg.tmp115 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %123, i8* align 4 %124, i64 12, i1 false)
  %125 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp115.coerce, i32 0, i32 0
  %126 = load i64, i64* %125, align 4
  %127 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp115.coerce, i32 0, i32 1
  %128 = load i32, i32* %127, align 4
  %129 = bitcast { i64, i32 }* %agg.tmp116.coerce to i8*
  %130 = bitcast %struct.dim3* %agg.tmp116 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %129, i8* align 4 %130, i64 12, i1 false)
  %131 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 0
  %132 = load i64, i64* %131, align 4
  %133 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp116.coerce, i32 0, i32 1
  %134 = load i32, i32* %133, align 4
  %call117 = call i32 @__cudaPushCallConfiguration(i64 %126, i32 %128, i64 %132, i32 %134, i64 0, i8* null)
  %tobool118 = icmp ne i32 %call117, 0
  br i1 %tobool118, label %kcall.end120, label %kcall.configok119

kcall.configok119:                                ; preds = %kcall.end
  %135 = load i8*, i8** %d_graph_mask, align 8
  %136 = load i8*, i8** %d_updating_graph_mask, align 8
  %137 = load i8*, i8** %d_graph_visited, align 8
  %138 = load i8*, i8** %d_over, align 8
  %139 = load i32, i32* @no_of_nodes, align 4
  call void @_Z7Kernel2PbS_S_S_i(i8* %135, i8* %136, i8* %137, i8* %138, i32 %139)
  br label %kcall.end120

kcall.end120:                                     ; preds = %kcall.configok119, %kcall.end
  %call121 = call i32 @cudaDeviceSynchronize()
  %140 = load i8*, i8** %d_over, align 8
  %call122 = call i32 @cudaMemcpy(i8* %stop, i8* %140, i64 1, i32 2)
  %141 = load i32, i32* %k, align 4
  %inc123 = add nsw i32 %141, 1
  store i32 %inc123, i32* %k, align 4
  br label %do.cond

do.cond:                                          ; preds = %kcall.end120
  %142 = load i8, i8* %stop, align 1
  %tobool124 = trunc i8 %142 to i1
  br i1 %tobool124, label %do.body, label %do.end

do.end:                                           ; preds = %do.cond
  %143 = load i32, i32* %k, align 4
  %call125 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.9, i64 0, i64 0), i32 %143)
  %144 = load i32*, i32** %h_cost, align 8
  %145 = bitcast i32* %144 to i8*
  %146 = load i32*, i32** %d_cost, align 8
  %147 = bitcast i32* %146 to i8*
  %148 = load i32, i32* @no_of_nodes, align 4
  %conv126 = sext i32 %148 to i64
  %mul127 = mul i64 4, %conv126
  %call128 = call i32 @cudaMemcpy(i8* %145, i8* %147, i64 %mul127, i32 2)
  %call129 = call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.10, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.11, i64 0, i64 0))
  store %struct._IO_FILE* %call129, %struct._IO_FILE** %fpo, align 8
  store i32 0, i32* %i130, align 4
  br label %for.cond131

for.cond131:                                      ; preds = %for.inc137, %do.end
  %149 = load i32, i32* %i130, align 4
  %150 = load i32, i32* @no_of_nodes, align 4
  %cmp132 = icmp slt i32 %149, %150
  br i1 %cmp132, label %for.body133, label %for.end139

for.body133:                                      ; preds = %for.cond131
  %151 = load %struct._IO_FILE*, %struct._IO_FILE** %fpo, align 8
  %152 = load i32, i32* %i130, align 4
  %153 = load i32*, i32** %h_cost, align 8
  %154 = load i32, i32* %i130, align 4
  %idxprom134 = sext i32 %154 to i64
  %arrayidx135 = getelementptr inbounds i32, i32* %153, i64 %idxprom134
  %155 = load i32, i32* %arrayidx135, align 4
  %call136 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %151, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.12, i64 0, i64 0), i32 %152, i32 %155)
  br label %for.inc137

for.inc137:                                       ; preds = %for.body133
  %156 = load i32, i32* %i130, align 4
  %inc138 = add nsw i32 %156, 1
  store i32 %inc138, i32* %i130, align 4
  br label %for.cond131

for.end139:                                       ; preds = %for.cond131
  %157 = load %struct._IO_FILE*, %struct._IO_FILE** %fpo, align 8
  %call140 = call i32 @fclose(%struct._IO_FILE* %157)
  %call141 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.13, i64 0, i64 0))
  %158 = load %struct.Node*, %struct.Node** %h_graph_nodes, align 8
  %159 = bitcast %struct.Node* %158 to i8*
  call void @free(i8* %159) #9
  %160 = load i32*, i32** %h_graph_edges, align 8
  %161 = bitcast i32* %160 to i8*
  call void @free(i8* %161) #9
  %162 = load i8*, i8** %h_graph_mask, align 8
  call void @free(i8* %162) #9
  %163 = load i8*, i8** %h_updating_graph_mask, align 8
  call void @free(i8* %163) #9
  %164 = load i8*, i8** %h_graph_visited, align 8
  call void @free(i8* %164) #9
  %165 = load i32*, i32** %h_cost, align 8
  %166 = bitcast i32* %165 to i8*
  call void @free(i8* %166) #9
  %167 = load %struct.Node*, %struct.Node** %d_graph_nodes, align 8
  %168 = bitcast %struct.Node* %167 to i8*
  %call142 = call i32 @cudaFree(i8* %168)
  %169 = load i32*, i32** %d_graph_edges, align 8
  %170 = bitcast i32* %169 to i8*
  %call143 = call i32 @cudaFree(i8* %170)
  %171 = load i8*, i8** %d_graph_mask, align 8
  %call144 = call i32 @cudaFree(i8* %171)
  %172 = load i8*, i8** %d_updating_graph_mask, align 8
  %call145 = call i32 @cudaFree(i8* %172)
  %173 = load i8*, i8** %d_graph_visited, align 8
  %call146 = call i32 @cudaFree(i8* %173)
  %174 = load i32*, i32** %d_cost, align 8
  %175 = bitcast i32* %174 to i8*
  %call147 = call i32 @cudaFree(i8* %175)
  br label %return

return:                                           ; preds = %for.end139, %if.then2
  ret void
}

; Function Attrs: noinline optnone uwtable
; Usage(int, char**): prints the usage banner (@.str, "Usage: %s <input_file>\n")
; to stderr, substituting argv[0] for the %s. Both arguments are spilled to
; stack slots; argc is stored but never read back.
define dso_local void @_Z5UsageiPPc(i32 %argc, i8** %argv) #0 {
entry:
  ; Stack homes for the incoming arguments.
  %argc.slot = alloca i32, align 4
  %argv.slot = alloca i8**, align 8
  store i32 %argc, i32* %argc.slot, align 4
  store i8** %argv, i8*** %argv.slot, align 8
  ; Fetch the current stderr stream and argv[0].
  %err.stream = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %argv.val = load i8**, i8*** %argv.slot, align 8
  %argv0.ptr = getelementptr inbounds i8*, i8** %argv.val, i64 0
  %prog.name = load i8*, i8** %argv0.ptr, align 8
  ; fprintf's return value is ignored.
  %fprintf.rc = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %err.stream, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i8* %prog.name)
  ret void
}

declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #3

; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #4

declare dso_local i32 @printf(i8*, ...) #3

declare dso_local %struct._IO_FILE* @fopen(i8*, i8*) #3

declare dso_local i32 @fscanf(%struct._IO_FILE*, i8*, ...) #3

; Function Attrs: nounwind readnone speculatable willreturn
declare double @llvm.ceil.f64(double) #5

; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #6

declare dso_local i32 @fclose(%struct._IO_FILE*) #3

declare dso_local i32 @cudaMalloc(i8**, i64) #3

declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #3

; Function Attrs: noinline nounwind optnone uwtable
; dim3::dim3(unsigned, unsigned, unsigned): copies vx, vy and vz into fields
; 0, 1 and 2 of the %struct.dim3 pointed to by %this. All arguments are first
; spilled to stack slots and reloaded before use.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #7 comdat align 2 {
entry:
  ; Stack homes for the object pointer and the three extents.
  %this.slot = alloca %struct.dim3*, align 8
  %vx.slot = alloca i32, align 4
  %vy.slot = alloca i32, align 4
  %vz.slot = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %this.slot, align 8
  store i32 %vx, i32* %vx.slot, align 4
  store i32 %vy, i32* %vy.slot, align 4
  store i32 %vz, i32* %vz.slot, align 4
  %self = load %struct.dim3*, %struct.dim3** %this.slot, align 8
  ; field 0 <- vx
  %x.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 0
  %vx.val = load i32, i32* %vx.slot, align 4
  store i32 %vx.val, i32* %x.field, align 4
  ; field 1 <- vy
  %y.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 1
  %vy.val = load i32, i32* %vy.slot, align 4
  store i32 %vy.val, i32* %y.field, align 4
  ; field 2 <- vz
  %z.field = getelementptr inbounds %struct.dim3, %struct.dim3* %self, i32 0, i32 2
  %vz.val = load i32, i32* %vz.slot, align 4
  store i32 %vz.val, i32* %z.field, align 4
  ret void
}

declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #3

declare dso_local i32 @cudaDeviceSynchronize() #3

; Function Attrs: nounwind
declare dso_local void @free(i8*) #6

declare dso_local i32 @cudaFree(i8*) #3

; Registers the two kernel host stubs with the runtime through
; __cudaRegisterFunction, using the handle passed in %0. Each call passes the
; stub address plus the same string global twice (@0 for Kernel, @1 for
; Kernel2 -- presumably the kernel names; their initializers are not visible
; here), a thread limit of -1 and null for every remaining pointer argument.
; Both return codes are discarded.
define internal void @__cuda_register_globals(i8** %0) {
entry:
  ; Kernel(Node*, int*, bool*, bool*, bool*, int*, int)
  %reg.kernel = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i to i8*), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([30 x i8], [30 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ; Kernel2(bool*, bool*, bool*, bool*, int)
  %reg.kernel2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i to i8*), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}

declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)

declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)

declare dso_local i8** @__cudaRegisterFatBinary(i8*)

; Module constructor for the CUDA glue. In order:
;   1. registers @__cuda_fatbin_wrapper via __cudaRegisterFatBinary,
;   2. saves the returned handle in @__cuda_gpubin_handle,
;   3. registers the kernels through @__cuda_register_globals,
;   4. calls __cudaRegisterFatBinaryEnd on the handle,
;   5. schedules @__cuda_module_dtor with atexit (return code ignored).
; The i8* parameter %0 is unused.
define internal void @__cuda_module_ctor(i8* %0) {
entry:
  %fatbin.handle = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %fatbin.handle, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %fatbin.handle)
  call void @__cudaRegisterFatBinaryEnd(i8** %fatbin.handle)
  %atexit.rc = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}

declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)

declare dso_local void @__cudaUnregisterFatBinary(i8**)

; Module destructor for the CUDA glue: reloads the handle saved in
; @__cuda_gpubin_handle and passes it to __cudaUnregisterFatBinary.
; The i8* parameter %0 is unused.
define internal void @__cuda_module_dtor(i8* %0) {
entry:
  %fatbin.handle = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %fatbin.handle)
  ret void
}

declare dso_local i32 @atexit(void (i8*)*)

attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind readnone speculatable willreturn }
attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { noreturn nounwind }
attributes #9 = { nounwind }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}