; CuPBoP/examples/backprop/backprop_cuda-host-x86_64-unknown-linux-gnu
; (Web-viewer header converted to IR comments so this file parses as LLVM IR.
;  Original listing metadata: 895 lines, 112 KiB, language: LLVM,
;  "Raw Normal View History", snapshot dated 2022-05-04 20:59:38 +08:00.)
; ModuleID = 'backprop_cuda-host-x86_64-unknown-linux-gnu.bc'
source_filename = "backprop_cuda.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque
%struct.timeval = type { i64, i64 }
%struct.timezone = type { i32, i32 }
%struct.BPNN = type { i32, i32, i32, float*, float*, float*, float*, float*, float*, float**, float**, float**, float** }
$_ZN4dim3C2Ejjj = comdat any
$_ZSt3expf = comdat any
@num_threads = dso_local global i32 0, align 4
@num_blocks = dso_local global i32 0, align 4
@.str = private unnamed_addr constant [28 x i8] c"Performing GPU computation\0A\00", align 1
@.str.1 = private unnamed_addr constant [23 x i8] c"bpnn kernel error: %s\0A\00", align 1
@.str.2 = private unnamed_addr constant [4 x i8] c"%f \00", align 1
@.str.3 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
@0 = private unnamed_addr constant [37 x i8] c"_Z22bpnn_layerforward_CUDAPfS_S_S_ii\00", align 1
@1 = private unnamed_addr constant [39 x i8] c"_Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00", align 1
@2 = private constant [26889 x i8] c"P\EDU\BA\01\00\10\00\F8h\00\00\00\00\00\00\02\00\01\01@\00\00\00xY\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00=\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00e\00\00\00\00\00\00\00\00\00\00\00\D0X\00\00\00\00\00\00\10U\00\00\00\00\00\00=\05=\00@\008\00\03\00@\00\0F\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.info._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.shared._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.global\00.nv.global.init\00.nv.constant2._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.constant0._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.text._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.info._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.shared._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.constant0._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00_Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.text._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.info._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.shared._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00.nv.global\00blockIdx\00threadIdx\00.nv.global.init\00$str\00.nv.constant2._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00__ocg_const\00.nv.constant0._Z24bpnn_adjust_weights_cudaPfiS_iS_S_\00_param\00_Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.text._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.info._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00.nv.shared._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00$___ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node__186\00$___ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix__188\00.nv.constant0._Z22bpnn_layerforward_CUDAPfS_S_S_ii\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00Y\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E8\00\00\00\03\00\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\F3\00\00\00\01\00\0D\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\FC\00\00\00\01\00\0D\00\01\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\06\01\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\16\01\00\00\01\00\0C\00\00\00\00\00\00\00\00\00\0B\00\00\00\00\00\00\00\1B\01\00\00\03\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\\\01\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\BD\01\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\16\02\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\BF\02\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\12\10\0A\00\00\00\00\00\00\00\00\00\80\1D\00\00\00\00\00\00\98\01\00\00\12\10\0B\00\00\00\00\00\00\00\00\00\80,\00\00\00\00\00\00\04/\08\00\0D\00\00\00\10\00\00\00\04#\08\00\0D\00\00\00\00\00\00\00\04\12\08\00\0D\00\00\00X\00\00\00\04\11\08\00\0D\00\00\00X\00\00\00\04/\08\00\0C\00\00\00\10\00\00\00\04#\08\00\0C\00\00\00\00\00\00\00\04\12\08\00\0C\00\00\00H\00\00\00\04\11\08\00\0C\00\00\00H\00\00\00\010\00\00\01*\00\00\04\0A\08\00\08\00\00\00@\010\00\03\190\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0\11\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0\11\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00X\05\00\00\04\1C\04\00p\1D\00\00\04\1E\04\000\00\00\00\010\00\00\01*\00\00\04\0A\08\00\0B\00\00\00@\01(\00\03\19(\00\04\17\0C\00\00\00\00\00\05\00$\00\00\F0\11\00\04\17\0C\00\00\00\00\00\04\00 
\00\00\F0\11\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\04\1D\04\00H\05\00\00\04\1C\04\008,\00\00\04\1E\04\00@\00\00\00333333\D3?\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
@__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } { i32 1180844977, i32 1, i8* getelementptr inbounds ([26889 x i8], [26889 x i8]* @2, i64 0, i64 0), i8* null }, section ".nvFatBinSegment", align 8
@__cuda_gpubin_handle = internal global i8** null, align 8
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* bitcast (void (i8*)* @__cuda_module_ctor to void ()*), i8* null }]
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the CUDA kernel bpnn_layerforward_CUDA
; (mangled: _Z22bpnn_layerforward_CUDAPfS_S_S_ii).  This is the pattern clang
; emits for a __global__ function on the host path: spill each argument to a
; stack slot, gather pointers to those slots into %kernel_args, pop the launch
; configuration that the <<<grid, block, shmem, stream>>> syntax previously
; pushed (__cudaPopCallConfiguration), then forward everything to
; cudaLaunchKernel.  The kernel itself lives in the fatbin blob (@2).
define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
entry:
; Stack slots for the six kernel parameters; cudaLaunchKernel receives an
; array of pointers to these slots, not the argument values themselves.
%input_cuda.addr = alloca float*, align 8
%output_hidden_cuda.addr = alloca float*, align 8
%input_hidden_cuda.addr = alloca float*, align 8
%hidden_partial_sum.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%hid.addr = alloca i32, align 4
; Launch-configuration out-parameters filled in by __cudaPopCallConfiguration.
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
; dim3 (three i32s, 12 bytes) is passed to cudaLaunchKernel coerced into the
; x86-64 SysV two-register form { i64, i32 }.
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store float* %input_cuda, float** %input_cuda.addr, align 8
store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
store i32 %in, i32* %in.addr, align 4
store i32 %hid, i32* %hid.addr, align 4
; Build the i8*[6] argument-pointer array, one entry per kernel parameter,
; in declaration order.
%kernel_args = alloca i8*, i64 6, align 16
%0 = bitcast float** %input_cuda.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast float** %output_hidden_cuda.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast float** %input_hidden_cuda.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
%6 = bitcast float** %hidden_partial_sum.addr to i8*
%7 = getelementptr i8*, i8** %kernel_args, i32 3
store i8* %6, i8** %7
%8 = bitcast i32* %in.addr to i8*
%9 = getelementptr i8*, i8** %kernel_args, i32 4
store i8* %8, i8** %9
%10 = bitcast i32* %hid.addr to i8*
%11 = getelementptr i8*, i8** %kernel_args, i32 5
store i8* %10, i8** %11
; Retrieve the grid/block dims, shared-memory size and stream pushed by the
; caller's kernel-launch expression.  NOTE(review): the i32 return value (%12)
; is discarded here, matching clang's usual codegen for this stub.
%12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%13 = load i64, i64* %shmem_size, align 8
%14 = load i8*, i8** %stream, align 8
; Coerce grid_dim (12-byte dim3) into { i64, i32 } and load the two pieces.
%15 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%16 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false)
%17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%18 = load i64, i64* %17, align 8
%19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%20 = load i32, i32* %19, align 8
; Same coercion for block_dim.
%21 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%22 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false)
%23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%24 = load i64, i64* %23, align 8
%25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%26 = load i32, i32* %25, align 8
%27 = bitcast i8* %14 to %struct.CUstream_st*
; Launch: the first argument is this stub's own address, which the CUDA
; runtime maps to the registered device function.  Error code is ignored.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
declare dso_local i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**)
declare dso_local i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*)
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
; Function Attrs: noinline optnone uwtable
; Host-side launch stub for the CUDA kernel bpnn_adjust_weights_cuda
; (mangled: _Z24bpnn_adjust_weights_cudaPfiS_iS_S_).  Structurally identical
; to the layerforward stub above: spill arguments, build the kernel_args
; pointer array, pop the pushed launch configuration, and hand off to
; cudaLaunchKernel with this stub's address as the kernel handle.
define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
entry:
; Stack slots for the six kernel parameters (pointer/i32 mix).
%delta.addr = alloca float*, align 8
%hid.addr = alloca i32, align 4
%ly.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%w.addr = alloca float*, align 8
%oldw.addr = alloca float*, align 8
; Launch-configuration out-parameters for __cudaPopCallConfiguration.
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
; 12-byte dim3 values coerced to the SysV { i64, i32 } register pair.
%grid_dim.coerce = alloca { i64, i32 }, align 8
%block_dim.coerce = alloca { i64, i32 }, align 8
store float* %delta, float** %delta.addr, align 8
store i32 %hid, i32* %hid.addr, align 4
store float* %ly, float** %ly.addr, align 8
store i32 %in, i32* %in.addr, align 4
store float* %w, float** %w.addr, align 8
store float* %oldw, float** %oldw.addr, align 8
; i8*[6] of pointers to the spilled arguments, in declaration order.
%kernel_args = alloca i8*, i64 6, align 16
%0 = bitcast float** %delta.addr to i8*
%1 = getelementptr i8*, i8** %kernel_args, i32 0
store i8* %0, i8** %1
%2 = bitcast i32* %hid.addr to i8*
%3 = getelementptr i8*, i8** %kernel_args, i32 1
store i8* %2, i8** %3
%4 = bitcast float** %ly.addr to i8*
%5 = getelementptr i8*, i8** %kernel_args, i32 2
store i8* %4, i8** %5
%6 = bitcast i32* %in.addr to i8*
%7 = getelementptr i8*, i8** %kernel_args, i32 3
store i8* %6, i8** %7
%8 = bitcast float** %w.addr to i8*
%9 = getelementptr i8*, i8** %kernel_args, i32 4
store i8* %8, i8** %9
%10 = bitcast float** %oldw.addr to i8*
%11 = getelementptr i8*, i8** %kernel_args, i32 5
store i8* %10, i8** %11
; Pop grid/block dims, shmem size and stream; return value %12 is unused.
%12 = call i32 @__cudaPopCallConfiguration(%struct.dim3* %grid_dim, %struct.dim3* %block_dim, i64* %shmem_size, i8** %stream)
%13 = load i64, i64* %shmem_size, align 8
%14 = load i8*, i8** %stream, align 8
; Coerce grid_dim into { i64, i32 }.
%15 = bitcast { i64, i32 }* %grid_dim.coerce to i8*
%16 = bitcast %struct.dim3* %grid_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %15, i8* align 8 %16, i64 12, i1 false)
%17 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 0
%18 = load i64, i64* %17, align 8
%19 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %grid_dim.coerce, i32 0, i32 1
%20 = load i32, i32* %19, align 8
; Coerce block_dim into { i64, i32 }.
%21 = bitcast { i64, i32 }* %block_dim.coerce to i8*
%22 = bitcast %struct.dim3* %block_dim to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %21, i8* align 8 %22, i64 12, i1 false)
%23 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 0
%24 = load i64, i64* %23, align 8
%25 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %block_dim.coerce, i32 0, i32 1
%26 = load i32, i32* %25, align 8
%27 = bitcast i8* %14 to %struct.CUstream_st*
; Launch the kernel; the cudaError_t result (%call) is discarded.
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_ to i8*), i64 %18, i32 %20, i64 %24, i32 %26, i8** %kernel_args, i64 %13, %struct.CUstream_st* %27)
br label %setup.end
setup.end: ; preds = %entry
ret void
}
; Function Attrs: noinline nounwind optnone uwtable
; gettime() (mangled _Z7gettimev): returns the current wall-clock time in
; seconds as a double, computed from gettimeofday() as
;   tv_sec + tv_usec * 1.0e-6
; The hex float 0x3EB0C6F7A0B5ED8D is the IEEE-754 double bit pattern of
; 1.0e-6.  The timezone argument is passed as null (unused).
define dso_local double @_Z7gettimev() #2 {
entry:
%t = alloca %struct.timeval, align 8
; gettimeofday's i32 return value is ignored.
%call = call i32 @gettimeofday(%struct.timeval* %t, %struct.timezone* null) #7
%tv_sec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 0
%0 = load i64, i64* %tv_sec, align 8
%conv = sitofp i64 %0 to double
%tv_usec = getelementptr inbounds %struct.timeval, %struct.timeval* %t, i32 0, i32 1
%1 = load i64, i64* %tv_usec, align 8
%conv1 = sitofp i64 %1 to double
; Microseconds scaled to seconds; 'contract' permits FMA fusion here.
%mul = fmul contract double %conv1, 0x3EB0C6F7A0B5ED8D
%add = fadd contract double %conv, %mul
ret double %add
}
; Function Attrs: nounwind
declare dso_local i32 @gettimeofday(%struct.timeval*, %struct.timezone*) #3
; Function Attrs: noinline norecurse optnone uwtable
; Program entry point: selects CUDA device 0, then delegates all benchmark
; work to setup(argc, argv).  Both cudaSetDevice's and setup's return values
; are discarded; main always returns 0.
define dso_local i32 @main(i32 %argc, i8** %argv) #4 {
entry:
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
; Pin execution to device 0 before any other CUDA call.
%call = call i32 @cudaSetDevice(i32 0)
%0 = load i32, i32* %argc.addr, align 4
%1 = load i8**, i8*** %argv.addr, align 8
; setup() is defined elsewhere in the benchmark (backprop.c side).
%call1 = call i32 @setup(i32 %0, i8** %1)
ret i32 0
}
declare dso_local i32 @cudaSetDevice(i32) #5
declare dso_local i32 @setup(i32, i8**) #5
; Function Attrs: noinline optnone uwtable
define dso_local void @bpnn_train_cuda(%struct.BPNN* %net, float* %eo, float* %eh) #0 {
entry:
%net.addr = alloca %struct.BPNN*, align 8
%eo.addr = alloca float*, align 8
%eh.addr = alloca float*, align 8
%in = alloca i32, align 4
%hid = alloca i32, align 4
%out = alloca i32, align 4
%out_err = alloca float, align 4
%hid_err = alloca float, align 4
%m = alloca i32, align 4
%input_hidden_cuda = alloca float*, align 8
%input_cuda = alloca float*, align 8
%output_hidden_cuda = alloca float*, align 8
%partial_sum = alloca float*, align 8
%hidden_partial_sum = alloca float*, align 8
%hidden_delta_cuda = alloca float*, align 8
%input_prev_weights_cuda = alloca float*, align 8
%sum = alloca float, align 4
%input_weights_one_dim = alloca float*, align 8
%input_weights_prev_one_dim = alloca float*, align 8
%grid = alloca %struct.dim3, align 4
%threads = alloca %struct.dim3, align 4
%k = alloca i32, align 4
%j = alloca i32, align 4
%agg.tmp = alloca %struct.dim3, align 4
%agg.tmp59 = alloca %struct.dim3, align 4
%agg.tmp.coerce = alloca { i64, i32 }, align 4
%agg.tmp59.coerce = alloca { i64, i32 }, align 4
%error = alloca i32, align 4
%j70 = alloca i32, align 4
%k74 = alloca i32, align 4
%agg.tmp136 = alloca %struct.dim3, align 4
%agg.tmp137 = alloca %struct.dim3, align 4
%agg.tmp136.coerce = alloca { i64, i32 }, align 4
%agg.tmp137.coerce = alloca { i64, i32 }, align 4
%i = alloca i32, align 4
store %struct.BPNN* %net, %struct.BPNN** %net.addr, align 8
store float* %eo, float** %eo.addr, align 8
store float* %eh, float** %eh.addr, align 8
%0 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_n = getelementptr inbounds %struct.BPNN, %struct.BPNN* %0, i32 0, i32 0
%1 = load i32, i32* %input_n, align 8
store i32 %1, i32* %in, align 4
%2 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_n = getelementptr inbounds %struct.BPNN, %struct.BPNN* %2, i32 0, i32 1
%3 = load i32, i32* %hidden_n, align 4
store i32 %3, i32* %hid, align 4
%4 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_n = getelementptr inbounds %struct.BPNN, %struct.BPNN* %4, i32 0, i32 2
%5 = load i32, i32* %output_n, align 8
store i32 %5, i32* %out, align 4
store i32 0, i32* %m, align 4
%6 = load i32, i32* %in, align 4
%div = sdiv i32 %6, 16
store i32 %div, i32* @num_blocks, align 4
%7 = load i32, i32* @num_blocks, align 4
call void @_ZN4dim3C2Ejjj(%struct.dim3* %grid, i32 1, i32 %7, i32 1)
call void @_ZN4dim3C2Ejjj(%struct.dim3* %threads, i32 16, i32 16, i32 1)
%8 = load i32, i32* %in, align 4
%add = add nsw i32 %8, 1
%9 = load i32, i32* %hid, align 4
%add1 = add nsw i32 %9, 1
%mul = mul nsw i32 %add, %add1
%conv = sext i32 %mul to i64
%mul2 = mul i64 %conv, 4
%call = call noalias i8* @malloc(i64 %mul2) #7
%10 = bitcast i8* %call to float*
store float* %10, float** %input_weights_one_dim, align 8
%11 = load i32, i32* %in, align 4
%add3 = add nsw i32 %11, 1
%12 = load i32, i32* %hid, align 4
%add4 = add nsw i32 %12, 1
%mul5 = mul nsw i32 %add3, %add4
%conv6 = sext i32 %mul5 to i64
%mul7 = mul i64 %conv6, 4
%call8 = call noalias i8* @malloc(i64 %mul7) #7
%13 = bitcast i8* %call8 to float*
store float* %13, float** %input_weights_prev_one_dim, align 8
%14 = load i32, i32* @num_blocks, align 4
%mul9 = mul i32 %14, 16
%conv10 = zext i32 %mul9 to i64
%mul11 = mul i64 %conv10, 4
%call12 = call noalias i8* @malloc(i64 %mul11) #7
%15 = bitcast i8* %call12 to float*
store float* %15, float** %partial_sum, align 8
store i32 0, i32* %k, align 4
br label %for.cond
for.cond: ; preds = %for.inc27, %entry
%16 = load i32, i32* %k, align 4
%17 = load i32, i32* %in, align 4
%cmp = icmp sle i32 %16, %17
br i1 %cmp, label %for.body, label %for.end29
for.body: ; preds = %for.cond
store i32 0, i32* %j, align 4
br label %for.cond13
for.cond13: ; preds = %for.inc, %for.body
%18 = load i32, i32* %j, align 4
%19 = load i32, i32* %hid, align 4
%cmp14 = icmp sle i32 %18, %19
br i1 %cmp14, label %for.body15, label %for.end
for.body15: ; preds = %for.cond13
%20 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %20, i32 0, i32 9
%21 = load float**, float*** %input_weights, align 8
%22 = load i32, i32* %k, align 4
%idxprom = sext i32 %22 to i64
%arrayidx = getelementptr inbounds float*, float** %21, i64 %idxprom
%23 = load float*, float** %arrayidx, align 8
%24 = load i32, i32* %j, align 4
%idxprom16 = sext i32 %24 to i64
%arrayidx17 = getelementptr inbounds float, float* %23, i64 %idxprom16
%25 = load float, float* %arrayidx17, align 4
%26 = load float*, float** %input_weights_one_dim, align 8
%27 = load i32, i32* %m, align 4
%idxprom18 = sext i32 %27 to i64
%arrayidx19 = getelementptr inbounds float, float* %26, i64 %idxprom18
store float %25, float* %arrayidx19, align 4
%28 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_prev_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %28, i32 0, i32 11
%29 = load float**, float*** %input_prev_weights, align 8
%30 = load i32, i32* %k, align 4
%idxprom20 = sext i32 %30 to i64
%arrayidx21 = getelementptr inbounds float*, float** %29, i64 %idxprom20
%31 = load float*, float** %arrayidx21, align 8
%32 = load i32, i32* %j, align 4
%idxprom22 = sext i32 %32 to i64
%arrayidx23 = getelementptr inbounds float, float* %31, i64 %idxprom22
%33 = load float, float* %arrayidx23, align 4
%34 = load float*, float** %input_weights_prev_one_dim, align 8
%35 = load i32, i32* %m, align 4
%idxprom24 = sext i32 %35 to i64
%arrayidx25 = getelementptr inbounds float, float* %34, i64 %idxprom24
store float %33, float* %arrayidx25, align 4
%36 = load i32, i32* %m, align 4
%inc = add nsw i32 %36, 1
store i32 %inc, i32* %m, align 4
br label %for.inc
for.inc: ; preds = %for.body15
%37 = load i32, i32* %j, align 4
%inc26 = add nsw i32 %37, 1
store i32 %inc26, i32* %j, align 4
br label %for.cond13
for.end: ; preds = %for.cond13
br label %for.inc27
for.inc27: ; preds = %for.end
%38 = load i32, i32* %k, align 4
%inc28 = add nsw i32 %38, 1
store i32 %inc28, i32* %k, align 4
br label %for.cond
for.end29: ; preds = %for.cond
%39 = bitcast float** %input_cuda to i8**
%40 = load i32, i32* %in, align 4
%add30 = add nsw i32 %40, 1
%conv31 = sext i32 %add30 to i64
%mul32 = mul i64 %conv31, 4
%call33 = call i32 @cudaMalloc(i8** %39, i64 %mul32)
%41 = bitcast float** %output_hidden_cuda to i8**
%42 = load i32, i32* %hid, align 4
%add34 = add nsw i32 %42, 1
%conv35 = sext i32 %add34 to i64
%mul36 = mul i64 %conv35, 4
%call37 = call i32 @cudaMalloc(i8** %41, i64 %mul36)
%43 = bitcast float** %input_hidden_cuda to i8**
%44 = load i32, i32* %in, align 4
%add38 = add nsw i32 %44, 1
%45 = load i32, i32* %hid, align 4
%add39 = add nsw i32 %45, 1
%mul40 = mul nsw i32 %add38, %add39
%conv41 = sext i32 %mul40 to i64
%mul42 = mul i64 %conv41, 4
%call43 = call i32 @cudaMalloc(i8** %43, i64 %mul42)
%46 = bitcast float** %hidden_partial_sum to i8**
%47 = load i32, i32* @num_blocks, align 4
%mul44 = mul i32 %47, 16
%conv45 = zext i32 %mul44 to i64
%mul46 = mul i64 %conv45, 4
%call47 = call i32 @cudaMalloc(i8** %46, i64 %mul46)
%call48 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str, i64 0, i64 0))
%48 = load float*, float** %input_cuda, align 8
%49 = bitcast float* %48 to i8*
%50 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %50, i32 0, i32 3
%51 = load float*, float** %input_units, align 8
%52 = bitcast float* %51 to i8*
%53 = load i32, i32* %in, align 4
%add49 = add nsw i32 %53, 1
%conv50 = sext i32 %add49 to i64
%mul51 = mul i64 %conv50, 4
%call52 = call i32 @cudaMemcpy(i8* %49, i8* %52, i64 %mul51, i32 1)
%54 = load float*, float** %input_hidden_cuda, align 8
%55 = bitcast float* %54 to i8*
%56 = load float*, float** %input_weights_one_dim, align 8
%57 = bitcast float* %56 to i8*
%58 = load i32, i32* %in, align 4
%add53 = add nsw i32 %58, 1
%59 = load i32, i32* %hid, align 4
%add54 = add nsw i32 %59, 1
%mul55 = mul nsw i32 %add53, %add54
%conv56 = sext i32 %mul55 to i64
%mul57 = mul i64 %conv56, 4
%call58 = call i32 @cudaMemcpy(i8* %55, i8* %57, i64 %mul57, i32 1)
%60 = bitcast %struct.dim3* %agg.tmp to i8*
%61 = bitcast %struct.dim3* %grid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %60, i8* align 4 %61, i64 12, i1 false)
%62 = bitcast %struct.dim3* %agg.tmp59 to i8*
%63 = bitcast %struct.dim3* %threads to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %62, i8* align 4 %63, i64 12, i1 false)
%64 = bitcast { i64, i32 }* %agg.tmp.coerce to i8*
%65 = bitcast %struct.dim3* %agg.tmp to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %64, i8* align 4 %65, i64 12, i1 false)
%66 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 0
%67 = load i64, i64* %66, align 4
%68 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp.coerce, i32 0, i32 1
%69 = load i32, i32* %68, align 4
%70 = bitcast { i64, i32 }* %agg.tmp59.coerce to i8*
%71 = bitcast %struct.dim3* %agg.tmp59 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %70, i8* align 4 %71, i64 12, i1 false)
%72 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp59.coerce, i32 0, i32 0
%73 = load i64, i64* %72, align 4
%74 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp59.coerce, i32 0, i32 1
%75 = load i32, i32* %74, align 4
%call60 = call i32 @__cudaPushCallConfiguration(i64 %67, i32 %69, i64 %73, i32 %75, i64 0, i8* null)
%tobool = icmp ne i32 %call60, 0
br i1 %tobool, label %kcall.end, label %kcall.configok
kcall.configok: ; preds = %for.end29
%76 = load float*, float** %input_cuda, align 8
%77 = load float*, float** %output_hidden_cuda, align 8
%78 = load float*, float** %input_hidden_cuda, align 8
%79 = load float*, float** %hidden_partial_sum, align 8
%80 = load i32, i32* %in, align 4
%81 = load i32, i32* %hid, align 4
call void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %76, float* %77, float* %78, float* %79, i32 %80, i32 %81)
br label %kcall.end
kcall.end: ; preds = %kcall.configok, %for.end29
%call61 = call i32 @cudaThreadSynchronize()
%call62 = call i32 @cudaGetLastError()
store i32 %call62, i32* %error, align 4
%82 = load i32, i32* %error, align 4
%cmp63 = icmp ne i32 %82, 0
br i1 %cmp63, label %if.then, label %if.end
if.then: ; preds = %kcall.end
%83 = load i32, i32* %error, align 4
%call64 = call i8* @cudaGetErrorString(i32 %83)
%call65 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.1, i64 0, i64 0), i8* %call64)
call void @exit(i32 1) #8
unreachable
if.end: ; preds = %kcall.end
%84 = load float*, float** %partial_sum, align 8
%85 = bitcast float* %84 to i8*
%86 = load float*, float** %hidden_partial_sum, align 8
%87 = bitcast float* %86 to i8*
%88 = load i32, i32* @num_blocks, align 4
%mul66 = mul i32 %88, 16
%conv67 = zext i32 %mul66 to i64
%mul68 = mul i64 %conv67, 4
%call69 = call i32 @cudaMemcpy(i8* %85, i8* %87, i64 %mul68, i32 2)
store i32 1, i32* %j70, align 4
br label %for.cond71
for.cond71: ; preds = %for.inc98, %if.end
%89 = load i32, i32* %j70, align 4
%90 = load i32, i32* %hid, align 4
%cmp72 = icmp sle i32 %89, %90
br i1 %cmp72, label %for.body73, label %for.end100
for.body73: ; preds = %for.cond71
store float 0.000000e+00, float* %sum, align 4
store i32 0, i32* %k74, align 4
br label %for.cond75
for.cond75: ; preds = %for.inc83, %for.body73
%91 = load i32, i32* %k74, align 4
%92 = load i32, i32* @num_blocks, align 4
%cmp76 = icmp ult i32 %91, %92
br i1 %cmp76, label %for.body77, label %for.end85
for.body77: ; preds = %for.cond75
%93 = load float*, float** %partial_sum, align 8
%94 = load i32, i32* %k74, align 4
%95 = load i32, i32* %hid, align 4
%mul78 = mul nsw i32 %94, %95
%96 = load i32, i32* %j70, align 4
%add79 = add nsw i32 %mul78, %96
%sub = sub nsw i32 %add79, 1
%idxprom80 = sext i32 %sub to i64
%arrayidx81 = getelementptr inbounds float, float* %93, i64 %idxprom80
%97 = load float, float* %arrayidx81, align 4
%98 = load float, float* %sum, align 4
%add82 = fadd contract float %98, %97
store float %add82, float* %sum, align 4
br label %for.inc83
for.inc83: ; preds = %for.body77
%99 = load i32, i32* %k74, align 4
%inc84 = add nsw i32 %99, 1
store i32 %inc84, i32* %k74, align 4
br label %for.cond75
for.end85: ; preds = %for.cond75
%100 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_weights86 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %100, i32 0, i32 9
%101 = load float**, float*** %input_weights86, align 8
%arrayidx87 = getelementptr inbounds float*, float** %101, i64 0
%102 = load float*, float** %arrayidx87, align 8
%103 = load i32, i32* %j70, align 4
%idxprom88 = sext i32 %103 to i64
%arrayidx89 = getelementptr inbounds float, float* %102, i64 %idxprom88
%104 = load float, float* %arrayidx89, align 4
%105 = load float, float* %sum, align 4
%add90 = fadd contract float %105, %104
store float %add90, float* %sum, align 4
%106 = load float, float* %sum, align 4
%fneg = fneg float %106
%call91 = call float @_ZSt3expf(float %fneg)
%conv92 = fpext float %call91 to double
%add93 = fadd contract double 1.000000e+00, %conv92
%div94 = fdiv double 1.000000e+00, %add93
%conv95 = fptrunc double %div94 to float
%107 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %107, i32 0, i32 4
%108 = load float*, float** %hidden_units, align 8
%109 = load i32, i32* %j70, align 4
%idxprom96 = sext i32 %109 to i64
%arrayidx97 = getelementptr inbounds float, float* %108, i64 %idxprom96
store float %conv95, float* %arrayidx97, align 4
br label %for.inc98
for.inc98: ; preds = %for.end85
%110 = load i32, i32* %j70, align 4
%inc99 = add nsw i32 %110, 1
store i32 %inc99, i32* %j70, align 4
br label %for.cond71
for.end100: ; preds = %for.cond71
%111 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_units101 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %111, i32 0, i32 4
%112 = load float*, float** %hidden_units101, align 8
%113 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_units = getelementptr inbounds %struct.BPNN, %struct.BPNN* %113, i32 0, i32 5
%114 = load float*, float** %output_units, align 8
%115 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %115, i32 0, i32 10
%116 = load float**, float*** %hidden_weights, align 8
%117 = load i32, i32* %hid, align 4
%118 = load i32, i32* %out, align 4
call void @bpnn_layerforward(float* %112, float* %114, float** %116, i32 %117, i32 %118)
%119 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_delta = getelementptr inbounds %struct.BPNN, %struct.BPNN* %119, i32 0, i32 7
%120 = load float*, float** %output_delta, align 8
%121 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%target = getelementptr inbounds %struct.BPNN, %struct.BPNN* %121, i32 0, i32 8
%122 = load float*, float** %target, align 8
%123 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_units102 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %123, i32 0, i32 5
%124 = load float*, float** %output_units102, align 8
%125 = load i32, i32* %out, align 4
call void @bpnn_output_error(float* %120, float* %122, float* %124, i32 %125, float* %out_err)
%126 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_delta = getelementptr inbounds %struct.BPNN, %struct.BPNN* %126, i32 0, i32 6
%127 = load float*, float** %hidden_delta, align 8
%128 = load i32, i32* %hid, align 4
%129 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_delta103 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %129, i32 0, i32 7
%130 = load float*, float** %output_delta103, align 8
%131 = load i32, i32* %out, align 4
%132 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_weights104 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %132, i32 0, i32 10
%133 = load float**, float*** %hidden_weights104, align 8
%134 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_units105 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %134, i32 0, i32 4
%135 = load float*, float** %hidden_units105, align 8
call void @bpnn_hidden_error(float* %127, i32 %128, float* %130, i32 %131, float** %133, float* %135, float* %hid_err)
%136 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%output_delta106 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %136, i32 0, i32 7
%137 = load float*, float** %output_delta106, align 8
%138 = load i32, i32* %out, align 4
%139 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_units107 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %139, i32 0, i32 4
%140 = load float*, float** %hidden_units107, align 8
%141 = load i32, i32* %hid, align 4
%142 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_weights108 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %142, i32 0, i32 10
%143 = load float**, float*** %hidden_weights108, align 8
%144 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_prev_weights = getelementptr inbounds %struct.BPNN, %struct.BPNN* %144, i32 0, i32 12
%145 = load float**, float*** %hidden_prev_weights, align 8
call void @bpnn_adjust_weights(float* %137, i32 %138, float* %140, i32 %141, float** %143, float** %145)
%146 = bitcast float** %hidden_delta_cuda to i8**
%147 = load i32, i32* %hid, align 4
%add109 = add nsw i32 %147, 1
%conv110 = sext i32 %add109 to i64
%mul111 = mul i64 %conv110, 4
%call112 = call i32 @cudaMalloc(i8** %146, i64 %mul111)
%148 = bitcast float** %input_prev_weights_cuda to i8**
%149 = load i32, i32* %in, align 4
%add113 = add nsw i32 %149, 1
%150 = load i32, i32* %hid, align 4
%add114 = add nsw i32 %150, 1
%mul115 = mul nsw i32 %add113, %add114
%conv116 = sext i32 %mul115 to i64
%mul117 = mul i64 %conv116, 4
%call118 = call i32 @cudaMalloc(i8** %148, i64 %mul117)
%151 = load float*, float** %hidden_delta_cuda, align 8
%152 = bitcast float* %151 to i8*
%153 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%hidden_delta119 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %153, i32 0, i32 6
%154 = load float*, float** %hidden_delta119, align 8
%155 = bitcast float* %154 to i8*
%156 = load i32, i32* %hid, align 4
%add120 = add nsw i32 %156, 1
%conv121 = sext i32 %add120 to i64
%mul122 = mul i64 %conv121, 4
%call123 = call i32 @cudaMemcpy(i8* %152, i8* %155, i64 %mul122, i32 1)
%157 = load float*, float** %input_prev_weights_cuda, align 8
%158 = bitcast float* %157 to i8*
%159 = load float*, float** %input_weights_prev_one_dim, align 8
%160 = bitcast float* %159 to i8*
%161 = load i32, i32* %in, align 4
%add124 = add nsw i32 %161, 1
%162 = load i32, i32* %hid, align 4
%add125 = add nsw i32 %162, 1
%mul126 = mul nsw i32 %add124, %add125
%conv127 = sext i32 %mul126 to i64
%mul128 = mul i64 %conv127, 4
%call129 = call i32 @cudaMemcpy(i8* %158, i8* %160, i64 %mul128, i32 1)
%163 = load float*, float** %input_hidden_cuda, align 8
%164 = bitcast float* %163 to i8*
%165 = load float*, float** %input_weights_one_dim, align 8
%166 = bitcast float* %165 to i8*
%167 = load i32, i32* %in, align 4
%add130 = add nsw i32 %167, 1
%168 = load i32, i32* %hid, align 4
%add131 = add nsw i32 %168, 1
%mul132 = mul nsw i32 %add130, %add131
%conv133 = sext i32 %mul132 to i64
%mul134 = mul i64 %conv133, 4
%call135 = call i32 @cudaMemcpy(i8* %164, i8* %166, i64 %mul134, i32 1)
%169 = bitcast %struct.dim3* %agg.tmp136 to i8*
%170 = bitcast %struct.dim3* %grid to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %169, i8* align 4 %170, i64 12, i1 false)
%171 = bitcast %struct.dim3* %agg.tmp137 to i8*
%172 = bitcast %struct.dim3* %threads to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %171, i8* align 4 %172, i64 12, i1 false)
%173 = bitcast { i64, i32 }* %agg.tmp136.coerce to i8*
%174 = bitcast %struct.dim3* %agg.tmp136 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %173, i8* align 4 %174, i64 12, i1 false)
%175 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp136.coerce, i32 0, i32 0
%176 = load i64, i64* %175, align 4
%177 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp136.coerce, i32 0, i32 1
%178 = load i32, i32* %177, align 4
%179 = bitcast { i64, i32 }* %agg.tmp137.coerce to i8*
%180 = bitcast %struct.dim3* %agg.tmp137 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %179, i8* align 4 %180, i64 12, i1 false)
%181 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp137.coerce, i32 0, i32 0
%182 = load i64, i64* %181, align 4
%183 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %agg.tmp137.coerce, i32 0, i32 1
%184 = load i32, i32* %183, align 4
%call138 = call i32 @__cudaPushCallConfiguration(i64 %176, i32 %178, i64 %182, i32 %184, i64 0, i8* null)
%tobool139 = icmp ne i32 %call138, 0
br i1 %tobool139, label %kcall.end141, label %kcall.configok140
kcall.configok140: ; preds = %for.end100
%185 = load float*, float** %hidden_delta_cuda, align 8
%186 = load i32, i32* %hid, align 4
%187 = load float*, float** %input_cuda, align 8
%188 = load i32, i32* %in, align 4
%189 = load float*, float** %input_hidden_cuda, align 8
%190 = load float*, float** %input_prev_weights_cuda, align 8
call void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %185, i32 %186, float* %187, i32 %188, float* %189, float* %190)
br label %kcall.end141
kcall.end141: ; preds = %kcall.configok140, %for.end100
%191 = load %struct.BPNN*, %struct.BPNN** %net.addr, align 8
%input_units142 = getelementptr inbounds %struct.BPNN, %struct.BPNN* %191, i32 0, i32 3
%192 = load float*, float** %input_units142, align 8
%193 = bitcast float* %192 to i8*
%194 = load float*, float** %input_cuda, align 8
%195 = bitcast float* %194 to i8*
%196 = load i32, i32* %in, align 4
%add143 = add nsw i32 %196, 1
%conv144 = sext i32 %add143 to i64
%mul145 = mul i64 %conv144, 4
%call146 = call i32 @cudaMemcpy(i8* %193, i8* %195, i64 %mul145, i32 2)
%197 = load float*, float** %input_weights_one_dim, align 8
%198 = bitcast float* %197 to i8*
%199 = load float*, float** %input_hidden_cuda, align 8
%200 = bitcast float* %199 to i8*
%201 = load i32, i32* %in, align 4
%add147 = add nsw i32 %201, 1
%202 = load i32, i32* %hid, align 4
%add148 = add nsw i32 %202, 1
%mul149 = mul nsw i32 %add147, %add148
%conv150 = sext i32 %mul149 to i64
%mul151 = mul i64 %conv150, 4
%call152 = call i32 @cudaMemcpy(i8* %198, i8* %200, i64 %mul151, i32 2)
store i32 0, i32* %i, align 4
br label %for.cond153
for.cond153: ; preds = %for.inc163, %kcall.end141
%203 = load i32, i32* %i, align 4
%204 = load i32, i32* %in, align 4
%add154 = add nsw i32 %204, 1
%205 = load i32, i32* %hid, align 4
%add155 = add nsw i32 %205, 1
%mul156 = mul nsw i32 %add154, %add155
%cmp157 = icmp slt i32 %203, %mul156
br i1 %cmp157, label %for.body158, label %for.end165
for.body158: ; preds = %for.cond153
%206 = load float*, float** %input_weights_one_dim, align 8
%207 = load i32, i32* %i, align 4
%idxprom159 = sext i32 %207 to i64
%arrayidx160 = getelementptr inbounds float, float* %206, i64 %idxprom159
%208 = load float, float* %arrayidx160, align 4
%conv161 = fpext float %208 to double
%call162 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.2, i64 0, i64 0), double %conv161)
br label %for.inc163
for.inc163: ; preds = %for.body158
%209 = load i32, i32* %i, align 4
%inc164 = add nsw i32 %209, 1
store i32 %inc164, i32* %i, align 4
br label %for.cond153
for.end165: ; preds = %for.cond153
%call166 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i64 0, i64 0))
%210 = load float*, float** %input_cuda, align 8
%211 = bitcast float* %210 to i8*
%call167 = call i32 @cudaFree(i8* %211)
%212 = load float*, float** %output_hidden_cuda, align 8
%213 = bitcast float* %212 to i8*
%call168 = call i32 @cudaFree(i8* %213)
%214 = load float*, float** %input_hidden_cuda, align 8
%215 = bitcast float* %214 to i8*
%call169 = call i32 @cudaFree(i8* %215)
%216 = load float*, float** %hidden_partial_sum, align 8
%217 = bitcast float* %216 to i8*
%call170 = call i32 @cudaFree(i8* %217)
%218 = load float*, float** %input_prev_weights_cuda, align 8
%219 = bitcast float* %218 to i8*
%call171 = call i32 @cudaFree(i8* %219)
%220 = load float*, float** %hidden_delta_cuda, align 8
%221 = bitcast float* %220 to i8*
%call172 = call i32 @cudaFree(i8* %221)
%222 = load float*, float** %partial_sum, align 8
%223 = bitcast float* %222 to i8*
call void @free(i8* %223) #7
%224 = load float*, float** %input_weights_one_dim, align 8
%225 = bitcast float* %224 to i8*
call void @free(i8* %225) #7
%226 = load float*, float** %input_weights_prev_one_dim, align 8
%227 = bitcast float* %226 to i8*
call void @free(i8* %227) #7
ret void
}
; Function Attrs: noinline nounwind optnone uwtable
; dim3::dim3(unsigned vx, unsigned vy, unsigned vz) — inline constructor for the
; CUDA dim3 struct ({i32 x, i32 y, i32 z}). Copies the three i32 arguments into
; the fields at indices 0/1/2 of *this. Emitted at -O0 (optnone), so each
; argument is first spilled to a stack slot and reloaded before its store.
; linkonce_odr/comdat: one copy is kept at link time across translation units.
define linkonce_odr dso_local void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr #2 comdat align 2 {
entry:
  ; -O0 spill slots for `this` and the three value parameters
  %this.addr = alloca %struct.dim3*, align 8
  %vx.addr = alloca i32, align 4
  %vy.addr = alloca i32, align 4
  %vz.addr = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
  store i32 %vx, i32* %vx.addr, align 4
  store i32 %vy, i32* %vy.addr, align 4
  store i32 %vz, i32* %vz.addr, align 4
  %this1 = load %struct.dim3*, %struct.dim3** %this.addr, align 8
  ; this->x = vx  (field index 0)
  %x = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 0
  %0 = load i32, i32* %vx.addr, align 4
  store i32 %0, i32* %x, align 4
  ; this->y = vy  (field index 1)
  %y = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 1
  %1 = load i32, i32* %vy.addr, align 4
  store i32 %1, i32* %y, align 4
  ; this->z = vz  (field index 2)
  %z = getelementptr inbounds %struct.dim3, %struct.dim3* %this1, i32 0, i32 2
  %2 = load i32, i32* %vz.addr, align 4
  store i32 %2, i32* %z, align 4
  ret void
}
; Function Attrs: nounwind
declare dso_local noalias i8* @malloc(i64) #3
declare dso_local i32 @cudaMalloc(i8**, i64) #5
declare dso_local i32 @printf(i8*, ...) #5
declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #5
declare dso_local i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) #5
declare dso_local i32 @cudaThreadSynchronize() #5
declare dso_local i32 @cudaGetLastError() #5
declare dso_local i8* @cudaGetErrorString(i32) #5
; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #6
; Function Attrs: noinline nounwind optnone uwtable
; std::exp(float) — libstdc++ inline overload from <cmath>; simply forwards to
; the C library's expf(). The alloca/load pair is the usual -O0 argument spill.
define linkonce_odr dso_local float @_ZSt3expf(float %__x) #2 comdat {
entry:
  %__x.addr = alloca float, align 4
  store float %__x, float* %__x.addr, align 4
  %0 = load float, float* %__x.addr, align 4
  ; tail-position forward to C expf (declared nounwind below)
  %call = call float @expf(float %0) #7
  ret float %call
}
declare dso_local void @bpnn_layerforward(float*, float*, float**, i32, i32) #5
declare dso_local void @bpnn_output_error(float*, float*, float*, i32, float*) #5
declare dso_local void @bpnn_hidden_error(float*, i32, float*, i32, float**, float*, float*) #5
declare dso_local void @bpnn_adjust_weights(float*, i32, float*, i32, float**, float**) #5
declare dso_local i32 @cudaFree(i8*) #5
; Function Attrs: nounwind
declare dso_local void @free(i8*) #3
; Function Attrs: nounwind
declare dso_local float @expf(float) #3
; Registers this module's two kernel host stubs with the CUDA runtime so that a
; <<<...>>> launch of the stub can be mapped to the device code in the fatbinary.
; %0 is the fatbinary handle returned by __cudaRegisterFatBinary (passed in from
; __cuda_module_ctor). For each kernel the stub's host address is paired with its
; mangled name string (@0 / @1); the -1 and null arguments are the unused
; thread-limit/size bookkeeping slots of the registration ABI.
define internal void @__cuda_register_globals(i8** %0) {
entry:
  ; kernel 1: bpnn_layerforward_CUDA(float*, float*, float*, float*, int, int)
  %1 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii to i8*), i8* getelementptr inbounds ([37 x i8], [37 x i8]* @0, i64 0, i64 0), i8* getelementptr inbounds ([37 x i8], [37 x i8]* @0, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ; kernel 2: bpnn_adjust_weights_cuda(float*, int, float*, int, float*, float*)
  %2 = call i32 @__cudaRegisterFunction(i8** %0, i8* bitcast (void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_ to i8*), i8* getelementptr inbounds ([39 x i8], [39 x i8]* @1, i64 0, i64 0), i8* getelementptr inbounds ([39 x i8], [39 x i8]* @1, i64 0, i64 0), i32 -1, i8* null, i8* null, i8* null, i8* null, i32* null)
  ret void
}
declare dso_local i32 @__cudaRegisterFunction(i8**, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i32*)
declare dso_local i32 @__cudaRegisterVar(i8**, i8*, i8*, i8*, i32, i32, i32, i32)
declare dso_local i8** @__cudaRegisterFatBinary(i8*)
; Module constructor (run before main): registers the embedded fatbinary with
; the CUDA runtime, saves the returned handle in @__cuda_gpubin_handle for the
; destructor, registers the kernel stubs, finalizes registration, and schedules
; __cuda_module_dtor via atexit so the fatbinary is unregistered at exit.
; The i8* %0 argument is unused (standard clang ctor signature).
define internal void @__cuda_module_ctor(i8* %0) {
entry:
  ; hand the wrapped fatbinary image (@__cuda_fatbin_wrapper) to the runtime
  %1 = call i8** @__cudaRegisterFatBinary(i8* bitcast ({ i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper to i8*))
  store i8** %1, i8*** @__cuda_gpubin_handle, align 8
  call void @__cuda_register_globals(i8** %1)
  call void @__cudaRegisterFatBinaryEnd(i8** %1)
  ; ensure teardown runs at process exit
  %2 = call i32 @atexit(void (i8*)* @__cuda_module_dtor)
  ret void
}
declare dso_local void @__cudaRegisterFatBinaryEnd(i8**)
declare dso_local void @__cudaUnregisterFatBinary(i8**)
; Module destructor (registered via atexit in __cuda_module_ctor): reloads the
; fatbinary handle stored by the constructor and unregisters it from the CUDA
; runtime. The i8* %0 argument is unused (standard clang dtor signature).
define internal void @__cuda_module_dtor(i8* %0) {
entry:
  %1 = load i8**, i8*** @__cuda_gpubin_handle, align 8
  call void @__cudaUnregisterFatBinary(i8** %1)
  ret void
}
declare dso_local i32 @atexit(void (i8*)*)
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nounwind }
attributes #8 = { noreturn nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}