; ModuleID = 'lud-host-x86_64-unknown-linux-gnu.bc'
source_filename = "cuda/lud.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.option = type { i8*, i32, i32*, i32 }
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
%struct.__stopwatch_t = type { %struct.timeval, %struct.timeval }
%struct.timeval = type { i64, i64 }

@.str = private unnamed_addr constant [29 x i8] c"WG size of kernel = %d X %d\0A\00", align 1
@.str.1 = private unnamed_addr constant [8 x i8] c"::vs:i:\00", align 1
@_ZL12long_options = internal global [4 x %struct.option] [%struct.option { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.15, i32 0, i32 0), i32 1, i32* null, i32 105 }, %struct.option { i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.16, i32 0, i32 0), i32 1, i32* null, i32 115 }, %struct.option { i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.17, i32 0, i32 0), i32 0, i32* null, i32 118 }, %struct.option zeroinitializer], align 16
@optarg = external dso_local global i8*, align 8
@_ZL9do_verify = internal global i32 0, align 4
@.str.2 = private unnamed_addr constant [44 x i8] c"Generate input matrix internally, size =%d\0A\00", align 1
@stderr = external dso_local global %struct._IO_FILE*, align 8
@.str.3 = private unnamed_addr constant [16 x i8] c"invalid option\0A\00", align 1
@.str.4 = private unnamed_addr constant [18 x i8] c"missing argument\0A\00", align 1
@.str.5 = private unnamed_addr constant [47 x i8] c"Usage: %s [-v] [-s matrix_size|-i input_file]\0A\00", align 1
@optind = external dso_local global i32, align 4
@.str.6 = private unnamed_addr constant [29 x i8] c"Reading matrix from file %s\0A\00", align 1
@.str.7 = private unnamed_addr constant [34 x i8] c"error create matrix from file %s\0A\00", align 1
@.str.8 = private unnamed_addr constant [36 x i8] c"Creating matrix internally size=%d\0A\00", align 1
@.str.9 = private unnamed_addr constant [40 x i8] c"error create matrix internally size=%d\0A\00", align 1
@.str.10 = private unnamed_addr constant [26 x i8] c"No input file specified!\0A\00", align 1
@.str.11 = private unnamed_addr constant [12 x i8] c"Before LUD\0A\00", align 1
@.str.12 = private unnamed_addr constant [24 x i8] c"Time consumed(ms): %lf\0A\00", align 1
@.str.13 = private unnamed_addr constant [11 x i8] c"After LUD\0A\00", align 1
@.str.14 = private unnamed_addr constant [15 x i8] c">>>Verify<<<<\0A\00", align 1
@.str.15 = private unnamed_addr constant [6 x i8] c"input\00", align 1
@.str.16 = private unnamed_addr constant [5 x i8] c"size\00", align 1
@.str.17 = private unnamed_addr constant [7 x i8] c"verify\00", align 1

; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #0 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %matrix_dim = alloca i32, align 4
  %opt = alloca i32, align 4
  %option_index = alloca i32, align 4
  %ret = alloca i32, align 4
  %input_file = alloca i8*, align 8
  %m = alloca float*, align 8
  %d_m = alloca float*, align 8
  %mm = alloca float*, align 8
  %sw = alloca %struct.__stopwatch_t, align 8
  store i32 0, i32* %retval, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  %call = call i32 @cudaSetDevice(i32 0)
  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i64 0, i64 0), i32 16, i32 16)
  store i32 32, i32* %matrix_dim, align 4
  store i32 0, i32* %option_index, align 4
  store i8* null, i8** %input_file, align 8
  br label %while.cond

while.cond:                                       ; preds = %sw.epilog, %entry
  %0 = load i32, i32* %argc.addr, align 4
  %1 = load i8**, i8*** %argv.addr, align 8
  %call2 = call i32 @getopt_long(i32 %0, i8** %1, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str.1, i64 0, i64 0), %struct.option* getelementptr inbounds ([4 x %struct.option], [4 x %struct.option]* @_ZL12long_options, i64 0, i64 0), i32* %option_index) #5
  store i32 %call2, i32* %opt, align 4
  %cmp = icmp ne i32 %call2, -1
  br i1 %cmp, label %while.body, label %while.end

while.body:                                       ; preds = %while.cond
  %2 = load i32, i32* %opt, align 4
  switch i32 %2, label %sw.default [
    i32 105, label %sw.bb
    i32 118, label %sw.bb3
    i32 115, label %sw.bb4
    i32 63, label %sw.bb7
    i32 58, label %sw.bb9
  ]

sw.bb:                                            ; preds = %while.body
  %3 = load i8*, i8** @optarg, align 8
  store i8* %3, i8** %input_file, align 8
  br label %sw.epilog

sw.bb3:                                           ; preds = %while.body
  store i32 1, i32* @_ZL9do_verify, align 4
  br label %sw.epilog

sw.bb4:                                           ; preds = %while.body
  %4 = load i8*, i8** @optarg, align 8
  %call5 = call i32 @atoi(i8* %4) #6
  store i32 %call5, i32* %matrix_dim, align 4
  %5 = load i32, i32* %matrix_dim, align 4
  %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.2, i64 0, i64 0), i32 %5)
  br label %sw.epilog

sw.bb7:                                           ; preds = %while.body
  %6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %call8 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0))
  br label %sw.epilog

sw.bb9:                                           ; preds = %while.body
  %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %call10 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0))
  br label %sw.epilog

sw.default:                                       ; preds = %while.body
  %8 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %9 = load i8**, i8*** %argv.addr, align 8
  %arrayidx = getelementptr inbounds i8*, i8** %9, i64 0
  %10 = load i8*, i8** %arrayidx, align 8
  %call11 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %8, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.5, i64 0, i64 0), i8* %10)
  call void @exit(i32 1) #7
  unreachable

sw.epilog:                                        ; preds = %sw.bb9, %sw.bb7, %sw.bb4, %sw.bb3, %sw.bb
  br label %while.cond

while.end:                                        ; preds = %while.cond
  %11 = load i32, i32* @optind, align 4
  %12 = load i32, i32* %argc.addr, align 4
  %cmp12 = icmp slt i32 %11, %12
  br i1 %cmp12, label %if.then, label %lor.lhs.false

lor.lhs.false:                                    ; preds = %while.end
  %13 = load i32, i32* @optind, align 4
  %cmp13 = icmp eq i32 %13, 1
  br i1 %cmp13, label %if.then, label %if.end

if.then:                                          ; preds = %lor.lhs.false, %while.end
  %14 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %15 = load i8**, i8*** %argv.addr, align 8
  %arrayidx14 = getelementptr inbounds i8*, i8** %15, i64 0
  %16 = load i8*, i8** %arrayidx14, align 8
  %call15 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %14, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.5, i64 0, i64 0), i8* %16)
  call void @exit(i32 1) #7
  unreachable

if.end:                                           ; preds = %lor.lhs.false
  %17 = load i8*, i8** %input_file, align 8
  %tobool = icmp ne i8* %17, null
  br i1 %tobool, label %if.then16, label %if.else

if.then16:                                        ; preds = %if.end
  %18 = load i8*, i8** %input_file, align 8
  %call17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.6, i64 0, i64 0), i8* %18)
  %19 = load i8*, i8** %input_file, align 8
  %call18 = call i32 @create_matrix_from_file(float** %m, i8* %19, i32* %matrix_dim)
  store i32 %call18, i32* %ret, align 4
  %20 = load i32, i32* %ret, align 4
  %cmp19 = icmp ne i32 %20, 0
  br i1 %cmp19, label %if.then20, label %if.end22

if.then20:                                        ; preds = %if.then16
  store float* null, float** %m, align 8
  %21 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %22 = load i8*, i8** %input_file, align 8
  %call21 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %21, i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.7, i64 0, i64 0), i8* %22)
  call void @exit(i32 1) #7
  unreachable

if.end22:                                         ; preds = %if.then16
  br label %if.end34

if.else:                                          ; preds = %if.end
  %23 = load i32, i32* %matrix_dim, align 4
  %tobool23 = icmp ne i32 %23, 0
  br i1 %tobool23, label %if.then24, label %if.else31

if.then24:                                        ; preds = %if.else
  %24 = load i32, i32* %matrix_dim, align 4
  %call25 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.8, i64 0, i64 0), i32 %24)
  %25 = load i32, i32* %matrix_dim, align 4
  %call26 = call i32 @create_matrix(float** %m, i32 %25)
  store i32 %call26, i32* %ret, align 4
  %26 = load i32, i32* %ret, align 4
  %cmp27 = icmp ne i32 %26, 0
  br i1 %cmp27, label %if.then28, label %if.end30

if.then28:                                        ; preds = %if.then24
  store float* null, float** %m, align 8
  %27 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
  %28 = load i32, i32* %matrix_dim, align 4
  %call29 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %27, i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.9, i64 0, i64 0), i32 %28)
  call void @exit(i32 1) #7
  unreachable

if.end30:                                         ; preds = %if.then24
  br label %if.end33

if.else31:                                        ; preds = %if.else
  %call32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.10, i64 0, i64 0))
  call void @exit(i32 1) #7
  unreachable

if.end33:                                         ; preds = %if.end30
  br label %if.end34

if.end34:                                         ; preds = %if.end33, %if.end22
  %29 = load i32, i32* @_ZL9do_verify, align 4
  %tobool35 = icmp ne i32 %29, 0
  br i1 %tobool35, label %if.then36, label %if.end38

if.then36:                                        ; preds = %if.end34
  %call37 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.11, i64 0, i64 0))
  %30 = load float*, float** %m, align 8
  %31 = load i32, i32* %matrix_dim, align 4
  call void @matrix_duplicate(float* %30, float** %mm, i32 %31)
  br label %if.end38

if.end38:                                         ; preds = %if.then36, %if.end34
  %32 = bitcast float** %d_m to i8**
  %33 = load i32, i32* %matrix_dim, align 4
  %34 = load i32, i32* %matrix_dim, align 4
  %mul = mul nsw i32 %33, %34
  %conv = sext i32 %mul to i64
  %mul39 = mul i64 %conv, 4
  %call40 = call i32 @cudaMalloc(i8** %32, i64 %mul39)
  call void @stopwatch_start(%struct.__stopwatch_t* %sw)
  %35 = load float*, float** %d_m, align 8
  %36 = bitcast float* %35 to i8*
  %37 = load float*, float** %m, align 8
  %38 = bitcast float* %37 to i8*
  %39 = load i32, i32* %matrix_dim, align 4
  %40 = load i32, i32* %matrix_dim, align 4
  %mul41 = mul nsw i32 %39, %40
  %conv42 = sext i32 %mul41 to i64
  %mul43 = mul i64 %conv42, 4
  %call44 = call i32 @cudaMemcpy(i8* %36, i8* %38, i64 %mul43, i32 1)
  %41 = load float*, float** %d_m, align 8
  %42 = load i32, i32* %matrix_dim, align 4
  call void @_Z8lud_cudaPfi(float* %41, i32 %42)
  %43 = load float*, float** %m, align 8
  %44 = bitcast float* %43 to i8*
  %45 = load float*, float** %d_m, align 8
  %46 = bitcast float* %45 to i8*
  %47 = load i32, i32* %matrix_dim, align 4
  %48 = load i32, i32* %matrix_dim, align 4
  %mul45 = mul nsw i32 %47, %48
  %conv46 = sext i32 %mul45 to i64
  %mul47 = mul i64 %conv46, 4
  %call48 = call i32 @cudaMemcpy(i8* %44, i8* %46, i64 %mul47, i32 2)
  call void @stopwatch_stop(%struct.__stopwatch_t* %sw)
  %call49 = call double @get_interval_by_sec(%struct.__stopwatch_t* %sw)
  %mul50 = fmul contract double 1.000000e+03, %call49
  %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.12, i64 0, i64 0), double %mul50)
  %49 = load float*, float** %d_m, align 8
  %50 = bitcast float* %49 to i8*
  %call52 = call i32 @cudaFree(i8* %50)
  %51 = load i32, i32* @_ZL9do_verify, align 4
  %tobool53 = icmp ne i32 %51, 0
  br i1 %tobool53, label %if.then54, label %if.end58

if.then54:                                        ; preds = %if.end38
  %call55 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.13, i64 0, i64 0))
  %call56 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.14, i64 0, i64 0))
  %52 = load float*, float** %mm, align 8
  %53 = load float*, float** %m, align 8
  %54 = load i32, i32* %matrix_dim, align 4
  %call57 = call i32 @lud_verify(float* %52, float* %53, i32 %54)
  %55 = load float*, float** %mm, align 8
  %56 = bitcast float* %55 to i8*
  call void @free(i8* %56) #5
  br label %if.end58

if.end58:                                         ; preds = %if.then54, %if.end38
  %57 = load float*, float** %m, align 8
  %58 = bitcast float* %57 to i8*
  call void @free(i8* %58) #5
  ret i32 0
}

declare dso_local i32 @cudaSetDevice(i32) #1

declare dso_local i32 @printf(i8*, ...) #1

; Function Attrs: nounwind
declare dso_local i32 @getopt_long(i32, i8**, i8*, %struct.option*, i32*) #2

; Function Attrs: nounwind readonly
declare dso_local i32 @atoi(i8*) #3

declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1

; Function Attrs: noreturn nounwind
declare dso_local void @exit(i32) #4

declare dso_local i32 @create_matrix_from_file(float**, i8*, i32*) #1

declare dso_local i32 @create_matrix(float**, i32) #1

declare dso_local void @matrix_duplicate(float*, float**, i32) #1

declare dso_local i32 @cudaMalloc(i8**, i64) #1

declare dso_local void @stopwatch_start(%struct.__stopwatch_t*) #1

declare dso_local i32 @cudaMemcpy(i8*, i8*, i64, i32) #1

declare dso_local void @_Z8lud_cudaPfi(float*, i32) #1

declare dso_local void @stopwatch_stop(%struct.__stopwatch_t*) #1

declare dso_local double @get_interval_by_sec(%struct.__stopwatch_t*) #1

declare dso_local i32 @cudaFree(i8*) #1

declare dso_local i32 @lud_verify(float*, float*, i32) #1

; Function Attrs: nounwind
declare dso_local void @free(i8*) #2

attributes #0 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { nounwind }
attributes #6 = { nounwind readonly }
attributes #7 = { noreturn nounwind }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}