#include "generate_cpu_format.h"
#include "handle_sync.h"
#include "init.h"
#include "insert_sync.h"
#include "insert_warp_loop.h"
#include "performance.h"
#include "tool.h"
#include "warp_func.h"

#include <fstream>
#include <iostream>
#include <string>

using namespace llvm;

// To support constant memory variables, information has to be passed from
// the kernel translator to the host translator. The host translator knows
// nothing about the kernel functions, so the kernel translator writes the
// information to this file and the host translator reads it back.
std::string PATH = "kernel_meta.log";

/// Entry point of the kernel translator.
///
/// Expects exactly two command-line arguments:
///   argv[1] - path of the input kernel module
///   argv[2] - path the transformed module is dumped to
///
/// Side effect: (re)creates the file named by PATH, which init_block() is
/// handed as its output stream.
///
/// @return 0 on success, 1 when the arguments are wrong, the input module
///         could not be obtained, or the metadata file could not be opened.
int main(int argc, char **argv) {
  // Validate at runtime rather than with assert(): assert is compiled out
  // under NDEBUG, which would leave argv[1] / argv[2] read out of bounds.
  if (argc != 3) {
    std::cerr << "incorrect number of arguments\n"
              << "usage: " << (argc > 0 ? argv[0] : "<program>")
              << " <input-module> <output-module>\n";
    return 1;
  }

  llvm::Module *program = LoadModuleFromFilr(argv[1]);
  // NOTE(review): LoadModuleFromFilr is defined elsewhere and may already
  // terminate on failure; this guard only prevents passing a null module on.
  if (program == nullptr) {
    std::cerr << "failed to load module from '" << argv[1] << "'\n";
    return 1;
  }

  std::ofstream fout;
  fout.open(PATH);
  if (!fout.is_open()) {
    std::cerr << "cannot open '" << PATH << "' for writing\n";
    return 1;
  }

  // inline __device__ functions, and create auxiliary global variables
  init_block(program, fout);

  // insert sync before each vote, and replace the original vote function
  // with the warp vote
  handle_warp_vote(program);

  // replace warp shuffle
  handle_warp_shfl(program);

  // insert sync
  insert_sync(program);

  // split block by sync
  split_block_by_sync(program);

  // add the intra-/inter-warp loops; this corresponds to 'hierarchical
  // collapsing' in the COX paper
  insert_warp_loop(program);

  replace_built_in_function(program);

  // the input kernel programs carry NVIDIA metadata, which needs to be
  // replaced with CPU metadata
  generate_cpu_format(program);

  // execute the O3 pipeline on the transformed program
  performance_optimization(program);

  VerifyModule(program);

  DumpModule(program, argv[2]);

  fout.close();
  return 0;
}