diff --git a/jitify.hpp b/jitify.hpp index bc8fa14..ed06247 100644 --- a/jitify.hpp +++ b/jitify.hpp @@ -94,6 +94,10 @@ #include #include #include +#if CUDA_VERSION >= 9000 +#include +#endif + #if __cplusplus >= 201103L #define JITIFY_UNIQUE_PTR std::unique_ptr #define JITIFY_DEFINE_AUTO_PTR_COPY_WAR(cls) @@ -179,7 +183,7 @@ class ObjectCache { object_map _objects; key_rank _ranked_keys; size_t _capacity; - + inline void discard_old(size_t n=0) { if( n > _capacity ) { throw std::runtime_error("Insufficient capacity in cache"); @@ -250,6 +254,33 @@ class vector : public std::vector { #endif }; +//Helper function to cache sm count used by coop_groups + int get_device_SMs(int device_id){ + static bool init=false; + static vector SMs; + if(!init){ + int nDevices; + cudaGetDeviceCount(&nDevices); + SMs.resize(nDevices); + for (int i = 0; i < nDevices; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + SMs[i] = prop.multiProcessorCount; + } + } + init = true; + if(device_id >= (int) SMs.size()) + throw std::runtime_error(std::string("Error: Tried to query device of ordinal > number of devices.\n")); + return SMs[device_id]; + } + + int get_cur_dev_SMs(){ + int dev; + cudaGetDevice(&dev); + return get_device_SMs(dev); + } + + // Helper functions for parsing/manipulating source code std::string replace_characters(std::string str, std::string const& oldchars, char newchar) { @@ -374,14 +405,14 @@ inline bool extract_include_info_from_compile_error(std::string log, beg += pattern.size(); size_t end = log.find("\"", beg); name = log.substr(beg, end-beg); - + size_t line_beg = log.rfind("\n", beg); if( line_beg == std::string::npos ) { line_beg = 0; } else { line_beg += 1; } - + size_t split = log.find("(", line_beg); parent = log.substr(line_beg, split-line_beg); line_num = atoi(log.substr(split+1, @@ -513,11 +544,11 @@ inline bool load_source(std::string filename, bool remove_next_blank_line = false; while( std::getline(*source_stream, line) ) { ++linenum; - + // HACK WAR for static variables not allowed on the device (unless __shared__) // TODO: This breaks static member variables //line = replace_token(line, "static const", "/*static*/ const"); - + // TODO: Need to watch out for /* */ comments too std::string cleanline = line.substr(0, line.find("//")); // Strip line comments //if( cleanline.back() == "\r" ) { // Remove Windows line ending @@ -538,7 +569,7 @@ inline bool load_source(std::string filename, //line = "//" + line; // Comment out the #pragma once line continue; } - + // HACK WAR for Thrust using "#define FOO #pragma bar" size_t pragma_beg = cleanline.find("#pragma "); if( pragma_beg != std::string::npos ) { @@ -551,13 +582,14 @@ inline bool load_source(std::string filename, line += " " + pragma_split[2]; } } - + source += line + "\n"; } // HACK TESTING (WAR for cub) + //source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + source; - + // WAR for #pragma once causing problems when there are multiple inclusions // of the same header from different paths. if( pragma_once ) { @@ -789,7 +821,7 @@ class CUDAKernel { std::string _func_name; std::string _ptx; std::vector _opts; - + inline void cuda_safe_call(CUresult res) { if( res != CUDA_SUCCESS ) { const char* msg; @@ -892,7 +924,7 @@ class CUDAKernel { this->destroy_module(); } inline operator CUfunction() const { return _kernel; } - + inline CUresult launch(dim3 grid, dim3 block, unsigned int smem, CUstream stream, std::vector arg_ptrs) { @@ -902,6 +934,42 @@ class CUDAKernel { smem, stream, &arg_ptrs[0], NULL); } +#if CUDA_VERSION >= 9000 + inline CUresult coop_max_grid_size(int &max_blocks, dim3 block, size_t smem) { + + int SMs = get_cur_dev_SMs(); + int blocksPerSm; + + CUresult occ_result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, _kernel, block.x*block.y*block.z, 0); + max_blocks = blocksPerSm * SMs; + return occ_result; + } + + inline CUresult coop_launch(dim3 grid, dim3 block, + unsigned int smem, CUstream stream, + std::vector arg_ptrs) { + + int can_coop; + int dev; + cudaGetDevice(&dev); + cuDeviceGetAttribute(&can_coop, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev); + if(!can_coop) + throw std::runtime_error(std::string("Error: Cannot use cooperative groups on this device.\n")); + + int SMs = get_cur_dev_SMs(); + int blocksPerSm; + CUresult occ_result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, _kernel, block.x*block.y*block.z, 0); + if (occ_result != CUDA_SUCCESS) return occ_result; + + if(blocksPerSm*SMs < (int) grid.x*grid.y*grid.z) + throw std::runtime_error(std::string("Error: Kernel cannot be launched with given grid size. Maximum number of blocks for this kernel is ") + + std::to_string(blocksPerSm*SMs)); + + return cuLaunchCooperativeKernel( _kernel, grid.x, grid.y, grid.z, + block.x, block.y, block.z, + smem, stream, &arg_ptrs[0]); + } +#endif }; const char* jitsafe_header_preinclude_h = R"( @@ -1544,7 +1612,7 @@ inline void split_compiler_and_linker_options( std::vector* compiler_options, std::vector* linker_files, std::vector* linker_paths) { - + for( int i=0; i<(int)options.size(); ++i ) { std::string opt = options[i]; std::string flag = opt.substr(0, 2); @@ -1584,14 +1652,14 @@ nvrtcResult compile_kernel(std::string program_name, header_names_c.push_back(name.c_str()); header_sources_c.push_back(code.c_str()); } - + std::vector options_c(options.size()+2); options_c[0] = "--device-as-default-execution-space"; options_c[1] = "--pre-include=jitify_preinclude.h"; for( int i=0; i<(int)options.size(); ++i ) { options_c[i+2] = options[i].c_str(); } - + #if CUDA_VERSION < 8000 std::string inst_dummy; if( !instantiation.empty() ) { @@ -1601,14 +1669,14 @@ nvrtcResult compile_kernel(std::string program_name, program_source += "\nvoid* "+ inst_dummy + " = (void*)" + instantiation + ";\n"; } #endif - + #define CHECK_NVRTC(call) do { \ nvrtcResult ret = call; \ if( ret != NVRTC_SUCCESS ) { \ return ret; \ } \ } while(0) - + nvrtcProgram nvrtc_program; CHECK_NVRTC( nvrtcCreateProgram(&nvrtc_program, program_source.c_str(), @@ -1616,14 +1684,14 @@ nvrtcResult compile_kernel(std::string program_name, num_headers, &header_sources_c[0], &header_names_c[0]) ); - + #if CUDA_VERSION >= 8000 if( !instantiation.empty() ) { CHECK_NVRTC( nvrtcAddNameExpression(nvrtc_program, instantiation.c_str()) ); } #endif - + nvrtcResult ret = nvrtcCompileProgram(nvrtc_program, options_c.size(), &options_c[0]); if( log ) { @@ -1636,7 +1704,7 @@ nvrtcResult compile_kernel(std::string program_name, return ret; } } - + if( ptx ) { size_t ptxsize; CHECK_NVRTC( nvrtcGetPTXSize(nvrtc_program, &ptxsize) ); @@ -1644,7 +1712,7 @@ nvrtcResult compile_kernel(std::string program_name, CHECK_NVRTC( nvrtcGetPTX(nvrtc_program, &vptx[0]) ); ptx->assign(&vptx[0], ptxsize); } - + if( !instantiation.empty() && mangled_instantiation ) { #if CUDA_VERSION >= 8000 const char* mangled_instantiation_cstr; @@ -1662,7 +1730,7 @@ nvrtcResult compile_kernel(std::string program_name, *mangled_instantiation = ptx->substr(mi_beg, mi_end-mi_beg); #endif } - + CHECK_NVRTC( nvrtcDestroyProgram(&nvrtc_program) ); #undef CHECK_NVRTC return NVRTC_SUCCESS; @@ -1697,9 +1765,9 @@ class JitCache_impl { inline JitCache_impl(size_t cache_size) : _kernel_cache(cache_size), _program_config_cache(cache_size) { - + detail::add_options_from_env(_options); - + // Bootstrap the cuda context to avoid errors cudaFree(0); } @@ -1777,6 +1845,9 @@ class KernelInstantiation_impl { inline KernelInstantiation_impl(KernelInstantiation_impl const&) = default; inline KernelInstantiation_impl(KernelInstantiation_impl&&) = default; #endif +#if CUDA_VERSION >= 9000 + inline CUresult coop_max_grid_size(int &max_block_size, dim3 block, size_t smem) const; +#endif }; class KernelLauncher_impl { @@ -1798,6 +1869,10 @@ class KernelLauncher_impl { #endif inline CUresult launch(jitify::detail::vector arg_ptrs, jitify::detail::vector arg_types=0) const; +#if CUDA_VERSION >= 9000 + inline CUresult coop_launch(jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types=0) const; +#endif }; /*! An object representing a configured and instantiated kernel ready @@ -1825,6 +1900,12 @@ class KernelLauncher { jitify::detail::vector arg_types=0) const { return _impl->launch(arg_ptrs, arg_types); } +#if CUDA_VERSION >= 9000 + inline CUresult coop_launch(std::vector arg_ptrs=std::vector(), + jitify::detail::vector arg_types=0) const { + return _impl->coop_launch(arg_ptrs, arg_types); + } +#endif #if __cplusplus >= 201103L // Regular function call syntax /*! Launch the kernel. @@ -1844,6 +1925,13 @@ class KernelLauncher { return this->launch(std::vector({(void*)&args...}), {reflection::reflect()...}); } +#if CUDA_VERSION >= 9000 + template + inline CUresult coop_launch(ArgTypes... args) const { + return this->coop_launch(std::vector({(void*)&args...}), + {reflection::reflect()...}); + } +#endif //cuda_version #endif }; @@ -1876,6 +1964,11 @@ class KernelInstantiation { size_t smem=0, cudaStream_t stream=0) const { return KernelLauncher(*this, grid, block, smem, stream); } +#if CUDA_VERSION >= 9000 + inline CUresult coop_max_grid_size(int &max_block_size, dim3 block, size_t smem) const{ + return _impl->coop_max_grid_size(max_block_size, block, smem); + } +#endif }; /*! An object representing a kernel made up of a Program, a name and options. @@ -1985,7 +2078,7 @@ class JitCache { enum { DEFAULT_CACHE_SIZE = 128 }; JitCache(size_t cache_size=DEFAULT_CACHE_SIZE) : _impl(new JitCache_impl(cache_size)) {} - + /*! Create a program. * * \param source A string containing either the source filename or @@ -2069,7 +2162,7 @@ std::ostream& operator<<(std::ostream& stream, dim3 d) { inline CUresult KernelLauncher_impl::launch(jitify::detail::vector arg_ptrs, jitify::detail::vector arg_types) const { - + #if JITIFY_PRINT_LAUNCH Kernel_impl const& kernel = _kernel_inst._kernel; std::string arg_types_string = (arg_types.empty() ? "..." : @@ -2085,6 +2178,30 @@ inline CUresult KernelLauncher_impl::launch(jitify::detail::vector arg_pt return _kernel_inst._cuda_kernel->launch(_grid, _block, _smem, _stream, arg_ptrs); } +#if CUDA_VERSION >= 9000 +inline CUresult KernelLauncher_impl::coop_launch(jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types) const { +#if JITIFY_PRINT_LAUNCH + Kernel_impl const& kernel = _kernel_inst._kernel; + std::string arg_types_string = (arg_types.empty() ? "..." : + reflection::reflect_list(arg_types)); + std::cout << "Launching " + << kernel._name + << _kernel_inst._template_inst + << "<<<" << _grid << "," << _block + << "," << _smem << "," << _stream << ">>>" + << "(" << arg_types_string << ")" + << std::endl; +#endif + return _kernel_inst._cuda_kernel->coop_launch(_grid, _block, _smem, _stream, + arg_ptrs); +} + +inline CUresult KernelInstantiation_impl::coop_max_grid_size(int &max_blocks, dim3 block, size_t smem) const{ + return _cuda_kernel->coop_max_grid_size(max_blocks, block, smem); +} +#endif //cuda_version + KernelInstantiation_impl::KernelInstantiation_impl(Kernel_impl const& kernel, std::vector const& template_args) @@ -2123,15 +2240,15 @@ void KernelInstantiation_impl::print() const { void KernelInstantiation_impl::build_kernel() { Program_impl const& program = _kernel._program; - + std::string instantiation = _kernel._name + _template_inst; - + std::vector compiler_options; std::vector linker_files; std::vector linker_paths; detail::split_compiler_and_linker_options( _options, &compiler_options, &linker_files, &linker_paths); - + std::string log; std::string ptx; std::string mangled_instantiation; @@ -2148,7 +2265,7 @@ void KernelInstantiation_impl::build_kernel() { throw std::runtime_error(std::string("NVRTC error: ") + nvrtcGetErrorString(ret)); } - + #if JITIFY_PRINT_PTX std::cout << "---------------------------------------" << std::endl; std::cout << mangled_instantiation << std::endl; @@ -2158,7 +2275,7 @@ void KernelInstantiation_impl::build_kernel() { std::cout << ptx << std::endl; std::cout << "---------------------------------------" << std::endl; #endif - + _cuda_kernel->set(mangled_instantiation.c_str(), ptx.c_str(), linker_files, linker_paths); } @@ -2222,7 +2339,7 @@ void Program_impl::load_sources(std::string source, std::vector& include_paths = _config->include_paths; std::string& name = _config->name; ProgramConfig::source_map& sources = _config->sources; - + // Extract include paths from compile options std::vector::iterator iter = options.begin(); while( iter != options.end() ) { @@ -2236,7 +2353,7 @@ void Program_impl::load_sources(std::string source, } } _config->options = options; - + // Load program source if( !detail::load_source(source, sources, "", include_paths, @@ -2244,7 +2361,7 @@ void Program_impl::load_sources(std::string source, throw std::runtime_error("Source not found: " + source); } name = sources.begin()->first; - + // Load header sources for( int i=0; i<(int)headers.size(); ++i ) { if( !detail::load_source(headers[i], @@ -2255,7 +2372,7 @@ void Program_impl::load_sources(std::string source, throw std::runtime_error("Source not found: " + headers[i]); } } - + #if JITIFY_PRINT_SOURCE std::string& program_source = sources[name]; std::cout << "---------------------------------------" << std::endl; @@ -2264,13 +2381,13 @@ void Program_impl::load_sources(std::string source, detail::print_with_line_numbers(program_source); std::cout << "---------------------------------------" << std::endl; #endif - + std::vector compiler_options; std::vector linker_files; std::vector linker_paths; detail::split_compiler_and_linker_options( options, &compiler_options, &linker_files, &linker_paths); - + std::string log; nvrtcResult ret; while( (ret = detail::compile_kernel(name, sources, compiler_options, "", &log)) @@ -2289,7 +2406,7 @@ void Program_impl::load_sources(std::string source, // TODO: How to handle error? throw std::runtime_error("Runtime compilation failed"); } - + // Try to load the new header std::string include_path = detail::path_base(include_parent); if( !detail::load_source(include_name, sources, @@ -2297,13 +2414,13 @@ void Program_impl::load_sources(std::string source, file_callback) ) { // Comment-out the include line and print a warning if( !sources.count(include_parent) ) { - + // ***TODO: Unless there's another mechanism (e.g., potentially // the parent path vs. filename problem), getting // here means include_parent was found automatically // in a system include path. // We need a WAR to zap it from *its parent*. - + for( ProgramConfig::source_map::const_iterator it=sources.begin(); it != sources.end(); ++it ) { std::cout << " " << it->first << std::endl; @@ -2482,7 +2599,7 @@ CUresult parallel_for(ExecutionPolicy policy, IndexType end, Lambda const& lambda) { using namespace jitify; - + if( policy.location == HOST ) { #ifdef _OPENMP #pragma omp parallel for @@ -2500,7 +2617,7 @@ CUresult parallel_for(ExecutionPolicy policy, arg_decls.insert(arg_decls.end(), lambda._capture._arg_decls.begin(), lambda._capture._arg_decls.end()); - + std::stringstream source_ss; source_ss << "parallel_for_program\n"; for( auto const& header : policy.headers ) { @@ -2517,7 +2634,7 @@ CUresult parallel_for(ExecutionPolicy policy, " " << "\t" << lambda._func_string << ";\n" << " }\n" "}\n"; - + Program program = kernel_cache.program(source_ss.str(), policy.headers, policy.options, @@ -2529,7 +2646,7 @@ CUresult parallel_for(ExecutionPolicy policy, arg_ptrs.insert(arg_ptrs.end(), lambda._capture._arg_ptrs.begin(), lambda._capture._arg_ptrs.end()); - + size_t n = end - begin; dim3 block(policy.block_size); dim3 grid(std::min((n-1) / block.x + 1, size_t(65535))); diff --git a/jitify_example.cpp b/jitify_example.cpp index 1b4f513..65ccf47 100644 --- a/jitify_example.cpp +++ b/jitify_example.cpp @@ -129,12 +129,12 @@ bool test_kernels() { " }\n" "}\n" ; - + using jitify::reflection::reflect; using jitify::reflection::NonType; using jitify::reflection::Type; using jitify::reflection::type_of; - + thread_local static jitify::JitCache kernel_cache; jitify::Program program = kernel_cache.program (program1, // Code string specified above @@ -142,14 +142,14 @@ bool test_kernels() { {"--use_fast_math", "-I/usr/local/cuda/include"}, file_callback); - + T* indata; T* outdata; cudaMalloc((void**)&indata, sizeof(T)); cudaMalloc((void**)&outdata, sizeof(T)); T inval = 3.14159f; cudaMemcpy(indata, &inval, sizeof(T), cudaMemcpyHostToDevice); - + dim3 grid(1); dim3 block(1); CHECK_CUDA(program @@ -179,14 +179,14 @@ bool test_kernels() { .instantiate((int)C, type_of(*indata)) .configure(grid,block) .launch(indata, outdata) ); - + T outval = 0; cudaMemcpy(&outval, outdata, sizeof(T), cudaMemcpyDeviceToHost); cudaFree(outdata); cudaFree(indata); - + std::cout << inval << " -> " << outval << std::endl; - + return are_close(inval, outval); } @@ -196,16 +196,16 @@ bool test_parallel_for() { T* d_out; cudaMalloc((void**)&d_out, n*sizeof(T)); T val = 3.14159f; - + jitify::ExecutionPolicy policy(jitify::DEVICE); auto lambda = JITIFY_LAMBDA( (d_out, val) , d_out[i] = i*val ); CHECK_CUDA(jitify::parallel_for(policy, 0, n, lambda) ); - + std::vector h_out(n); cudaMemcpy(&h_out[0], d_out, n*sizeof(T), cudaMemcpyDeviceToHost); - + cudaFree(d_out); - + for( int i=0; i(); bool test_kernels_result = test_kernels(); @@ -229,11 +229,11 @@ int main(int argc, char *argv[]) { test_simple_result &= test_simple(); test_kernels_result &= test_kernels(); test_parallel_for_result &= test_parallel_for(); - + std::cout << "test_simple: " << TEST_RESULT(test_simple_result) << std::endl; std::cout << "test_kernels: " << TEST_RESULT(test_kernels_result) << std::endl; std::cout << "test_parallel_for: " << TEST_RESULT(test_parallel_for_result) << std::endl; - + return (!test_simple_result + !test_kernels_result + !test_parallel_for_result); diff --git a/stringify.cpp b/stringify.cpp index 9647fbe..75c129a 100644 --- a/stringify.cpp +++ b/stringify.cpp @@ -81,4 +81,3 @@ int main(int argc, char *argv[]) { ostream << ";" << std::endl; return 0; } -