diff --git a/CMakeLists.txt b/CMakeLists.txt index 1efaa14262566108d5c472ff84e7c259b0614a2d..0e08ef23032702bb0b04b835109121545ffd9d19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,6 +309,10 @@ option(WITH_IMAGE_CINEON "Enable CINEON and DPX Image Support" ON) option(WITH_IMAGE_HDR "Enable HDR Image Support" ON) option(WITH_IMAGE_FRAMESERVER "Enable image FrameServer Support for rendering" ON) +#IT4I +option(WITH_IT4I_MPI "Enable MPI (has to be supported by the compiler)" OFF) +option(WITH_IT4I_MIC_OFFLOAD "Enable MIC (has to be supported by the compiler)" OFF) + # Audio/Video format support option(WITH_CODEC_AVI "Enable Blenders own AVI file support (raw/jpeg)" ON) option(WITH_CODEC_FFMPEG "Enable FFMPeg Support (http://ffmpeg.org)" ${_init_CODEC_FFMPEG}) @@ -3035,12 +3039,14 @@ if(FIRST_RUN) info_cfg_option(WITH_CYCLES) info_cfg_option(WITH_FREESTYLE) info_cfg_option(WITH_OPENCOLORIO) - info_cfg_option(WITH_OPENVDB) + info_cfg_option(WITH_OPENVDB) info_cfg_text("Compiler Options:") info_cfg_option(WITH_BUILDINFO) info_cfg_option(WITH_OPENMP) info_cfg_option(WITH_RAYOPTIMIZATION) + info_cfg_option(WITH_IT4I_MPI) + info_cfg_option(WITH_IT4I_MIC_OFFLOAD) info_cfg_text("System Options:") info_cfg_option(WITH_INSTALL_PORTABLE) diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake index 7bdf79098258e5d4d2473bf81e0c4674f1325f5d..99546d3e6aa7b817443ff2497d9c70e677e5646d 100644 --- a/build_files/cmake/macros.cmake +++ b/build_files/cmake/macros.cmake @@ -502,6 +502,21 @@ function(SETUP_BLENDER_SORTED_LIBS) endif() endif() + if(WITH_IT4I_MPI) + list(APPEND BLENDER_LINK_LIBS + cycles_kernel_mpi) + endif() + + if(WITH_IT4I_MIC_OFFLOAD) + list(APPEND BLENDER_LINK_LIBS + cycles_kernel_mic) + endif() + + if(WITH_OPENMP) + list(APPEND BLENDER_LINK_LIBS + cycles_kernel_omp) + endif() + # Sort libraries 
set(BLENDER_SORTED_LIBS bf_windowmanager @@ -607,6 +622,9 @@ function(SETUP_BLENDER_SORTED_LIBS) cycles_bvh cycles_device cycles_kernel + cycles_kernel_mic + cycles_kernel_omp + cycles_kernel_mpi cycles_util cycles_subd bf_intern_opencolorio diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index a8cc4907cbf56ac14a7c4f47bb82dee57ed387f7..fe9b21ec1079c62487bb068d86c97c17b8497ebd 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -56,6 +56,18 @@ if(WITH_CYCLES_NETWORK) add_definitions(-DWITH_NETWORK) endif() +if (WITH_IT4I_MPI) + add_definitions(-DWITH_IT4I_MPI) +endif() + +if (WITH_IT4I_MIC_OFFLOAD) + add_definitions(-DWITH_IT4I_MIC_OFFLOAD) +endif() + +if (WITH_OPENMP) + add_definitions(-DWITH_OPENMP) +endif() + blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}") # avoid link failure with clang 3.4 debug diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 96dc3a59ef2df73319241f85a5f5ad8635e3934b..aa399ac27c3def588e898c898b369ef78d260db9 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -155,6 +155,14 @@ def with_osl(): def with_network(): import _cycles return _cycles.with_network + +def with_openmp(): + import _cycles + return _cycles.with_openmp + +def with_it4i_mpi(): + import _cycles + return _cycles.with_it4i_mpi def system_info(): diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 01aa619b3068a265d3827868c8aed39aa1018814..da0d8e59dd6f8c0afae8ba12da637e3445fb1690 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -29,12 +29,12 @@ import _cycles enum_devices = ( ('CPU', "CPU", "Use CPU for rendering"), - ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in user preferences"), + ('Acc', "Acc Compute", "Use GPU/MIC/MPI compute device for 
rendering, configured in user preferences"), ) if _cycles.with_network: enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),) - + enum_feature_set = ( ('SUPPORTED', "Supported", "Only use finished and supported features"), ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1), diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index b9e51dfddd4483a45cc232194f808d0dfbed13ba..ef561252f686af3a6277f7c7b5bb74acbad93c08 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -57,7 +57,7 @@ def use_cpu(context): cscene = context.scene.cycles device_type = context.user_preferences.system.compute_device_type - return (device_type == 'NONE' or cscene.device == 'CPU') + return (device_type == 'NONE' or cscene.device == 'CPU' or cscene.device == 'OMP' or cscene.device == 'MPI') def use_opencl(context): @@ -1609,7 +1609,7 @@ def draw_device(self, context): layout.prop(cscene, "feature_set") device_type = context.user_preferences.system.compute_device_type - if device_type in {'CUDA', 'OPENCL', 'NETWORK'}: + if device_type in {'CUDA', 'OPENCL', 'NETWORK', 'OMP', 'MPI'}: layout.prop(cscene, "device") if engine.with_osl() and use_cpu(context): diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 27eab0c7f681184675534ef1f4f7678be18fcace..3600a1ffd9eed1a59fba7f049b779a52957ec5fd 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -738,6 +738,22 @@ void *CCL_python_module_init() Py_INCREF(Py_False); #endif /* WITH_NETWORK */ +#ifdef WITH_OPENMP + PyModule_AddObject(mod, "with_openmp", Py_True); + Py_INCREF(Py_True); +#else /* WITH_OPENMP */ + PyModule_AddObject(mod, "with_openmp", Py_False); + Py_INCREF(Py_False); +#endif /* WITH_OPENMP */ + +#ifdef WITH_IT4I_MPI + PyModule_AddObject(mod, "with_it4i_mpi", 
Py_True); + Py_INCREF(Py_True); +#else /* WITH_IT4I_MPI */ + PyModule_AddObject(mod, "with_it4i_mpi", Py_False); + Py_INCREF(Py_False); +#endif /* WITH_IT4I_MPI */ + return (void*)mod; } @@ -754,6 +770,12 @@ CCLDeviceInfo *CCL_compute_device_list(int device_type) case 2: type = ccl::DEVICE_NETWORK; break; + case 4: + type = ccl::DEVICE_OMP; + break; + case 5: + type = ccl::DEVICE_MPI; + break; default: type = ccl::DEVICE_NONE; break; diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index f1b524f7b447ddea7b2e80dca6e7db094f284fcd..80f08c4e027fef783c7f7e56f4b5528406fe6360 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -108,7 +108,23 @@ void BlenderSession::create() void BlenderSession::create_session() { SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - bool is_cpu = session_params.device.type == DEVICE_CPU; + +// if(session_params.device.type == DEVICE_OMP || session_params.device.type == DEVICE_MPI) +// { +// if (!background) +// { +// session_params.tile_size = make_int2(width, height); +// } +// else +// { +// int tile_x = b_engine.resolution_x(); +// int tile_y = b_engine.resolution_y(); +// +// session_params.tile_size = make_int2(tile_x, tile_y); +// } +// } +// + bool is_cpu = session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP; SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); bool session_pause = BlenderSync::get_session_pause(b_scene, background); @@ -170,7 +186,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_) b_scene = b_scene_; SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - const bool is_cpu = session_params.device.type == DEVICE_CPU; + const bool is_cpu = 
session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP; SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); width = render_resolution_x(b_render); @@ -763,7 +779,7 @@ void BlenderSession::synchronize() /* on session/scene parameter changes, we recreate session entirely */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - const bool is_cpu = session_params.device.type == DEVICE_CPU; + const bool is_cpu = session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP; SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); bool session_pause = BlenderSync::get_session_pause(b_scene, background); @@ -913,6 +929,12 @@ void BlenderSession::get_progress(float& progress, double& total_time, double& r session->progress.get_tile(tile, total_time, render_time, tile_time); + if (background && (session->params.device.type == DEVICE_MPI || session->params.device.type == DEVICE_OMP)) + { + progress = (session->device->get_num_tiles() > 0) ? ((float) session->device->get_tile_id()) / session->device->get_num_tiles() : 0.0f; /* no tiles reported yet: report 0 instead of dividing by zero (NaN) */ + } + else + { sample = session->progress.get_sample(); samples_per_tile = session->tile_manager.num_samples; @@ -922,6 +944,7 @@ void BlenderSession::get_progress(float& progress, double& total_time, double& r progress = ((float)samples) / total_samples; else progress = 0.0; + } } void BlenderSession::update_bake_progress() diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 749b8c0319bb7c3255cf22c1389fdabde0085ffc..c3c97fca6a170036d0716de37dd391b10be65802 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -509,14 +509,18 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, /* device default CPU */ params.device = 
devices[0]; +//#if !defined(WITH_IT4I_MIC_OFFLOAD) && !defined(WITH_IT4I_MPI) && !defined(WITH_OPENMP) if(get_enum(cscene, "device") == 2) { /* find network device */ foreach(DeviceInfo& info, devices) if(info.type == DEVICE_NETWORK) params.device = info; } - else if(get_enum(cscene, "device") == 1) { - /* find GPU device with given id */ + //GPU, MIC, MPI + else if(get_enum(cscene, "device") == 1) +//#endif + { + /* find device with given id */ PointerRNA systemptr = b_userpref.system().ptr; PropertyRNA *deviceprop = RNA_struct_find_property(&systemptr, "compute_device"); int device_id = b_userpref.system().compute_device(); @@ -569,7 +573,13 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, } /* tiles */ - if(params.device.type != DEVICE_CPU && !background) { + if((params.device.type == DEVICE_OPENCL || + params.device.type == DEVICE_CUDA || + params.device.type == DEVICE_NETWORK || + params.device.type == DEVICE_MULTI) && + !background + ) + { /* currently GPU could be much slower than CPU when using tiles, * still need to be investigated, but meanwhile make it possible * to work in viewport smoothly @@ -578,7 +588,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.tile_size = make_int2(debug_tile_size, debug_tile_size); } - else { + else + { int tile_x = b_engine.tile_x(); int tile_y = b_engine.tile_y(); diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 2a9ec0c38182eec2d6c74dc72514f1a9a10a5a3c..5d23c75378880596c6f3f08e9b089284fbcf9b30 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -4,9 +4,14 @@ set(INC ../kernel ../kernel/svm ../kernel/osl + ../kernel/kernels/mic + ../kernel/kernels/mpi + ../kernel/kernels/omp ../util ../render ../../glew-mx + ../../../it4i/client/api + ${MPI_INCLUDE_DIR} ) set(INC_SYS @@ -63,7 +68,26 @@ if(WITH_CYCLES_DEVICE_MULTI) add_definitions(-DWITH_MULTI) endif() +if (WITH_IT4I_MPI) + 
add_definitions(-DWITH_IT4I_MPI) + list(APPEND SRC + device_mpi.cpp + ) +endif() + +if (WITH_IT4I_MIC_OFFLOAD) + add_definitions(-DWITH_IT4I_MIC_OFFLOAD) +endif() + +if (WITH_OPENMP) + add_definitions(-DWITH_OPENMP) + list(APPEND SRC + device_omp.cpp + ) +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) add_library(cycles_device ${SRC} ${SRC_HEADERS}) +target_link_libraries (cycles_device ${MPI_LIB_FILE}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 8c01bcb116fe9edf2081b53e6d28e1ecd44d8896..0b58619168189f0b8fe5b7ea1404c9fa3de6068f 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -70,20 +70,20 @@ Device::~Device() void Device::pixels_alloc(device_memory& mem) { - mem_alloc(mem, MEM_READ_WRITE); + mem_alloc("pixel", mem, MEM_READ_WRITE); } void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) { if(mem.data_type == TYPE_HALF) - mem_copy_from(mem, y, w, h, sizeof(half4)); + mem_copy_from("pixel", mem, y, w, h, sizeof(half4)); else - mem_copy_from(mem, y, w, h, sizeof(uchar4)); + mem_copy_from("pixel", mem, y, w, h, sizeof(uchar4)); } void Device::pixels_free(device_memory& mem) { - mem_free(mem); + mem_free("pixel", mem); } void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent, @@ -214,7 +214,23 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background) switch(info.type) { case DEVICE_CPU: device = device_cpu_create(info, stats, background); + break; +#ifdef WITH_OPENMP + case DEVICE_OMP: + if (device_omp_init()) + device = device_omp_create(info, stats, background); + else + device = NULL; + break; +#endif +#ifdef WITH_IT4I_MPI + case DEVICE_MPI: + if (device_mpi_init()) + device = device_mpi_create(info, stats, background); + else + device = NULL; break; +#endif #ifdef WITH_CUDA case DEVICE_CUDA: if(device_cuda_init()) @@ -276,6 +292,10 @@ string 
Device::string_from_type(DeviceType type) return "network"; else if(type == DEVICE_MULTI) return "multi"; + else if (type == DEVICE_OMP) + return "omp"; + else if (type == DEVICE_MPI) + return "mpi"; return ""; } @@ -286,6 +306,16 @@ vector<DeviceType>& Device::available_types() types.clear(); types.push_back(DEVICE_CPU); +#ifdef WITH_OPENMP + if (device_omp_init()) + types.push_back(DEVICE_OMP); +#endif + +#ifdef WITH_IT4I_MPI + if (device_mpi_init()) + types.push_back(DEVICE_MPI); +#endif + #ifdef WITH_CUDA if(device_cuda_init()) types.push_back(DEVICE_CUDA); @@ -313,6 +343,17 @@ vector<DeviceInfo>& Device::available_devices() { if(need_devices_update) { devices.clear(); + +#ifdef WITH_OPENMP + if (device_omp_init()) + device_omp_info(devices); +#endif + +#ifdef WITH_IT4I_MPI + if (device_mpi_init()) + device_mpi_info(devices); +#endif + #ifdef WITH_CUDA if(device_cuda_init()) device_cuda_info(devices); @@ -374,4 +415,163 @@ void Device::free_memory() devices.free_memory(); } +bool Device::get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer_data_pointer) +{ + int pass_offset = 0; + + foreach(Pass& pass, params.passes) { + if(pass.type != type) { + pass_offset += pass.components; + continue; + } + + float *in = (float*)buffer_data_pointer + pass_offset; + int pass_stride = params.get_passes_size(); + + float scale = (pass.filter)? 1.0f/(float)sample: 1.0f; + float scale_exposure = (pass.exposure)? scale*exposure: scale; + + int size = params.width*params.height; + + if(components == 1) { + assert(pass.components == components); + + /* scalar */ + if(type == PASS_DEPTH) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = (f == 0.0f)? 
1e10f: f*scale_exposure; + } + } + else if(type == PASS_MIST) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = saturate(f*scale_exposure); + } + } +#ifdef WITH_CYCLES_DEBUG + else if(type == PASS_BVH_TRAVERSAL_STEPS) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = f; + } + } + else if(type == PASS_RAY_BOUNCES) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = f; + } + } +#endif + else { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = f*scale_exposure; + } + } + } + else if(components == 3) { + assert(pass.components == 4); + + /* RGBA */ + if(type == PASS_SHADOW) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + float4 f = make_float4(in[0], in[1], in[2], in[3]); + float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f; + + pixels[0] = f.x*invw; + pixels[1] = f.y*invw; + pixels[2] = f.z*invw; + } + } + else if(pass.divide_type != PASS_NONE) { + /* RGB lighting passes that need to divide out color */ + pass_offset = 0; + foreach(Pass& color_pass, params.passes) { + if(color_pass.type == pass.divide_type) + break; + pass_offset += color_pass.components; + } + + float *in_divide = (float*)buffer_data_pointer + pass_offset; + + for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) { + float3 f = make_float3(in[0], in[1], in[2]); + float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); + + f = safe_divide_even_color(f*exposure, f_divide); + + pixels[0] = f.x; + pixels[1] = f.y; + pixels[2] = f.z; + } + } + else { + /* RGB/vector */ + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + float3 f = make_float3(in[0], in[1], in[2]); + + pixels[0] = f.x*scale_exposure; + pixels[1] = f.y*scale_exposure; + pixels[2] = f.z*scale_exposure; + } + } + } + else if(components == 4) { + assert(pass.components == components); + + 
/* RGBA */ + if(type == PASS_SHADOW) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { + float4 f = make_float4(in[0], in[1], in[2], in[3]); + float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f; + + pixels[0] = f.x*invw; + pixels[1] = f.y*invw; + pixels[2] = f.z*invw; + pixels[3] = 1.0f; + } + } + else if(type == PASS_MOTION) { + /* need to normalize by number of samples accumulated for motion */ + pass_offset = 0; + foreach(Pass& color_pass, params.passes) { + if(color_pass.type == PASS_MOTION_WEIGHT) + break; + pass_offset += color_pass.components; + } + + float *in_weight = (float*)buffer_data_pointer + pass_offset; + + for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) { + float4 f = make_float4(in[0], in[1], in[2], in[3]); + float w = in_weight[0]; + float invw = (w > 0.0f)? 1.0f/w: 0.0f; + + pixels[0] = f.x*invw; + pixels[1] = f.y*invw; + pixels[2] = f.z*invw; + pixels[3] = f.w*invw; + } + } + else { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { + float4 f = make_float4(in[0], in[1], in[2], in[3]); + + pixels[0] = f.x*scale_exposure; + pixels[1] = f.y*scale_exposure; + pixels[2] = f.z*scale_exposure; + + /* clamp since alpha might be > 1.0 due to russian roulette */ + pixels[3] = saturate(f.w*scale); + } + } + } + + return true; + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 30d0003b94070f45e1768f9b0fac50c362e09cb4..2a772b4897f6aaecfe8a33d2dbdbd73f112ba9d3 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,8 @@ #include "util_types.h" #include "util_vector.h" +#include "buffers.h" + CCL_NAMESPACE_BEGIN class Progress; @@ -42,7 +44,9 @@ enum DeviceType { DEVICE_OPENCL, DEVICE_CUDA, DEVICE_NETWORK, - DEVICE_MULTI + DEVICE_MULTI, + DEVICE_OMP, + DEVICE_MPI }; class DeviceInfo { @@ -201,14 +205,16 @@ public: /* statistics */ Stats &stats; + + /* regular memory */ - virtual void 
mem_alloc(device_memory& mem, MemoryType type) = 0; - virtual void mem_copy_to(device_memory& mem) = 0; - virtual void mem_copy_from(device_memory& mem, + virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0; + virtual void mem_copy_to(const char *name, device_memory& mem) = 0; + virtual void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) = 0; - virtual void mem_zero(device_memory& mem) = 0; - virtual void mem_free(device_memory& mem) = 0; + virtual void mem_zero(const char *name, device_memory& mem) = 0; + virtual void mem_free(const char *name, device_memory& mem) = 0; /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; @@ -222,7 +228,7 @@ public: (void)interpolation; /* Ignored. */ (void)extension; /* Ignored. */ }; - virtual void tex_free(device_memory& /*mem*/) {}; + virtual void tex_free(const char *name, device_memory& /*mem*/) {}; /* pixel memory */ virtual void pixels_alloc(device_memory& mem); @@ -242,6 +248,13 @@ public: virtual void task_add(DeviceTask& task) = 0; virtual void task_wait() = 0; virtual void task_cancel() = 0; + + /* tiles */ + virtual int get_tile_id(){return 0;}; + virtual int get_num_tiles(){return 0;}; + + /* pass */ + virtual bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer_data_pointer); /* opengl drawing */ virtual void draw_pixels(device_memory& mem, int y, int w, int h, diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 676b1279a80ce3c80212bf69e37b4dfc5e79f346..eec6a061998fca3a06db57759f6e3cf5354e2c9f 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -46,6 +46,10 @@ #include "util_system.h" #include "util_thread.h" +#ifdef WITH_OPENMP + #include <omp.h> +#endif + CCL_NAMESPACE_BEGIN class CPUDevice : public Device @@ -112,31 +116,31 @@ public: task_pool.stop(); 
} - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { mem.device_pointer = mem.data_pointer; mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); } - void mem_copy_to(device_memory& /*mem*/) + void mem_copy_to(const char *name, device_memory& /*mem*/) { /* no-op */ } - void mem_copy_from(device_memory& /*mem*/, + void mem_copy_from(const char *name, device_memory& /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) { /* no-op */ } - void mem_zero(device_memory& mem) + void mem_zero(const char *name, device_memory& mem) { memset((void*)mem.device_pointer, 0, mem.memory_size()); } - void mem_free(device_memory& mem) + void mem_free(const char *name, device_memory& mem) { if(mem.device_pointer) { mem.device_pointer = 0; @@ -155,6 +159,8 @@ public: InterpolationType interpolation, ExtensionType extension) { + printf("tex_alloc: %s\n", name); + VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; kernel_tex_copy(&kernel_globals, name, @@ -169,8 +175,10 @@ public: stats.mem_alloc(mem.device_size); } - void tex_free(device_memory& mem) + void tex_free(const char *name, device_memory& mem) { + printf("tex_free: %s\n", name); + if(mem.device_pointer) { mem.device_pointer = 0; stats.mem_free(mem.device_size); @@ -188,13 +196,22 @@ public: } void thread_run(DeviceTask *task) - { + { +#ifdef WITH_OPENMP + double t1 = omp_get_wtime(); +#endif + if(task->type == DeviceTask::PATH_TRACE) thread_path_trace(*task); else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) thread_shader(*task); + +#ifdef WITH_OPENMP + double t2 = omp_get_wtime(); + printf("DEVICE: CPU, %f\n", t2 - t1); +#endif } class CPUDeviceTask : public DeviceTask { @@ -460,6 +477,7 @@ public: void task_add(DeviceTask& task) { + /* split task into smaller ones */ list<DeviceTask> tasks; diff --git 
a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 80c8cb1e59229b7f75fa765b00f45257d231b375..734b8100fe31b7da82eddcd31dfa5ead77d810c0 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -397,7 +397,7 @@ public: return (result == CUDA_SUCCESS); } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { cuda_push_context(); CUdeviceptr device_pointer; @@ -409,7 +409,7 @@ public: cuda_pop_context(); } - void mem_copy_to(device_memory& mem) + void mem_copy_to(const char *name, device_memory& mem) { cuda_push_context(); if(mem.device_pointer) @@ -417,7 +417,7 @@ public: cuda_pop_context(); } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) { size_t offset = elem*y*w; size_t size = elem*w*h; @@ -433,7 +433,7 @@ public: cuda_pop_context(); } - void mem_zero(device_memory& mem) + void mem_zero(const char *name, device_memory& mem) { memset((void*)mem.data_pointer, 0, mem.memory_size()); @@ -443,7 +443,7 @@ public: cuda_pop_context(); } - void mem_free(device_memory& mem) + void mem_free(const char *name, device_memory& mem) { if(mem.device_pointer) { cuda_push_context(); @@ -596,8 +596,8 @@ public: else { cuda_pop_context(); - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + mem_alloc(name, mem, MEM_READ_ONLY); + mem_copy_to(name, mem); cuda_push_context(); @@ -627,8 +627,8 @@ public: cuda_pop_context(); } else { - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + mem_alloc(name, mem, MEM_READ_ONLY); + mem_copy_to(name, mem); cuda_push_context(); @@ -654,7 +654,7 @@ public: tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); } - void tex_free(device_memory& mem) + void tex_free(const char *name, device_memory& mem) { if(mem.device_pointer) { 
if(tex_interp_map[mem.device_pointer]) { @@ -670,7 +670,7 @@ public: } else { tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem_free(mem); + mem_free(name, mem); } } } diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index 47584ae6d226714f1d24610d2790ffddfb5b772e..d4622ce0acd9608bf12eb0962325d9deddb4ad13 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -39,6 +39,20 @@ string device_cpu_capabilities(void); string device_opencl_capabilities(void); string device_cuda_capabilities(void); +#ifdef WITH_OPENMP +string device_omp_capabilities(void); +bool device_omp_init(void); +Device *device_omp_create(DeviceInfo& info, Stats &stats, bool background); +void device_omp_info(vector<DeviceInfo>& devices); +#endif + +#ifdef WITH_IT4I_MPI +string device_mpi_capabilities(void); +void device_mpi_info(vector<DeviceInfo>& devices); +bool device_mpi_init(void); +Device *device_mpi_create(DeviceInfo& info, Stats &stats, bool background); +#endif + CCL_NAMESPACE_END #endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_mpi.cpp b/intern/cycles/device/device_mpi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1adf5f2e353eb9dc7ddd45bd9d0aa66eac586381 --- /dev/null +++ b/intern/cycles/device/device_mpi.cpp @@ -0,0 +1,1034 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <stdlib.h> +#include <string.h> + +#include "device.h" +#include "device_intern.h" + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_types.h" +#include "kernel_globals.h" + +#include "util_foreach.h" + +#include "kernel_mpi.h" + +#include <mpi.h> +#include <omp.h> + +CCL_NAMESPACE_BEGIN + +class MultiMPIDevice : public Device +{ +public: + DedicatedTaskPool task_pool; + KernelGlobals kernel_globals; + device_ptr rgba_pixels; + + int tile_id; + int num_tiles; + + MultiMPIDevice(DeviceInfo& info, Stats &stats, bool background_) + : Device(info, stats, background_) + { + tile_id = 0; + num_tiles = 0; + rgba_pixels = NULL; + mpi_alloc_kg(info.num == 1); + } + + ~MultiMPIDevice() + { + mpi_free_kg(); + } + + void mem_alloc(const char *name, device_memory& mem, MemoryType type) + { + mem.device_pointer = mem.data_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + if (!strcmp(name, "pixel")) + { + rgba_pixels = mem.device_pointer; + } + + mpi_mem_alloc(name, mem.device_pointer, mem.device_size); + } + + void mem_copy_to(const char *name, device_memory& mem) + { + mpi_mem_copy_to(mem.device_pointer, mem.device_size, 0); + } + + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) + { + } + + void mem_zero(const char *name, device_memory& mem) + { + if (mem.device_pointer) + { + memset((void*) mem.device_pointer, 0, mem.memory_size()); + + mpi_mem_zero(mem.device_pointer, mem.device_size, 0); + } + } + + void mem_free(const char *name, device_memory& mem) + { + if (mem.device_pointer) + { + if (!strcmp(name, "pixel")) + { + rgba_pixels = NULL; + } + + mpi_mem_free(mem.device_pointer, mem.device_size); + + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void const_copy_to(const char *name, void *host, size_t size) + { + kernel_const_copy(&kernel_globals, name, host, size); + mpi_const_copy(name, (char*) host, size); + } + 
+ void tex_alloc(const char *name, + device_memory& mem, + InterpolationType + interpolation, + ExtensionType extension) + { + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + mem.device_pointer = mem.data_pointer; + + kernel_tex_copy(&kernel_globals, + name, + mem.data_pointer, + mem.data_width, + mem.data_height, + mem.data_depth, + interpolation, + extension); + + mpi_tex_copy(name, + mem.device_pointer, + mem.device_size, + mem.data_width, + mem.data_height, + mem.data_depth, + interpolation, + (int) extension); + } + + void tex_free(const char *name, device_memory& mem) + { + if (mem.device_pointer) + { + mpi_tex_free(name, mem.device_pointer, mem.device_size); + + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + int get_split_task_count(DeviceTask& task) + { + return 1; + } + + class MultiMPIDeviceTask : public DeviceTask + { + public: + + MultiMPIDeviceTask(MultiMPIDevice *device, DeviceTask& task) + : DeviceTask(task) + { + run = function_bind(&MultiMPIDevice::thread_run, device, this); + } + }; + + void task_add(DeviceTask& task) + { + task_pool.push(new MultiMPIDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + void thread_run(DeviceTask *task) + { +#ifdef WITH_OPENMP + double t1 = omp_get_wtime(); +#endif + + if (task->type == DeviceTask::PATH_TRACE) + thread_path_trace(*task); + else if (task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); + else if (task->type == DeviceTask::SHADER) + thread_shader(*task); + +#ifdef WITH_OPENMP + double t2 = omp_get_wtime(); + printf("DEVICE: MPI, %f\n", t2 - t1); +#endif + } + + void receive_path_buffer_progressive(DeviceTask& task, RenderTile &tile, int offset, int stride) + { + const int dev_count = info.multi_devices.size(); + + int tile_x = tile.buffers->params.full_x; + int tile_y = tile.buffers->params.full_y; + int tile_h = 
tile.buffers->params.height; + int tile_w = tile.buffers->params.width; + + int pass_stride = tile.buffers->params.get_passes_size(); + int end_sample = tile.start_sample + tile.num_samples; + + int tile_step = tile_h / info.multi_devices.size(); + int tile_last = tile_h - (info.multi_devices.size() - 1) * tile_step; + + const int dev_countAll = dev_count + 1; + std::vector<int> displsBuf(dev_countAll); + std::vector<int> recvcountsBuf(dev_countAll); + displsBuf[0] = 0; + recvcountsBuf[0] = 0; + + std::vector<int> displsByte(dev_countAll); + std::vector<int> recvcountsByte(dev_countAll); + displsByte[0] = 0; + recvcountsByte[0] = 0; + + std::vector<int> sample_finished(dev_count); + std::vector<int> displsSample(dev_countAll); + std::vector<int> recvcountsSample(dev_countAll); + displsSample[0] = 0; + recvcountsSample[0] = 0; + + for (int dev = 0; dev < dev_count; dev++) + { + int tile_y2 = tile_y + tile_step * dev; + int tile_h2 = (dev_count - 1 == dev) ? tile_last : tile_step; + + displsBuf[dev + 1] = (offset + tile_x + tile_y2 * stride) * pass_stride * sizeof (float); + recvcountsBuf[dev + 1] = tile_w * tile_h2 * pass_stride * sizeof (float); + + displsByte[dev + 1] = (offset + tile_x + tile_y2 * stride) * sizeof (uchar4); + recvcountsByte[dev + 1] = tile_w * tile_h2 * sizeof (uchar4); + + displsSample[dev + 1] = dev * sizeof (int); + recvcountsSample[dev + 1] = sizeof (int); + } + + int reqFinished = 0; + + if (rgba_pixels != NULL) + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + else + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + tile.sample = end_sample; + task.update_progress(&tile); + } + + virtual int get_tile_id() + { + return tile_id; + }; + + virtual int get_num_tiles() + { + return num_tiles; + }; + + void receive_path_buffer_offline(DeviceTask& task, RenderTile &tile, int offset, int stride) + { + const 
int dev_count = info.multi_devices.size(); + + int tile_x = tile.buffers->params.full_x; + int tile_y = tile.buffers->params.full_y; + int tile_h = tile.buffers->params.height; + int tile_w = tile.buffers->params.width; + + tile_x = tile.x; + tile_y = tile.y; + tile_h = tile.h; + tile_w = tile.w; + + offset = tile.offset; + stride = tile.stride; + + num_tiles = tile_h; + tile_id = 0; + + int pass_stride = tile.buffers->params.get_passes_size(); + int end_sample = tile.start_sample + tile.num_samples; + + int tile_step = 1;//TILE_STEP; + + if (getenv("IT4I_OMP_TILE_STEP")) + { + tile_step = atoi(getenv("IT4I_OMP_TILE_STEP")); + printf("IT4I_OMP_TILE_STEP: %d\n", tile_step); + } + + const int dev_countAll = dev_count + 1; + std::vector<int> displsBuf(dev_countAll); + std::vector<int> recvcountsBuf(dev_countAll); + displsBuf[0] = 0; + recvcountsBuf[0] = 0; + + std::vector<int> displsByte(dev_countAll); + std::vector<int> recvcountsByte(dev_countAll); + displsByte[0] = 0; + recvcountsByte[0] = 0; + + std::vector<int> sample_finished(dev_count); + std::vector<int> displsSample(dev_countAll); + std::vector<int> recvcountsSample(dev_countAll); + displsSample[0] = 0; + recvcountsSample[0] = 0; + + std::vector<int> row_finished(dev_count); + std::vector<int> displsRow(dev_countAll); + std::vector<int> recvcountsRow(dev_countAll); + displsRow[0] = 0; + recvcountsRow[0] = 0; + + std::vector<int> reqJob(dev_count); + std::vector<int> displsJob(dev_countAll); + std::vector<int> sendcountsJob(dev_countAll); + displsJob[0] = 0; + sendcountsJob[0] = 0; + + int tile_y_node = tile_y + tile_step*dev_count; + + for (int dev = 0; dev < dev_count; dev++) + { + int tile_y2 = tile_y + tile_step * dev; + int tile_h2 = tile_step; + + displsBuf[dev + 1] = (offset + tile_x + tile_y2 * stride) * pass_stride * sizeof (float); + recvcountsBuf[dev + 1] = tile_w * tile_h2 * pass_stride * sizeof (float); + + displsByte[dev + 1] = (offset + tile_x + tile_y2 * stride) * sizeof (uchar4); + 
recvcountsByte[dev + 1] = tile_w * tile_h2 * sizeof (uchar4); + + displsSample[dev + 1] = dev * sizeof (int); + recvcountsSample[dev + 1] = sizeof (int); + + displsRow[dev + 1] = dev * sizeof (int); + recvcountsRow[dev + 1] = sizeof (int); + + displsJob[dev + 1] = dev * sizeof (int); + sendcountsJob[dev + 1] = sizeof (int); + reqJob[dev] = -1; + } + + int reqFinished = 0; + + while (true) + { + MPI_Gatherv(NULL, 0, MPI_BYTE, &sample_finished[0], &recvcountsSample[0], &displsSample[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + for (int i = 0; i < dev_count; i++) + { + //printf("SERVER: sample_finished: %d\n", sample_finished[i]); + //printf("SERVER: row_finished: %d\n", row_finished[i]); + + displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float); + displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4); + } + + if (rgba_pixels != NULL) + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + else + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + int min_count = 1; + for (int i = 0; i < dev_count; i++) + { + if (sample_finished[i] == 0 && tile_y_node < tile_h) + { + reqJob[i] = tile_y_node; + sample_finished[i] = 1; + + //displsBuf[i + 1] = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + //displsByte[i + 1] = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4); + + tile_y_node += tile_step; + + tile_id += tile_step; + } + else + { + reqJob[i] = -1; + } + + if (min_count > sample_finished[i]) + min_count = sample_finished[i]; + } + + task.update_progress(&tile); + + if (reqFinished != 0) + { + for (int i = 0; i < dev_count; i++) + { + reqJob[i] = -2; + } + } + + MPI_Scatterv(&reqJob[0], &sendcountsJob[0], &displsJob[0], 
MPI_BYTE, NULL, 0, MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD); + + if (reqFinished != 0) + { + //printf("SERVER: finished %f\n", omp_get_wtime()); + //fflush(0); + break; + } + + if (min_count == 0 && tile_y_node >= tile_h) + { + reqFinished = 1; + } + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + reqFinished = 1; + } + } + + MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + for (int i = 0; i < dev_count; i++) + { + displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float); + displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4); + } + + if (rgba_pixels != NULL) + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + else + MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + + // { + // for (int d = 0; d < MAX_NODE_DEVICES; d++) + // { + // MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // + // for (int i = 0; i < dev_count; i++) + // { + // displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float); + // displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4); + // } + // + // if (rgba_pixels != NULL) + // MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // else + // MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // } + // } + } + + // void receive_path_buffer_offline(DeviceTask& task, RenderTile &tile, int offset, int stride) + // { + // const int dev_count = info.multi_devices.size(); + // + // int tile_x = tile.buffers->params.full_x; + // int tile_y = 
tile.buffers->params.full_y; + // int tile_h = tile.buffers->params.height; + // int tile_w = tile.buffers->params.width; + // + // tile_x = tile.x; + // tile_y = tile.y; + // tile_h = tile.h; + // tile_w = tile.w; + // + // offset = tile.offset; + // stride = tile.stride; + // + // num_tiles = tile_h; + // tile_id = 0; + // + // int pass_stride = tile.buffers->params.get_passes_size(); + // int end_sample = tile.start_sample + tile.num_samples; + // + // int tile_step = TILE_STEP; + // + // const int dev_countAll = dev_count + 1; + // std::vector<int> displsBuf(dev_countAll); + // std::vector<int> recvcountsBuf(dev_countAll); + // displsBuf[0] = 0; + // recvcountsBuf[0] = 0; + // + // std::vector<int> displsByte(dev_countAll); + // std::vector<int> recvcountsByte(dev_countAll); + // displsByte[0] = 0; + // recvcountsByte[0] = 0; + // + // std::vector<int> sample_finished(dev_count); + // std::vector<int> displsSample(dev_countAll); + // std::vector<int> recvcountsSample(dev_countAll); + // displsSample[0] = 0; + // recvcountsSample[0] = 0; + // + // std::vector<int> row_finished(dev_count); + // std::vector<int> displsRow(dev_countAll); + // std::vector<int> recvcountsRow(dev_countAll); + // displsRow[0] = 0; + // recvcountsRow[0] = 0; + // + // std::vector<int> reqJob(dev_count); + // std::vector<int> displsJob(dev_countAll); + // std::vector<int> sendcountsJob(dev_countAll); + // displsJob[0] = 0; + // sendcountsJob[0] = 0; + // + // int tile_y_node = tile_y + dev_count*TILE_STEP; + // + // for (int dev = 0; dev < dev_count; dev++) + // { + // int tile_y2 = tile_y + tile_step * dev; + // int tile_h2 = tile_step; + // + // displsBuf[dev + 1] = (offset + tile_x + tile_y2 * stride) * pass_stride * sizeof (float); + // recvcountsBuf[dev + 1] = tile_w * tile_h2 * pass_stride * sizeof (float); + // + // displsByte[dev + 1] = (offset + tile_x + tile_y2 * stride) * sizeof (uchar4); + // recvcountsByte[dev + 1] = tile_w * tile_h2 * sizeof (uchar4); + // + // 
displsSample[dev + 1] = dev * sizeof (int); + // recvcountsSample[dev + 1] = sizeof (int); + // + // displsRow[dev + 1] = dev * sizeof (int); + // recvcountsRow[dev + 1] = sizeof (int); + // + // displsJob[dev + 1] = dev * sizeof (int); + // sendcountsJob[dev + 1] = sizeof (int); + // reqJob[dev] = -1; + // } + // + // int reqFinished = 0; + // + // while (true) + // { + // MPI_Gatherv(NULL, 0, MPI_BYTE, &sample_finished[0], &recvcountsSample[0], &displsSample[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // + // MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // + // for (int i = 0; i < dev_count; i++) + // { + // //printf("SERVER: sample_finished: %d\n", sample_finished[i]); + // //printf("SERVER: row_finished: %d\n", row_finished[i]); + // + // displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float); + // displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4); + // } + // + // if (rgba_pixels != NULL) + // MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // else + // MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + // + // int min_count = end_sample; + // for (int i = 0; i < dev_count; i++) + // { + // if (min_count > sample_finished[i]) + // min_count = sample_finished[i]; + // + // if (sample_finished[i] == end_sample && tile_y_node < tile_h) + // { + // reqJob[i] = tile_y_node; + // + // //displsBuf[i + 1] = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + // //displsByte[i + 1] = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4); + // + // tile_y_node+=TILE_STEP; + // + // tile_id+=TILE_STEP; + // } + // else + // { + // reqJob[i] = -1; + // } + // } + // + // task.update_progress(&tile); + // + // if (reqFinished != 0) + // { + // for (int i = 0; i < dev_count; 
i++) + // { + // reqJob[i] = -2; + // } + // } + // + // MPI_Scatterv(&reqJob[0], &sendcountsJob[0], &displsJob[0], MPI_BYTE, NULL, 0, MPI_BYTE, 0, MPI_COMM_WORLD); + // //MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD); + // + // if (reqFinished != 0) + // { + // break; + // } + // + // if (min_count == end_sample && tile_y_node >= tile_h) + // { + // reqFinished = 1; + // } + // + // if (task_pool.canceled()) + // { + // if (task.need_finish_queue == false) + // reqFinished = 1; + // } + // } + // + //// { + //// for (int d = 0; d < MAX_NODE_DEVICES; d++) + //// { + //// MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD); + //// + //// for (int i = 0; i < dev_count; i++) + //// { + //// displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float); + //// displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4); + //// } + //// + //// if (rgba_pixels != NULL) + //// MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD); + //// else + //// MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD); + //// } + //// } + // } + + void thread_path_trace(DeviceTask& task) + { + //printf("SERVER: thread_path_trace\n"); + + //double t[256]; + //int index = 0; + + //t[index++] = omp_get_wtime(); + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + return; + } + + RenderTile tile; + + int tile_h = 0; + int tile_w = 0; + int num_samples_orig = 0; + + while (task.acquire_tile(this, tile)) + { + int offset, stride; + tile.buffers->params.get_offset_stride(offset, stride); + + int tile_x = tile.buffers->params.full_x; + int tile_y = tile.buffers->params.full_y; + tile_h = tile.buffers->params.height; + tile_w = tile.buffers->params.width; + num_samples_orig = tile.num_samples_orig; + + tile.sample = 
tile.start_sample + tile.num_samples; + + if (!tile.progressive) + { + tile_x = tile.x; + tile_y = tile.y; + tile_h = tile.h; + tile_w = tile.w; + + offset = tile.offset; + stride = tile.stride; + } + + bool progressive = tile.progressive; // || background; + + mpi_path_trace(kernel_globals.__data_size, (char*) rgba_pixels, tile.half_float, (char*) tile.buffer, (char*) tile.rng_state, progressive, tile.start_sample, tile.num_samples, tile_x, tile_y, offset, stride, tile_h, tile_w); + + if (progressive) + { + receive_path_buffer_progressive(task, tile, offset, stride); + } + else + { +// if (tile_h % TILE_STEP != 0) +// { +// printf("ERROR: tile_h is not divided with %d\n", TILE_STEP); +// return; +// } + receive_path_buffer_offline(task, tile, offset, stride); + } + + tile_id = tile_h; + + task.release_tile(tile); + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + break; + } + } + + //t[index++] = omp_get_wtime(); + //printf("=========MPI: thread_path_trace========: t: %f, w: %d, h:%d, s:%d\n", t[index - 1] - t[0], tile_w, tile_h, num_samples_orig); + } + + void thread_film_convert(DeviceTask& task) + { + } + + void thread_shader(DeviceTask& task) + { + } + + bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams ¶ms, float* buffer) + { + printf("get_pass_rect, sample: %d\n", sample); + + int pass_offset = 0; + + foreach(Pass& pass, params.passes) + { + if (pass.type != type) + { + pass_offset += pass.components; + continue; + } + + float *in = (float*) buffer + pass_offset; + int pass_stride = params.get_passes_size(); + + float scale = (pass.filter) ? 1.0f / (float) sample : 1.0f; + float scale_exposure = (pass.exposure) ? 
scale * exposure : scale; + + int size = params.width * params.height; + + if (components == 1) + { + assert(pass.components == components); + + /* scalar */ + if (type == PASS_DEPTH) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = (f == 0.0f) ? 1e10f : f*scale_exposure; + } + in += size*pass_stride; + pixels += size; + } + else if (type == PASS_MIST) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = saturate(f * scale_exposure); + } + in += size*pass_stride; + pixels += size; + } +#ifdef WITH_CYCLES_DEBUG + else if (type == PASS_BVH_TRAVERSAL_STEPS) + { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) + { + float f = *in; + pixels[0] = f; + } + } + else if (type == PASS_RAY_BOUNCES) + { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) + { + float f = *in; + pixels[0] = f; + } + } +#endif + else + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = f*scale_exposure; + } + in += size*pass_stride; + pixels += size; + } + } + else if (components == 3) + { + assert(pass.components == 4); + + /* RGBA */ + if (type == PASS_SHADOW) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 3 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; + + pixels[i * 3 + 0] = f.x*invw; + pixels[i * 3 + 1] = f.y*invw; + pixels[i * 3 + 2] = f.z*invw; + } + in += size*pass_stride; + pixels += size * 3; + + } + else if (pass.divide_type != PASS_NONE) + { + /* RGB lighting passes that need to divide out color */ + pass_offset = 0; + + foreach(Pass& color_pass, params.passes) + { + if (color_pass.type == pass.divide_type) + break; + pass_offset += color_pass.components; + } + + float *in_divide = (float*) buffer + pass_offset; + +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, in_divide += pass_stride, pixels += 3 + float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]); + float3 f_divide = make_float3(in_divide[i * pass_stride + 0], in_divide[i * pass_stride + 1], in_divide[i * pass_stride + 2]); + + f = safe_divide_even_color(f*exposure, f_divide); + + pixels[i * 3 + 0] = f.x; + pixels[i * 3 + 1] = f.y; + pixels[i * 3 + 2] = f.z; + } + + in += size*pass_stride; + in_divide += size*pass_stride; + pixels += size * 3; + } + else + { + /* RGB/vector */ +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 3 + float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]); + + pixels[i * 3 + 0] = f.x*scale_exposure; + pixels[i * 3 + 1] = f.y*scale_exposure; + pixels[i * 3 + 2] = f.z*scale_exposure; + } + + in += size*pass_stride; + pixels += size * 3; + } + } + else if (components == 4) + { + assert(pass.components == components); + + /* RGBA */ + if (type == PASS_SHADOW) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; + + pixels[i * 4 + 0] = f.x*invw; + pixels[i * 4 + 1] = f.y*invw; + pixels[i * 4 + 2] = f.z*invw; + pixels[i * 4 + 3] = 1.0f; + } + + in += size*pass_stride; + pixels += size * 4; + } + else if (type == PASS_MOTION) + { + /* need to normalize by number of samples accumulated for motion */ + pass_offset = 0; + + foreach(Pass& color_pass, params.passes) + { + if (color_pass.type == PASS_MOTION_WEIGHT) + break; + pass_offset += color_pass.components; + } + + float *in_weight = (float*) buffer + pass_offset; + +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, in_weight += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float w = in_weight[i * pass_stride + 0]; + float invw = (w > 0.0f) ? 1.0f / w : 0.0f; + + pixels[i * 4 + 0] = f.x*invw; + pixels[i * 4 + 1] = f.y*invw; + pixels[i * 4 + 2] = f.z*invw; + pixels[i * 4 + 3] = f.w*invw; + } + + in += size*pass_stride; + in_weight += size*pass_stride; + pixels += size * 4; + } + else + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + + pixels[i * 4 + 0] = f.x*scale_exposure; + pixels[i * 4 + 1] = f.y*scale_exposure; + pixels[i * 4 + 2] = f.z*scale_exposure; + + /* clamp since alpha might be > 1.0 due to russian roulette */ + //pixels[i * 4 + 3] = saturate(f.w*scale); + pixels[i * 4 + 3] = saturate(f.w); + } + + in += size*pass_stride; + pixels += size * 4; + } + } + + return true; + } + return false; + } + + +}; + +Device *device_mpi_create(DeviceInfo& info, Stats &stats, bool background) +{ + return new MultiMPIDevice(info, stats, background); +} + +bool device_mpi_init(void) +{ + return true; +} + +void device_mpi_info(vector<DeviceInfo>& devices) +{ + if (getCountOfDevices() < 1) + 
return; + + DeviceInfo info; + + info.type = DEVICE_MPI; + info.description = string_printf("MPI_%d", getCountOfDevices()); + info.num = 0; + info.id = string_printf("MPI_0"); + info.advanced_shading = true; + info.pack_images = false; + + //#ifdef WITH_IT4I_MIC_OFFLOAD + // DeviceInfo infoMICS; + // + // infoMICS.type = DEVICE_MPI; + // infoMICS.description = string_printf("MPI_%d with MICS", getCountOfDevices()); + // infoMICS.num = 1; + // infoMICS.id = string_printf("MPI_1"); + // infoMICS.advanced_shading = true; + // infoMICS.pack_images = false; + // + //#endif + + for (int i = 0; i < getCountOfDevices(); i++) + { + DeviceInfo subinfo; + + subinfo.type = DEVICE_MPI; + subinfo.description = string_printf("MPI_%d", i); + subinfo.num = i + 1; + subinfo.id = string_printf("MPI_%d", i); + subinfo.advanced_shading = true; + subinfo.pack_images = false; + + info.multi_devices.push_back(subinfo); + + //#ifdef WITH_IT4I_MIC_OFFLOAD + // infoMICS.multi_devices.push_back(subinfo); + //#endif + } + + devices.insert(devices.begin(), info); + + //#ifdef WITH_IT4I_MIC_OFFLOAD + // devices.insert(devices.begin(), infoMICS); + //#endif +} + +string device_mpi_capabilities(void) +{ + return ""; +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 069305e8a292cae9a0945744eb3894d8bcdb7189..8822c60115983f93f93d5895d89190d02498fc72 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -98,30 +98,30 @@ public: return true; } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { foreach(SubDevice& sub, devices) { mem.device_pointer = 0; - sub.device->mem_alloc(mem, type); + sub.device->mem_alloc(name, mem, type); sub.ptr_map[unique_ptr] = mem.device_pointer; } mem.device_pointer = unique_ptr++; } - void mem_copy_to(device_memory& mem) + void mem_copy_to(const char *name, device_memory& mem) { 
device_ptr tmp = mem.device_pointer; foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; - sub.device->mem_copy_to(mem); + sub.device->mem_copy_to(name, mem); } mem.device_pointer = tmp; } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) { device_ptr tmp = mem.device_pointer; int i = 0, sub_h = h/devices.size(); @@ -131,32 +131,32 @@ public: int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h; mem.device_pointer = sub.ptr_map[tmp]; - sub.device->mem_copy_from(mem, sy, w, sh, elem); + sub.device->mem_copy_from(name, mem, sy, w, sh, elem); i++; } mem.device_pointer = tmp; } - void mem_zero(device_memory& mem) + void mem_zero(const char *name, device_memory& mem) { device_ptr tmp = mem.device_pointer; foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; - sub.device->mem_zero(mem); + sub.device->mem_zero(name, mem); } mem.device_pointer = tmp; } - void mem_free(device_memory& mem) + void mem_free(const char *name, device_memory& mem) { device_ptr tmp = mem.device_pointer; foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; - sub.device->mem_free(mem); + sub.device->mem_free(name, mem); sub.ptr_map.erase(sub.ptr_map.find(tmp)); } @@ -186,13 +186,13 @@ public: mem.device_pointer = unique_ptr++; } - void tex_free(device_memory& mem) + void tex_free(const char *name, device_memory& mem) { device_ptr tmp = mem.device_pointer; foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; - sub.device->tex_free(mem); + sub.device->tex_free(name, mem); sub.ptr_map.erase(sub.ptr_map.find(tmp)); } diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index cf4a05de8fc35f321b33fda81549432a202f3fe4..f0286d4a5559ff6d6837b3a0f274e3084482bf73 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -82,7 
+82,7 @@ public: snd.write(); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { thread_scoped_lock lock(rpc_lock); @@ -95,7 +95,7 @@ public: snd.write(); } - void mem_copy_to(device_memory& mem) + void mem_copy_to(const char *name, device_memory& mem) { thread_scoped_lock lock(rpc_lock); @@ -106,7 +106,7 @@ public: snd.write_buffer((void*)mem.data_pointer, mem.memory_size()); } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) { thread_scoped_lock lock(rpc_lock); @@ -125,7 +125,7 @@ public: rcv.read_buffer((void*)mem.data_pointer, data_size); } - void mem_zero(device_memory& mem) + void mem_zero(const char *name, device_memory& mem) { thread_scoped_lock lock(rpc_lock); @@ -135,7 +135,7 @@ public: snd.write(); } - void mem_free(device_memory& mem) + void mem_free(const char *name, device_memory& mem) { if(mem.device_pointer) { thread_scoped_lock lock(rpc_lock); @@ -186,7 +186,7 @@ public: snd.write_buffer((void*)mem.data_pointer, mem.memory_size()); } - void tex_free(device_memory& mem) + void tex_free(const char *name, device_memory& mem) { if(mem.device_pointer) { thread_scoped_lock lock(rpc_lock); @@ -654,7 +654,7 @@ protected: task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this); + task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this, _1); task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1); task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); diff --git a/intern/cycles/device/device_omp.cpp b/intern/cycles/device/device_omp.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..79dcf664651e499da9ea7649c97f76ede0205033 --- /dev/null +++ b/intern/cycles/device/device_omp.cpp @@ -0,0 +1,1354 @@ +#include <stdlib.h> +#include <string.h> + +#include "device.h" +#include "device_intern.h" + +#include "kernel.h" +//#include "kernel_compat_omp.h" +// +//#include "kernel_types.h" +//#include "kernel_globals.h" +// +//#include "buffers.h" +// +//#include "util_debug.h" +#include "util_foreach.h" +//#include "util_function.h" +//#include "util_logging.h" +//#include "util_opengl.h" +//#include "util_progress.h" +//#include "util_system.h" +//#include "util_thread.h" +// +//#include "kernel_omp.h" + +#include <boost/algorithm/string.hpp> +#include <omp.h> + +#include "kernel_omp.h" + +#ifdef WITH_IT4I_MIC_OFFLOAD +#include "kernel_mic.h" +#endif + +#define SIZEOF_UCHAR4 (sizeof(unsigned char)*4) + +CCL_NAMESPACE_BEGIN + +class OMPDevice : public Device +{ +public: + DedicatedTaskPool task_pool; + device_ptr kernel_globals_cpu; + std::vector<device_ptr> kernel_globals_mics; + + device_ptr rgba_pixels; + int tile_id; + int num_tiles; + + OMPDevice(DeviceInfo& info, Stats &stats, bool background) + : Device(info, stats, background) + { + printf("OMPDevice\n"); + rgba_pixels = NULL; + tile_id = 0; + num_tiles = 0; + + kernel_globals_mics.resize(info.multi_devices.size()); + kernel_globals_cpu = omp_alloc_kg(info.num); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + kernel_globals_mics[dev] = mic_alloc_kg(info.multi_devices[dev].num); + } +#endif + } + + ~OMPDevice() + { + printf("~OMPDevice\n"); + omp_free_kg(info.num, kernel_globals_cpu); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_free_kg(info.multi_devices[dev].num, kernel_globals_mics[dev]); + } +#endif + + task_pool.stop(); + } + + void mem_alloc(const char *name, device_memory& mem, MemoryType type) + { + printf("mem_alloc: %s\n", name); + 
mem.device_pointer = mem.data_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + //printf(""); + + if (!strcmp(name, "pixel")) + { + rgba_pixels = mem.device_pointer; + } + + omp_mem_alloc(info.num, (char*) mem.device_pointer, mem.device_size); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_mem_alloc(info.multi_devices[dev].num, (char*) mem.device_pointer, mem.device_size); + } +#endif + } + + void mem_copy_to(const char *name, device_memory& mem) + { + printf("mem_copy_to: %s\n", name); + + omp_mem_copy_to(info.num, (char*) mem.device_pointer, mem.device_size, NULL); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_mem_copy_to(info.multi_devices[dev].num, (char*) mem.device_pointer, mem.device_size, NULL); + } +#endif + } + + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) + { + } + + void mem_zero(const char *name, device_memory& mem) + { + printf("mem_zero: %s\n", name); + + if (mem.device_pointer) + { + memset((void*) mem.device_pointer, 0, mem.memory_size()); + + omp_mem_zero(info.num, (char*) mem.device_pointer, mem.device_size); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_mem_zero(info.multi_devices[dev].num, (char*) mem.device_pointer, mem.device_size); + } +#endif + } + } + + void mem_free(const char *name, device_memory& mem) + { + printf("mem_free: %s\n", name); + + if (mem.device_pointer) + { + if (!strcmp(name, "pixel")) + { + rgba_pixels = NULL; + } + + omp_mem_free(info.num, (char*) mem.device_pointer, mem.device_size); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_mem_free(info.multi_devices[dev].num, (char*) mem.device_pointer, mem.device_size); + } +#endif + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + + } + + 
void const_copy_to(const char *name, void *host, size_t size) + { + printf("const_copy_to: %s\n", name); + + omp_const_copy(info.num, kernel_globals_cpu, name, (char*) host, size); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_const_copy(info.multi_devices[dev].num, kernel_globals_mics[dev], name, (char*) host, size); + } +#endif + } + + void tex_alloc(const char *name, + device_memory& mem, + InterpolationType interpolation, + ExtensionType extension) + { + printf("tex_alloc: %s\n", name); + + mem.device_pointer = mem.data_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + omp_tex_copy(info.num, kernel_globals_cpu, + name, + (char*) mem.device_pointer, + mem.device_size, + mem.data_width, + mem.data_height, + mem.data_depth, + interpolation, + (int) extension); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_tex_copy(info.multi_devices[dev].num, kernel_globals_mics[dev], + name, + (char*) mem.device_pointer, + mem.device_size, + mem.data_width, + mem.data_height, + mem.data_depth, + interpolation, + (int) extension); + } +#endif + } + + void tex_free(const char *name, device_memory& mem) + { + printf("tex_free: %s\n", name); + + if (mem.device_pointer) + { + omp_tex_free(info.num, kernel_globals_cpu, name, (char*) mem.device_pointer, mem.device_size); + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < info.multi_devices.size(); dev++) + { + mic_tex_free(info.multi_devices[dev].num, kernel_globals_mics[dev], name, (char*) mem.device_pointer, mem.device_size); + } +#endif + + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void *osl_memory() + { + return NULL; + } + + void thread_run(DeviceTask *task) + { + printf("thread_run: %d\n", task->type); + + double t1 = omp_get_wtime(); + + if (task->type == DeviceTask::PATH_TRACE) + thread_path_trace(*task); + else if 
(task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); + else if (task->type == DeviceTask::SHADER) + thread_shader(*task); + + double t2 = omp_get_wtime(); + printf("DEVICE: OMP, %f\n", t2 - t1); + } + + class OMPDeviceTask : public DeviceTask + { + public: + + OMPDeviceTask(OMPDevice *device, DeviceTask& task) + : DeviceTask(task) + { + run = function_bind(&OMPDevice::thread_run, device, this); + } + }; + + // void path_trace(size_t kg_data_size, char *buffer, char *rng_state, bool progressive, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile) + // { + // ///////////////////////////share nodes//////////////////////////////////// + // + // size_t offsetSample = 0; + // size_t sizeSample = sizeof (int); + // + // int reqFinished = 0; + // + //#ifdef WITH_IT4I_MIC_OFFLOAD + // for (int dev = 0; dev < info.multi_devices.size(); dev++) + // { + // mic_mem_alloc(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int)); + // } + //#endif + // + // int end_sample = start_sample + num_samples; + // + // int pass_stride = omp_get_pass_stride(kernel_globals_cpu); + // + // ////////////////////////////one node/////////////////////////////////// + // omp_set_nested(1); + // int nprocs_cpu = omp_get_max_threads() - 1; + // //printf("nprocs_cpu: %d\n", nprocs_cpu); + // + // int tile_y_node = tile_y; + // int tile_h_node = tile_h; + // + // int size_node = tile_h_node * tile_w; + // + // size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + // size_t sizeBuf_node = size_node * pass_stride * sizeof (float); + // + // size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4); + // size_t sizeByte_node = size_node * sizeof (uchar4); + // + //#ifdef WITH_IT4I_MIC_OFFLOAD + // int devices_size_cpu_mics = info.multi_devices.size() + 2; + // + // int tile_step_cpu_mics = tile_h_node / 
devices_size_cpu_mics; + // int dev_cpu_mics = 0; + // + // //////////////////////////mic0//////////////////////////////////// + // vector<int> sample_finished_mic0(info.multi_devices.size()); + // vector<int> tile_y_mic0(info.multi_devices.size()); + // vector<int> tile_h_mic0(info.multi_devices.size()); + // vector<int> size_mic0(info.multi_devices.size()); + // vector<size_t> offsetBuf_mic0(info.multi_devices.size()); + // vector<size_t> sizeBuf_mic0(info.multi_devices.size()); + // vector<size_t> offsetByte_mic0(info.multi_devices.size()); + // vector<size_t> sizeByte_mic0(info.multi_devices.size()); + // + // //sync + // for (int dev = 0; dev < info.multi_devices.size(); dev++) + // { + // sample_finished_mic0[dev] = 0; + // + // mic_mem_alloc(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], sizeof (int)); + // } + // + // //async + // for (int dev = 0; dev < info.multi_devices.size(); dev++) + // { + // dev_cpu_mics = dev; + // + // //sample_finished_mic0[dev] = 0; + // //mic_mem_alloc(info.multi_devices[dev].num, (char*)&sample_finished_mic0[dev], sizeof(int)); + // + // tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev_cpu_mics; + // tile_h_mic0[dev] = tile_step_cpu_mics; + // + // size_mic0[dev] = tile_h_mic0[dev] * tile_w; + // + // offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float); + // sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float); + // + // offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * sizeof (uchar4); + // sizeByte_mic0[dev] = size_mic0[dev] * sizeof (uchar4); + // + // mic_path_trace(info.multi_devices[dev].num, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, 240, (char *) rng_state); + // } + // dev_cpu_mics = 
info.multi_devices.size(); + // + // //////////////////////////cpu///////////////////////////////////// + // int tile_y_cpu = tile_y_node + tile_step_cpu_mics * dev_cpu_mics; + // int tile_h_cpu = tile_h_node - (devices_size_cpu_mics - 2) * tile_step_cpu_mics; + //#else + // //////////////////////////cpu///////////////////////////////////// + // + // int tile_y_cpu = tile_y_node; + // int tile_h_cpu = tile_h_node; + //#endif + // + // int sample_finished_cpu = 0; + // + // // int size_cpu = tile_h_cpu * tile_w; + // // + // // size_t offsetBuf_cpu = (offset + tile_x + tile_y_cpu * stride) * pass_stride * sizeof (float); + // // size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float); + // // + // // size_t offsetByte_cpu = (offset + tile_x + tile_y_cpu * stride) * sizeof (uchar4); + // // size_t sizeByte_cpu = size_cpu * sizeof (uchar4); + // ////////////////////////////////////////////////////////////////// + // + //#pragma omp parallel num_threads(2) + // { + //#pragma omp single nowait + // { + //#pragma omp task + // { + // omp_path_trace(info.num, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL); + // } + // + //#pragma omp task + // { + // while (true) + // { + // int sample_finished = sample_finished_cpu; + // + //#ifdef WITH_IT4I_MIC_OFFLOAD + // for (int dev = 0; dev < info.multi_devices.size(); dev++) + // { + // mic_mem_copy_to(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int), (char*) &reqFinished); + // mic_mem_copy_from(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], 0, sizeof (int), (char*) &sample_finished_mic0[dev]); + // + // if (rgba_pixels != NULL) + // { + // mic_mem_copy_from(info.multi_devices[dev].num, (char*) rgba_pixels, offsetByte_mic0[dev], sizeByte_mic0[dev], (char*) rgba_pixels); + // } + // else + // { + 
// mic_mem_copy_from(info.multi_devices[dev].num, (char*) buffer, offsetBuf_mic0[dev], sizeBuf_mic0[dev], (char*) buffer); + // } + // + // mic_wait(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev]); + // + // if (sample_finished_mic0[dev] < sample_finished) + // sample_finished = sample_finished_mic0[dev]; + // } + // + //#endif + // + // if (sample_finished > 0 && tile.sample != sample_finished) + // { + // tile.sample = sample_finished; + // task.update_progress(&tile); + // } + // + // if (reqFinished != 0) + // { + // break; + // } + // + // if (sample_finished == end_sample) + // { + // reqFinished = 1; + // } + // + // if (task_pool.canceled()) + // { + // if (task.need_finish_queue == false) + // reqFinished = 1; + // } + // } + // } + // } + // + //#pragma omp taskwait + // } + // + //#ifdef WITH_IT4I_MIC_OFFLOAD + // for (int dev = 0; dev < info.multi_devices.size(); dev++) + // { + // //mic_wait(info.multi_devices[dev].num, (char*)&reqFinished); + // mic_mem_free(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], sizeof (int)); + // mic_mem_free(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int)); + // } + //#endif + // } + + virtual int get_tile_id() + { + return tile_id; + }; + + virtual int get_num_tiles() + { + return num_tiles; + }; + + void path_trace_progressive(size_t kg_data_size, char *buffer, char *rng_state, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile) + { + size_t offsetSample = 0; + size_t sizeSample = sizeof (int); + + int reqFinished = 0; + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int)); + } +#endif + + int end_sample = start_sample + num_samples; + int pass_stride = omp_get_pass_stride(kernel_globals_cpu); + + ////////////////////////////one node/////////////////////////////////// + 
//omp_set_nested(1); + int nprocs_mic = 240; + int nprocs_cpu = omp_get_max_threads() - 1; + + if (getenv("IT4I_OMP_CPU_NUM_THREADS")) + { + nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")); + printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu); + } + + if (getenv("IT4I_OMP_MIC_NUM_THREADS")) + { + nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS")); + printf("IT4I_OMP_MIC_NUM_THREADS: %d\n", nprocs_mic); + } + + int dev_node = 0; + int devices_size_node = 1; + + int tile_step_node = tile_h / devices_size_node; + int tile_last_node = tile_h - (devices_size_node - 1) * tile_step_node; + + int tile_y_node = tile_y + tile_step_node * dev_node; + int tile_h_node = (devices_size_node - 1 == dev_node) ? tile_last_node : tile_step_node; + + int size_node = tile_h_node * tile_w; + + size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + size_t sizeBuf_node = size_node * pass_stride * sizeof (float); + + size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; + size_t sizeByte_node = size_node * SIZEOF_UCHAR4; + + int devices_size_cpu_mics = kernel_globals_mics.size() + 2; + + int tile_step_cpu_mics = tile_h_node / devices_size_cpu_mics; + //int tile_last_cpu_mics = tile_h_node - (devices_size_cpu_mics - 1) * tile_step_cpu_mics; + + int dev_cpu_mics = 0; + + int signal1, signal2, signal3, signal4; + + //////////////////////////mic0//////////////////////////////////// +#ifdef WITH_IT4I_MIC_OFFLOAD + std::vector<int> sample_finished_mic0(kernel_globals_mics.size()); + std::vector<int> tile_y_mic0(kernel_globals_mics.size()); + std::vector<int> tile_h_mic0(kernel_globals_mics.size()); + std::vector<int> size_mic0(kernel_globals_mics.size()); + std::vector<size_t> offsetBuf_mic0(kernel_globals_mics.size()); + std::vector<size_t> sizeBuf_mic0(kernel_globals_mics.size()); + std::vector<size_t> offsetByte_mic0(kernel_globals_mics.size()); + std::vector<size_t> sizeByte_mic0(kernel_globals_mics.size()); 
+ + //sync + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + sample_finished_mic0[dev] = 0; + + mic_mem_alloc(dev, (char*) &sample_finished_mic0[dev], sizeof (int)); + } + + //async + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + dev_cpu_mics = dev; + + //sample_finished_mic0[dev] = 0; + //mic_mem_alloc(dev, (char*)&sample_finished_mic0[dev], sizeof(int)); + + tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev_cpu_mics; + tile_h_mic0[dev] = tile_step_cpu_mics; + + size_mic0[dev] = tile_h_mic0[dev] * tile_w; + + offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float); + sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float); + + offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * SIZEOF_UCHAR4; + sizeByte_mic0[dev] = size_mic0[dev] * SIZEOF_UCHAR4; + + if (dev == 0) + mic_path_trace(dev, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, signal1); + + if (dev == 1) + mic_path_trace(dev, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, signal2); + } +#endif + //////////////////////////cpu///////////////////////////////////// + + dev_cpu_mics = kernel_globals_mics.size(); + + int sample_finished_cpu = 0; + + int tile_y_cpu = tile_y_node + tile_step_cpu_mics * dev_cpu_mics; + int tile_h_cpu = tile_h_node - (devices_size_cpu_mics - 2) * tile_step_cpu_mics; + + int size_cpu = tile_h_cpu * tile_w; + + size_t offsetBuf_cpu = (offset + tile_x + tile_y_cpu * stride) * pass_stride * sizeof (float); + size_t sizeBuf_cpu = size_cpu * 
pass_stride * sizeof (float); + + size_t offsetByte_cpu = (offset + tile_x + tile_y_cpu * stride) * SIZEOF_UCHAR4; + size_t sizeByte_cpu = size_cpu * SIZEOF_UCHAR4; + ////////////////////////////////////////////////////////////////// + + omp_path_trace(-1, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL); + + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + if (dev == 0) + mic_wait(dev, signal1); + + if (dev == 1) + mic_wait(dev, signal2); + // while(true) + // { + //#pragma omp flush + // if (sample_finished_mic0[dev] == end_sample) + // break; + // } + + if (rgba_pixels != NULL) + { + mic_mem_copy_from(dev, (char*) rgba_pixels, offsetByte_mic0[dev], sizeByte_mic0[dev], NULL); + } + else + { + mic_mem_copy_from(dev, (char*) buffer, offsetBuf_mic0[dev], sizeBuf_mic0[dev], NULL); + } + } +#endif + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + //mic_wait(dev, (char*)&reqFinished); + mic_mem_free(dev, (char*) &sample_finished_mic0[dev], sizeof (int)); + mic_mem_free(dev, (char*) &reqFinished, sizeof (int)); + } +#endif + + tile_id = tile_h; + task.update_progress(&tile); + } + + void path_trace_offline(size_t kg_data_size, char *buffer, char *rng_state, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile) + { + size_t offsetSample = 0; + size_t sizeSample = sizeof (int); + + int reqFinished = 0; + +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < kernel_globals_mics.size(); dev++) + { + mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int)); + } +#endif + + int end_sample = start_sample + num_samples; + int pass_stride = omp_get_pass_stride(kernel_globals_cpu); + + 
////////////////////////////one node/////////////////////////////////// + omp_set_nested(1); + + int tile_step_node = 1; + + if (getenv("IT4I_OMP_TILE_STEP")) + { + tile_step_node = atoi(getenv("IT4I_OMP_TILE_STEP")); + printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node); + } + + int tile_h_node = tile_step_node; + int omp_path_trace_req = 0; + + int size_node = tile_h_node * tile_w; + + //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + size_t sizeBuf_node = size_node * pass_stride * sizeof (float); + + //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; + size_t sizeByte_node = size_node * SIZEOF_UCHAR4; + + //int sample_finished_node = 0; + + ////////////////////////////MICS////////////////////////////////////// + int signal1, signal2, signal3, signal4; + + const int num_devices_cpu_mics = kernel_globals_mics.size() + 1; + //const int num_devices_mics = mpiData->kernel_globals_mics.size(); + + std::vector<int> sample_finished_devices(num_devices_cpu_mics); + + if (kernel_globals_mics.size() == 0) + { + int nprocs_cpu = omp_get_max_threads(); + + if (getenv("IT4I_OMP_CPU_NUM_THREADS")) + { + nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")); + printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu); + } + + //omp_path_trace(-1, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_cpu, NULL); + int size = tile_h*tile_w; + num_tiles = size; + sample_finished_devices[0] = start_sample; + +#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1) + for (int i = 0; i < size; i++) + { + int y = i / tile_w; + int x = i - y * tile_w; + + for (int sample = start_sample; sample < end_sample; sample++) + { + omp_kernel_path_trace(kernel_globals_cpu, (float *) buffer, (unsigned int*) 
rng_state, sample, x + tile_x, y + tile_y, offset, stride); + + if (rgba_pixels != NULL) + { + float sample_scale = 1.0f / (sample + 1.0f); + + if (tile.half_float) + omp_convert_to_half_float(kernel_globals_cpu, (char*) rgba_pixels, (float *) buffer, sample_scale, x + tile_x, y + tile_y, offset, stride); + else + omp_film_convert_byte(kernel_globals_cpu, (char*) rgba_pixels, (float *) buffer, sample_scale, x + tile_x, y + tile_y, offset, stride); + } + } + + int tid = omp_get_thread_num(); + if (tid == 0) + { + tile_id = i; + task.update_progress(&tile); + } + } + + sample_finished_devices[0] = end_sample; + + tile_id = size; + } + else + { + int nprocs_mic = 240; + int nprocs_cpu = omp_get_max_threads() - 1; + + if (getenv("IT4I_OMP_CPU_NUM_THREADS")) + { + nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")) - 1; + printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu); + } + + if (getenv("IT4I_OMP_MIC_NUM_THREADS")) + { + nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS")); + printf("IT4I_OMP_MIC_NUM_THREADS: %d\n", nprocs_mic); + } + + std::vector<int> tile_y_devices(num_devices_cpu_mics); + int tile_y_node = tile_y; + + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { + //sample_finished_devices[dev] = 0; + sample_finished_devices[dev] = end_sample; + tile_y_devices[dev] = 0; + +#ifdef WITH_IT4I_MIC_OFFLOAD + if (dev > 0) + { + //sync + mic_mem_alloc(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int)); + } +#endif + } + ////////////////////////////////////////////////////////////////// + +#pragma omp parallel num_threads(2) + { +#pragma omp single nowait + { +#pragma omp task + { + while (reqFinished == 0) + { +#pragma omp flush + if (omp_path_trace_req != 0) + { + //omp_path_trace(info.num, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL); + 
printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", 0, sample_finished_devices[0], end_sample, tile_y_devices[0], tile_h); + omp_path_trace(-1, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_cpu, NULL); + omp_path_trace_req = 0; + } + usleep(100); + } + } + +#pragma omp task + { + while (true) + { + int min_count = end_sample; + + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { +#ifdef WITH_IT4I_MIC_OFFLOAD + if (dev > 0) + { + if (tile_y_devices[dev] != 0) + { + if (rgba_pixels != NULL) + { + size_t offsetByte_node = (offset + tile_x + tile_y_devices[dev] * stride) * SIZEOF_UCHAR4; + mic_mem_copy_from(dev - 1, (char*) rgba_pixels, offsetByte_node, sizeByte_node, NULL/*, (char*) &rgba_pixels*/); + } + else + { + size_t offsetBuf_node = (offset + tile_x + tile_y_devices[dev] * stride) * pass_stride * sizeof (float); + mic_mem_copy_from(dev - 1, (char*) buffer, offsetBuf_node, sizeBuf_node, NULL/*, (char*) &buffer*/); + } + } + } +#endif + +#pragma omp flush + if (min_count > sample_finished_devices[dev]) + min_count = sample_finished_devices[dev]; + + if (sample_finished_devices[dev] == end_sample && tile_y_node < tile_h) + { + sample_finished_devices[dev] = start_sample; + tile_y_devices[dev] = tile_y_node; + + tile_y_node += tile_step_node; + tile_id += tile_step_node; + + if (tile_y_node > tile_h) + { + tile_y_node = tile_h; + tile_id = tile_h; + } + + if (dev == 0) + { + omp_path_trace_req = 1; + } +#ifdef WITH_IT4I_MIC_OFFLOAD + else + { + printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h); + + if (dev == 1) + mic_path_trace(dev - 1, kernel_globals_mics[dev - 1], (char *) buffer, (char *) 
rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal1); + + if (dev == 2) + mic_path_trace(dev - 1, kernel_globals_mics[dev - 1], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal2); + } +#endif + } + } + + task.update_progress(&tile); + + + if (min_count == end_sample && tile_y_node >= tile_h) + { + reqFinished = 1; + } + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + reqFinished = 1; + } + + if (reqFinished != 0) + { + break; + } + + } + } + } + +#pragma omp taskwait + } + + printf("tasks finished\n"); +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { + if (dev > 0) + { + if (dev == 1) + mic_wait(dev - 1, signal1); + + if (dev == 2) + mic_wait(dev - 1, signal2); + + mic_mem_free(dev - 1, (char*) &reqFinished, sizeof (int)); + mic_mem_free(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int)); + } + } +#endif + + tile_id = tile_h; + } + + task.update_progress(&tile); + } + + void thread_path_trace(DeviceTask& task) + { + printf("thread_path_trace start\n"); + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + return; + } + + RenderTile tile; + + int tile_h = 0; + int tile_w = 0; + int num_samples_orig = 0; + + while (task.acquire_tile(this, tile)) + { + printf("task.acquire_tile\n"); + + int offset, stride; + tile.buffers->params.get_offset_stride(offset, stride); + + int tile_x = tile.buffers->params.full_x; + int tile_y = tile.buffers->params.full_y; + tile_h = tile.buffers->params.height; + tile_w = tile.buffers->params.width; + num_samples_orig = tile.num_samples_orig; + + tile.sample = tile.start_sample + 
tile.num_samples; + + if (!tile.progressive) + { + tile_x = tile.x; + tile_y = tile.y; + tile_h = tile.h; + tile_w = tile.w; + + offset = tile.offset; + stride = tile.stride; + } + + num_tiles = tile_h; + tile_id = 0; + + if (/*background ||*/ tile.progressive) + path_trace_progressive(omp_get_data_size(kernel_globals_cpu), (char*) tile.buffer, (char*) tile.rng_state, tile.start_sample, tile.num_samples, tile_x, tile_y, offset, stride, tile_h, tile_w, task, tile); + else + path_trace_offline(omp_get_data_size(kernel_globals_cpu), (char*) tile.buffer, (char*) tile.rng_state, tile.start_sample, tile.num_samples, tile_x, tile_y, offset, stride, tile_h, tile_w, task, tile); + + + //tile_id = tile_h; + task.release_tile(tile); + + if (task_pool.canceled()) + { + if (task.need_finish_queue == false) + break; + } + } + + printf("thread_path_trace finish\n"); + } + + void thread_film_convert(DeviceTask& task) + { + } + + void thread_shader(DeviceTask& task) + { + } + + int get_split_task_count(DeviceTask& task) + { + return 1; + } + + void task_add(DeviceTask& task) + { + task_pool.push(new OMPDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams ¶ms, float* buffer) + { + int pass_offset = 0; + + foreach(Pass& pass, params.passes) + { + if (pass.type != type) + { + pass_offset += pass.components; + continue; + } + + float *in = (float*) buffer + pass_offset; + int pass_stride = params.get_passes_size(); + + float scale = (pass.filter) ? 1.0f / (float) sample : 1.0f; + float scale_exposure = (pass.exposure) ? 
scale * exposure : scale; + + int size = params.width * params.height; + + if (components == 1) + { + assert(pass.components == components); + + /* scalar */ + if (type == PASS_DEPTH) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = (f == 0.0f) ? 1e10f : f*scale_exposure; + } + in += size*pass_stride; + pixels += size; + } + else if (type == PASS_MIST) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = saturate(f * scale_exposure); + } + in += size*pass_stride; + pixels += size; + } +#ifdef WITH_CYCLES_DEBUG + else if (type == PASS_BVH_TRAVERSAL_STEPS) + { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) + { + float f = *in; + pixels[0] = f; + } + } + else if (type == PASS_RAY_BOUNCES) + { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) + { + float f = *in; + pixels[0] = f; + } + } +#endif + else + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //in += pass_stride, pixels++ + float f = in[i * pass_stride]; + pixels[i] = f*scale_exposure; + } + in += size*pass_stride; + pixels += size; + } + } + else if (components == 3) + { + assert(pass.components == 4); + + /* RGBA */ + if (type == PASS_SHADOW) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 3 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; + + pixels[i * 3 + 0] = f.x*invw; + pixels[i * 3 + 1] = f.y*invw; + pixels[i * 3 + 2] = f.z*invw; + } + in += size*pass_stride; + pixels += size * 3; + + } + else if (pass.divide_type != PASS_NONE) + { + /* RGB lighting passes that need to divide out color */ + pass_offset = 0; + + foreach(Pass& color_pass, params.passes) + { + if (color_pass.type == pass.divide_type) + break; + pass_offset += color_pass.components; + } + + float *in_divide = (float*) buffer + pass_offset; + +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, in_divide += pass_stride, pixels += 3 + float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]); + float3 f_divide = make_float3(in_divide[i * pass_stride + 0], in_divide[i * pass_stride + 1], in_divide[i * pass_stride + 2]); + + f = safe_divide_even_color(f*exposure, f_divide); + + pixels[i * 3 + 0] = f.x; + pixels[i * 3 + 1] = f.y; + pixels[i * 3 + 2] = f.z; + } + + in += size*pass_stride; + in_divide += size*pass_stride; + pixels += size * 3; + } + else + { + /* RGB/vector */ +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 3 + float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]); + + pixels[i * 3 + 0] = f.x*scale_exposure; + pixels[i * 3 + 1] = f.y*scale_exposure; + pixels[i * 3 + 2] = f.z*scale_exposure; + } + + in += size*pass_stride; + pixels += size * 3; + } + } + else if (components == 4) + { + assert(pass.components == components); + + /* RGBA */ + if (type == PASS_SHADOW) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; + + pixels[i * 4 + 0] = f.x*invw; + pixels[i * 4 + 1] = f.y*invw; + pixels[i * 4 + 2] = f.z*invw; + pixels[i * 4 + 3] = 1.0f; + } + + in += size*pass_stride; + pixels += size * 4; + } + else if (type == PASS_MOTION) + { + /* need to normalize by number of samples accumulated for motion */ + pass_offset = 0; + + foreach(Pass& color_pass, params.passes) + { + if (color_pass.type == PASS_MOTION_WEIGHT) + break; + pass_offset += color_pass.components; + } + + float *in_weight = (float*) buffer + pass_offset; + +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, in_weight += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + float w = in_weight[i * pass_stride + 0]; + float invw = (w > 0.0f) ? 1.0f / w : 0.0f; + + pixels[i * 4 + 0] = f.x*invw; + pixels[i * 4 + 1] = f.y*invw; + pixels[i * 4 + 2] = f.z*invw; + pixels[i * 4 + 3] = f.w*invw; + } + + in += size*pass_stride; + in_weight += size*pass_stride; + pixels += size * 4; + } + else + { +#pragma omp parallel for + for (int i = 0; i < size; i++) + { + //, in += pass_stride, pixels += 4 + float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]); + + pixels[i * 4 + 0] = f.x*scale_exposure; + pixels[i * 4 + 1] = f.y*scale_exposure; + pixels[i * 4 + 2] = f.z*scale_exposure; + + /* clamp since alpha might be > 1.0 due to russian roulette */ + //pixels[i * 4 + 3] = saturate(f.w*scale); + pixels[i * 4 + 3] = saturate(f.w); + } + + in += size*pass_stride; + pixels += size * 4; + } + } + + return true; + } + return false; + } + +}; + +Device *device_omp_create(DeviceInfo& info, Stats &stats, bool background) +{ + return new OMPDevice(info, stats, background); +} + +#ifdef WITH_IT4I_MIC_OFFLOAD + +string micFindDevices() +{ + // return ""; + +#if !defined(_WIN32) && !defined(__APPLE__) + FILE *handle = 
popen("micinfo -group Versions | grep 'Device Name'", "r"); + if (handle) + { + char buffer[4096] = {0}; + int len = fread(buffer, 1, sizeof (buffer) - 1, handle); + buffer[len] = '\0'; + pclose(handle); + + if (buffer[0]) + return string(buffer); + } +#endif + +#if defined(_WIN32) + return "fakeMIC"; +#else + return ""; +#endif +} + +#endif + +bool device_omp_init(void) +{ + return true; +} + +void device_omp_info(vector<DeviceInfo>& devices) +{ + DeviceInfo infoCPU; + + infoCPU.type = DEVICE_OMP; + infoCPU.description = "CPU"; + infoCPU.id = "OMP_CPU"; + infoCPU.num = -1; + infoCPU.advanced_shading = true; + infoCPU.pack_images = false; + + devices.insert(devices.begin(), infoCPU); + +#ifdef WITH_IT4I_MIC_OFFLOAD + string mics = micFindDevices(); + if (!mics.empty()) + { + DeviceInfo infoCPU_MICS; + + infoCPU_MICS.type = DEVICE_OMP; + infoCPU_MICS.description = "CPU"; + infoCPU_MICS.id = "OMP_CPU_MICS"; + infoCPU_MICS.num = -1; + infoCPU_MICS.advanced_shading = true; + infoCPU_MICS.pack_images = false; + + DeviceInfo infoCPU_MIC0; + + infoCPU_MIC0.type = DEVICE_OMP; + infoCPU_MIC0.description = "CPU"; + infoCPU_MIC0.id = "OMP_CPU_MIC0"; + infoCPU_MIC0.num = -1; + infoCPU_MIC0.advanced_shading = true; + infoCPU_MIC0.pack_images = false; + + int num = 0; + std::vector<std::string> strDevices; + boost::split(strDevices, mics, boost::is_any_of("\n")); + + int count_dev = 0; + + foreach(string strDevice, strDevices) + { + std::vector<std::string> strNames; + boost::split(strNames, strDevice, boost::is_any_of(":")); + + if (strNames.size() == 0) + continue; + + string name = strNames[strNames.size() - 1]; + boost::trim(name); + + if (!name.empty()) + { + DeviceInfo info; + + info.type = DEVICE_OMP; + info.description = strDevice; + info.id = string_printf("OMP_MIC_%d", num); + info.num = num++; + info.advanced_shading = true; + info.pack_images = false; + + infoCPU_MICS.description = string_printf("%s+%s", infoCPU_MICS.description.c_str(), info.description.c_str()); + 
infoCPU_MICS.multi_devices.push_back(info); + + if (count_dev == 0) + { + infoCPU_MIC0.description = string_printf("%s+%s", infoCPU_MIC0.description.c_str(), info.description.c_str()); + infoCPU_MIC0.multi_devices.push_back(info); + } + + count_dev++; + } + } + + devices.insert(devices.begin(), infoCPU_MIC0); + devices.insert(devices.begin(), infoCPU_MICS); + } + + +#endif +} + +string device_omp_capabilities(void) +{ + return ""; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 1b4e5421b5ae7d0f9fc808c9628a2ff8f4fec5d8..4aa0c5de4ec6b5a15d810dcd18f17713b447edf1 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -39,6 +39,8 @@ CCL_NAMESPACE_BEGIN +//#define VLOG(a) std::cout + #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) /* Macro declarations used with split kernel */ @@ -118,8 +120,8 @@ bool opencl_kernel_use_advanced_shading(const string& platform) return true; else if(platform == "AMD Accelerated Parallel Processing") return true; - else if(platform == "Intel(R) OpenCL") - return true; + //else if(platform == "Intel(R) OpenCL") + // return true; /* Make sure officially unsupported OpenCL platforms * does not set up to use advanced shading. 
*/ @@ -165,6 +167,10 @@ bool opencl_device_supported(const string& platform_name, if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { return true; } + if(platform_name == "Intel(R) OpenCL") { + return true; + } + return false; } @@ -1056,7 +1062,7 @@ public: ConstMemMap::iterator mt; for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - mem_free(*(mt->second)); + mem_free("second", *(mt->second)); delete mt->second; } @@ -1076,7 +1082,7 @@ public: clReleaseContext(cxContext); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { size_t size = mem.memory_size(); @@ -1111,7 +1117,7 @@ public: mem.device_size = size; } - void mem_copy_to(device_memory& mem) + void mem_copy_to(const char *name, device_memory& mem) { /* this is blocking */ size_t size = mem.memory_size(); @@ -1127,7 +1133,7 @@ public: } } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) + void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) { size_t offset = elem*y*w; size_t size = elem*w*h; @@ -1142,15 +1148,15 @@ public: NULL, NULL)); } - void mem_zero(device_memory& mem) + void mem_zero(const char *name, device_memory& mem) { if(mem.device_pointer) { memset((void*)mem.data_pointer, 0, mem.memory_size()); - mem_copy_to(mem); + mem_copy_to(name, mem); } } - void mem_free(device_memory& mem) + void mem_free(const char *name, device_memory& mem) { if(mem.device_pointer) { if(mem.device_pointer != null_mem) { @@ -1171,7 +1177,7 @@ public: device_vector<uchar> *data = new device_vector<uchar>(); data->copy((uchar*)host, size); - mem_alloc(*data, MEM_READ_ONLY); + mem_alloc(name, *data, MEM_READ_ONLY); i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first; } else { @@ -1179,7 +1185,7 @@ public: data->copy((uchar*)host, size); } - mem_copy_to(*i->second); + mem_copy_to(name, *i->second); } void tex_alloc(const char *name, @@ 
-1188,13 +1194,13 @@ public: ExtensionType /*extension*/) { VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + mem_alloc(name, mem, MEM_READ_ONLY); + mem_copy_to(name, mem); assert(mem_map.find(name) == mem_map.end()); mem_map.insert(MemMap::value_type(name, mem.device_pointer)); } - void tex_free(device_memory& mem) + void tex_free(const char *name, device_memory& mem) { if(mem.device_pointer) { foreach(const MemMap::value_type& value, mem_map) { @@ -1204,7 +1210,7 @@ public: } } - mem_free(mem); + mem_free(name, mem); } } diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 1f1128a28f858fdd23326de3ba459c6350328a6e..82d12175c9e686271b85abd43fda9caf44a004ed 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -21,6 +21,7 @@ #include "util_algorithm.h" #include "util_time.h" +#include "buffers.h" CCL_NAMESPACE_BEGIN @@ -106,7 +107,7 @@ void DeviceTask::update_progress(RenderTile *rtile) return; if(update_progress_sample) - update_progress_sample(); + update_progress_sample(rtile ? rtile->sample : 0); /* rtile is a pointer that is not NULL-checked before this call; report sample 0 rather than dereferencing NULL. NOTE(review): presumably tile-less callers (update_progress(NULL)) exist - confirm against the device implementations */ if(update_tile_sample) { double current_time = time_dt(); diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index d7912f386f5c165d0badfe9d4fcbb8b7a1d67307..242635450a975b631e0f95146eecfa825ec50484 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -59,7 +59,7 @@ public: void update_progress(RenderTile *rtile); function<bool(Device *device, RenderTile&)> acquire_tile; - function<void(void)> update_progress_sample; + function<void(int)> update_progress_sample; function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index
3c17429fd092b59fb72ded4cbead600ab284df73..d505e3da582fdcc010a350e6aff67a57c155001b 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -271,6 +271,21 @@ if(WITH_CYCLES_OSL) add_subdirectory(shaders) endif() +# MPI module +if(WITH_IT4I_MPI) + add_subdirectory(kernels/mpi) +endif() + +# OMP module +if(WITH_OPENMP) + add_subdirectory(kernels/omp) +endif() + +# MIC module +if(WITH_IT4I_MIC_OFFLOAD) + add_subdirectory(kernels/mic) +endif() + # CPU module include_directories(${INC}) diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 49f6122f3f4e70980a9d56b5d16e38ff87581fa4..7e390d3852251a8e83c706b8c175620161e37e39 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -43,6 +43,7 @@ typedef struct KernelGlobals { #include "kernel_textures.h" KernelData __data; + size_t __data_size; #ifdef __OSL__ /* On the CPU, we also have the OSL globals here. Most data structures are shared diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index bdd17c66c0f1fef54a5a9fde8fab48f7980bef32..88fad12cc241ff4115bcbd376a8aecdade1a3571 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -130,9 +130,16 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ -# define __KERNEL_ADV_SHADING__ +# define __MULTI_CLOSURE__ +# define __PASSES__ +# define __BACKGROUND_MIS__ +# define __LAMP_MIS__ +# define __AO__ +# define __CAMERA_MOTION__ +# define __OBJECT_MOTION__ +# define __HAIR__ # ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ +# define __TRANSPARENT_SHADOWS__ # endif #endif diff --git a/intern/cycles/kernel/kernels/mic/CMakeLists.txt b/intern/cycles/kernel/kernels/mic/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5076784175293e79da50abaf3e91ce3c788e3ef3 --- /dev/null +++ 
b/intern/cycles/kernel/kernels/mic/CMakeLists.txt @@ -0,0 +1,32 @@ + +set(INC + . + ../../../kernel + ../../../util + ../../../kernel/osl + ../../../../../it4i/client/api +) + +set(SRC + kernel_mic.cpp +) + +set(SRC_HEADERS + kernel_compat_mic.h + kernel_mic.h +) + +if (WITH_IT4I_MIC_NATIVE) + add_definitions(-DWITH_IT4I_MIC_NATIVE) +endif() + +if (WITH_IT4I_MIC_OFFLOAD) + add_definitions(-DWITH_IT4I_MIC_OFFLOAD) + + # -ip -fp-model fast=2 + set_source_files_properties(kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-qopenmp -qoffload-attribute-target=mic") + #set_source_files_properties(kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-openmp -offload=none") +endif() + +include_directories(${INC}) +add_library(cycles_kernel_mic ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h b/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h new file mode 100644 index 0000000000000000000000000000000000000000..8b1e200cdf744c766ba132dc62b0c49621116c99 --- /dev/null +++ b/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h @@ -0,0 +1,514 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_COMPAT_MIC_H__ +#define __KERNEL_COMPAT_MIC_H__ + +#define __KERNEL_CPU__ +#define __KERNEL_MIC__ + +/* Release kernel has too much false-positive maybe-uninitialized warnings, + * which makes it possible to miss actual warnings. 
+ */ +#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG) +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +# pragma GCC diagnostic ignored "-Wuninitialized" +#endif + +/* Selective nodes compilation. */ +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + +//#include "util_debug.h" +#include "util_math.h" +//#include "util_simd.h" +#include "util_half.h" +#include "util_types.h" + +#define ccl_addr_space + +/* On x86_64, versions of glibc < 2.16 have an issue where expf is + * much slower than the double version. This was fixed in glibc 2.16. + */ +#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && defined(__x86_64__) && \ + defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \ + (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16) +# define expf(x) ((float)exp((double)(x))) +#endif + +CCL_NAMESPACE_BEGIN + +/* Assertions inside the kernel only work for the CPU device, so we wrap it in + * a macro which is empty for other devices */ + +#define kernel_assert(cond) //assert(cond) + +/* Texture types to be compatible with CUDA textures. These are really just + * simple arrays and after inlining fetch hopefully revert to being a simple + * pointer lookup. 
*/ + +template<typename T> struct texture { + ccl_always_inline T fetch(int index) + { + kernel_assert(index >= 0 && index < width); + return data[index]; + } + +#ifdef __KERNEL_SSE2__ + ccl_always_inline ssef fetch_ssef(int index) + { + kernel_assert(index >= 0 && index < width); + return ((ssef*)data)[index]; + } + + ccl_always_inline ssei fetch_ssei(int index) + { + kernel_assert(index >= 0 && index < width); + return ((ssei*)data)[index]; + } +#endif + + T *data; + int width; +}; + +template<typename T> struct texture_image { +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 + + ccl_always_inline float4 read(float4 r) + { + return r; + } + + ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + + ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if(x < 0) + x += width; + return x; + } + + ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width-1); + } + + ccl_always_inline float frac(float x, int *ix) + { + int i = float_to_int(x) - ((x < 0.0f)? 1: 0); + *ix = i; + return x - (float)i; + } + + ccl_always_inline float4 interp(float x, float y) + { + if(UNLIKELY(!data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + int ix, iy, nix, niy; + + if(interpolation == INTERPOLATION_CLOSEST) { + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. 
*/ + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + } + return read(data[ix + iy*width]); + } + else if(interpolation == INTERPOLATION_LINEAR) { + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. */ + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + } + + float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); + r += (1.0f - ty)*tx*read(data[nix + iy*width]); + r += ty*(1.0f - tx)*read(data[ix + niy*width]); + r += ty*tx*read(data[nix + niy*width]); + + return r; + } + else { + /* Bicubic b-spline interpolation. */ + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. 
*/ + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data[xc[x] + yc[y]])) +#define TERM(col) \ + (v[col] * (u[0] * DATA(0, col) + \ + u[1] * DATA(1, col) + \ + u[2] * DATA(2, col) + \ + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. */ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); + +#undef TERM +#undef DATA + } + } + + ccl_always_inline float4 interp_3d(float x, float y, float z) + { + return interp_3d_ex(x, y, z, interpolation); + } + + ccl_always_inline float4 interp_3d_ex(float x, float y, float z, + int interpolation = INTERPOLATION_LINEAR) + { + if(UNLIKELY(!data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + int ix, iy, iz, nix, niy, niz; + + if(interpolation == INTERPOLATION_CLOSEST) { + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + frac(z*(float)depth, &iz); + + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. 
*/ + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + } + + return read(data[ix + iy*width + iz*width*height]); + } + else if(interpolation == INTERPOLATION_LINEAR) { + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + float tz = frac(z*(float)depth - 0.5f, &iz); + + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. */ + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + } + + float4 r; + + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); + r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); + r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); + r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); + + r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); + r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); + r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); + r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); + + return r; + } + else { + /* Tricubic b-spline interpolation. 
*/ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch(extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + /* Fall through. */ + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. 
+ */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + } + + ccl_always_inline void dimensions_set(int width_, int height_, int depth_) + { + width = width_; + height = height_; + depth = depth_; + } + + T *data; + int interpolation; + ExtensionType extension; + int width, height, depth; +#undef SET_CUBIC_SPLINE_WEIGHTS +}; + +typedef texture<float4> texture_float4; +typedef texture<float2> texture_float2; +typedef texture<float> texture_float; +typedef texture<uint> texture_uint; +typedef texture<int> texture_int; +typedef texture<uint4> texture_uint4; +typedef texture<uchar4> texture_uchar4; +typedef texture_image<float4> texture_image_float4; +typedef texture_image<uchar4> texture_image_uchar4; + +/* Macros to handle different memory storage on different devices */ + +#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) +#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) +#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) +#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) +#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y)) +#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? 
kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z)) +#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation)) + +#define kernel_data (kg->__data) + +#ifdef __KERNEL_SSE2__ +typedef vector3<sseb> sse3b; +typedef vector3<ssef> sse3f; +typedef vector3<ssei> sse3i; + +ccl_device_inline void print_sse3b(const char *label, sse3b& a) +{ + print_sseb(label, a.x); + print_sseb(label, a.y); + print_sseb(label, a.z); +} + +ccl_device_inline void print_sse3f(const char *label, sse3f& a) +{ + print_ssef(label, a.x); + print_ssef(label, a.y); + print_ssef(label, a.z); +} + +ccl_device_inline void print_sse3i(const char *label, sse3i& a) +{ + print_ssei(label, a.x); + print_ssei(label, a.y); + print_ssei(label, a.z); +} + +#endif + +CCL_NAMESPACE_END + +#endif /* __KERNEL_COMPAT_MIC_H__ */ + diff --git a/intern/cycles/kernel/kernels/mic/kernel_mic.cpp b/intern/cycles/kernel/kernels/mic/kernel_mic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7261df09c98a94f4ac8e1299c19009a1b31ebfd7 --- /dev/null +++ b/intern/cycles/kernel/kernels/mic/kernel_mic.cpp @@ -0,0 +1,516 @@ +#include "kernel_mic.h" + +#include "kernel_compat_mic.h" +#include "kernel.h" + +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_path_branched.h" +#include "kernel_bake.h" + +#include <omp.h> + +//#define NUM_THREADS 240 +#define SIZE_T long + +#define ALLOC alloc_if(1) free_if(0) +#define FREE alloc_if(0) free_if(1) +#define REUSE alloc_if(0) free_if(0) + +#define ONE_USE //alloc_if(1) free_if(1) + +CCL_NAMESPACE_BEGIN + +void cwassert(const char * _Message, const char *_File, unsigned _Line) +{ + printf("ASSERT: %s, %s, %d\n", 
_Message, _File, _Line); +} + +/* Memory Copy */ +void mic_const_copy_internal(DEVICE_PTR kg_bin, char *host_bin, size_t size) +{ + KernelGlobals *kg = (KernelGlobals *) kg_bin; + memcpy(&kg->__data, host_bin, size); + kg->__data_size = size; +} + +void mic_const_copy(int numDevice, DEVICE_PTR kg_bin, const char *name, char *host_bin, size_t size) +{ + if (strcmp(name, "__data") == 0) + { + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) in(host_bin:length(size) ONE_USE) in(kg_bin) in(size) + { + mic_const_copy_internal(kg_bin, host_bin, size); + } + +#endif + } + else + { + mic_const_copy_internal(kg_bin, host_bin, size); + } + } + // else + // assert(0); +} + +void mic_tex_copy_internal(DEVICE_PTR kg_bin, + const char *name, + char* mem, + size_t size, + size_t width, + size_t height, + size_t depth, + int interpolation, + int extension) +{ + KernelGlobals *kg = (KernelGlobals *) kg_bin; + + if (0) + { + } +#define KERNEL_TEX(type, ttype, tname) \ + else if(strcmp(name, #tname) == 0) { \ + kg->tname.data = (type*)mem; \ + kg->tname.width = width; \ + } +#define KERNEL_IMAGE_TEX(type, ttype, tname) +#include "kernel_textures.h" + + else if (strstr(name, "__tex_image_float")) + { + texture_image_float4 *tex = NULL; + int id = atoi(name + strlen("__tex_image_float_")); + int array_index = id; + + if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) + { + tex = &kg->texture_float_images[array_index]; + } + + if (tex) + { + tex->data = (float4*) mem; + tex->dimensions_set(width, height, depth); + tex->interpolation = interpolation; + tex->extension = (ExtensionType) extension; + } + } + else if (strstr(name, "__tex_image")) + { + texture_image_uchar4 *tex = NULL; + int id = atoi(name + strlen("__tex_image_")); + int array_index = id - MAX_FLOAT_IMAGES; + + if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) + { + tex = &kg->texture_byte_images[array_index]; + } + + if (tex) + { + tex->data = (uchar4*) mem; + 
tex->dimensions_set(width, height, depth); + tex->interpolation = interpolation; + tex->extension = (ExtensionType) extension; + } + } + +} + +void mic_tex_copy(int numDevice, DEVICE_PTR kg_bin, + const char *name_bin, + char* mem, + size_t size, + size_t width, + size_t height, + size_t depth, + int interpolation, + int extension) +{ + if (name_bin == NULL || mem == NULL) + return; + + size_t nameSize = sizeof (char) * (strlen(name_bin) + 1); + char *name = (char *) name_bin; + + //printf("mic_tex_copy_internal: %d: %s, %d\n", numDevice, name, size); + + if (numDevice != -1) + { + +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) \ + in(mem:length(size) ONE_USE) \ + in(name:length(nameSize) ONE_USE) \ + in(kg_bin) in(size) in(width) in(height) in(depth) in(interpolation) in(extension) + { + char* mem2 = new char[size]; + memcpy(mem2, mem, size); + mic_tex_copy_internal(kg_bin, name, mem2, size, width, height, depth, interpolation, extension); + } +#endif + } + else + { + mic_tex_copy_internal(kg_bin, name, mem, size, width, height, depth, interpolation, extension); + } + + //printf("mic_tex_copy: %s\n", name); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void mic_wait(int numDevice, int signal_value) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload_wait target(mic:numDevice) wait(signal_value) +#endif + } +} + +void mic_film_convert_byte(KernelGlobals *kg, + uchar4 *rgba_byte, float *buffer, + float sample_scale, int x, int y, int offset, int stride) +{ + /* buffer offset */ + int index = offset + x + y*stride; + + rgba_byte += index; + //rgba_float += index; + buffer += index * kernel_data.film.pass_stride; + + /* map colors */ + float4 irradiance = *((ccl_global float4*) buffer); + float4 float_result = film_map(kg, irradiance, sample_scale); + uchar4 byte_result = film_float_to_byte(float_result); + + 
*rgba_byte = byte_result; +} + +void mic_convert_to_half_float(KernelGlobals *kg, + uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride) +{ + /* buffer offset */ + int index = offset + x + y*stride; + + float4 *in = (float4*) (buffer + index * kernel_data.film.pass_stride); + half *out = (half*) rgba + index * 4; + + float exposure = kernel_data.film.exposure; + + float4 rgba_in = *in; + + if (exposure != 1.0f) + { + rgba_in.x *= exposure; + rgba_in.y *= exposure; + rgba_in.z *= exposure; + } + + float4_store_half(out, rgba_in, sample_scale); +} + +void mic_path_trace_internal(DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu) +{ + int size = tile_h*tile_w; + + int *sample_finished = (int*) sample_finished_mic; + int *reqFinished = (int*) reqFinished_mic; + + *sample_finished = start_sample; + +#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1) + for (int i = 0; i < size; i++) + { + // if (*reqFinished != 0) + // continue; + + int y = i / tile_w; + int x = i - y * tile_w; + + for (int sample = start_sample; sample < end_sample; sample++) + { + kernel_path_trace((KernelGlobals *) kg_bin, (float *) buffer_bin, (unsigned int*) rng_state_bin, sample, x + tile_x, y + tile_y, offset, stride); + + if (rgba_byte_bin != NULL) + { + float sample_scale = 1.0f / (sample + 1.0f); + + if (is_rgba_float) + mic_convert_to_half_float((KernelGlobals *) kg_bin, (uchar4*) rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride); + else + mic_film_convert_byte((KernelGlobals *) kg_bin, (uchar4*) rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride); + } + } + } + + *sample_finished = end_sample; + //printf("MIC: mic_path_trace_internal 
finished: %d\n", *sample_finished); + //fflush(0); +} + +void mic_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu, int signal_value) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD + if (rgba_byte_bin == NULL) + { +#pragma offload target(mic:numDevice) \ + in(buffer_bin : length(0) REUSE) \ + in(rng_state_bin : length(0) REUSE) \ + inout(sample_finished_mic : length(sizeof(int)) REUSE) \ + in(reqFinished_mic : length(0) REUSE) \ + in(kg_bin) in(start_sample) in(end_sample) in(tile_x) in(tile_y) in(offset) in(stride) in(tile_h) in(tile_w) in(nprocs_cpu) \ + signal(signal_value) + { + mic_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, NULL, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu); + //printf("MIC: %d, mic_path_trace finished: %d\n", numDevice, *((int*)sample_finished_mic)); + //fflush(0); + } + } + else + { +#pragma offload target(mic:numDevice) \ + in(buffer_bin : length(0) REUSE) \ + in(rng_state_bin : length(0) REUSE) \ + inout(sample_finished_mic : length(sizeof(int)) REUSE) \ + in(reqFinished_mic : length(0) REUSE) \ + in(rgba_byte_bin : length(0) REUSE) \ + in(kg_bin) in(start_sample) in(end_sample) in(tile_x) in(tile_y) in(offset) in(stride) in(tile_h) in(tile_w) in(nprocs_cpu) \ + signal(signal_value) + { + mic_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu); + //printf("MIC: %d, mic_path_trace finished: %d\n", numDevice, *((int*)sample_finished_mic)); + //fflush(0); + } + } +#endif + } + else + { + mic_path_trace_internal(kg_bin, 
buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu); + } + + //printf("MIC: mic_path_trace finished: %d\n", *((int*)sample_finished_mic)); + //fflush(0); +} + +DEVICE_PTR mic_alloc_kg(int numDevice) /* Allocates a KernelGlobals on coprocessor numDevice, or on the host when numDevice == -1, and returns its address as a DEVICE_PTR handle */ +{ + DEVICE_PTR kg_bin = 0; /* keeps the return value defined when numDevice != -1 but WITH_IT4I_MIC_OFFLOAD is not compiled in, where no branch below assigns it */ + + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) out(kg_bin) + { + KernelGlobals *kg = new KernelGlobals(); + kg_bin = (DEVICE_PTR) kg; + } +#endif + } + else + { + KernelGlobals *kg = new KernelGlobals(); + kg_bin = (DEVICE_PTR) kg; + } + + return (DEVICE_PTR) kg_bin; +} + +void mic_free_kg(int numDevice, DEVICE_PTR kg_bin) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) in(kg_bin) + { + KernelGlobals *kg = (KernelGlobals *) kg_bin; + delete kg; + } +#endif + } + else + { + KernelGlobals *kg = (KernelGlobals *) kg_bin; + delete kg; + } +} + +void mic_mem_alloc(int numDevice, char *mem, size_t memSize) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) in(mem:length(memSize) ALLOC) + { + + } +#endif + } +} + +void mic_mem_copy_to(int numDevice, char *mem, size_t memSize, char* signal_value) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD + if (signal_value == NULL) + { +#pragma offload target(mic:numDevice) in(mem:length(memSize) REUSE) + { + + } + } + else + { +#pragma offload_transfer target(mic:numDevice) in(mem:length(memSize) REUSE) signal(signal_value) + } +#endif + } +} + +void mic_mem_copy_from(int numDevice, char *mem, size_t offset, size_t memSize, char* signal_value) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD + if (signal_value == NULL) + { +#pragma offload target(mic:numDevice) out(mem[offset:memSize]: REUSE) + { + + } + } + else + { +#pragma offload_transfer target(mic:numDevice) out(mem[offset:memSize]: REUSE) signal(signal_value) +
} +#endif + } +} + +void mic_mem_zero(int numDevice, char *mem, size_t memSize) +{ + memset(mem, 0, memSize); + + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) in(mem:length(0) REUSE) in(memSize) + { + memset(mem, 0, memSize); + } +#endif + } +} + +void mic_mem_free(int numDevice, char *mem, size_t memSize) +{ + if (numDevice != -1) + { +#ifdef WITH_IT4I_MIC_OFFLOAD + //#pragma offload_transfer target(mic:numDevice) in(mem:length(0) FREE) +#pragma offload target(mic:numDevice) in(mem:length(0) FREE) + { + + } +#endif + } +} + +void mic_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name_bin, char *mem, size_t memSize) +{ + if (name_bin == NULL) + return; + + size_t nameSize = sizeof (char) * (strlen(name_bin) + 1); + char *name = (char *) name_bin; + + if (numDevice != -1) + { + +#ifdef WITH_IT4I_MIC_OFFLOAD +#pragma offload target(mic:numDevice) \ + in(name:length(nameSize) ONE_USE) + { + KernelGlobals *kg = (KernelGlobals *) kg_bin; + + if (0) + { + } +#define KERNEL_TEX(type, ttype, tname) \ + else if(strcmp(name, #tname) == 0) { \ + delete [] kg->tname.data; \ + kg->tname.data = NULL; \ + kg->tname.width = 0; \ + } +#define KERNEL_IMAGE_TEX(type, ttype, tname) +#include "kernel_textures.h" + + else if (strstr(name, "__tex_image_float")) + { + texture_image_float4 *tex = NULL; + int id = atoi(name + strlen("__tex_image_float_")); + int array_index = id; + + if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) + { + tex = &kg->texture_float_images[array_index]; + } + + if (tex) + { + delete [] tex->data; + tex->data = NULL; + tex->dimensions_set(0, 0, 0); + } + } + else if (strstr(name, "__tex_image")) + { + texture_image_uchar4 *tex = NULL; + int id = atoi(name + strlen("__tex_image_")); + int array_index = id - MAX_FLOAT_IMAGES; + + if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) + { + tex = &kg->texture_byte_images[array_index]; + } + + if (tex) + { + delete [] tex->data; + tex->data = NULL; + 
tex->dimensions_set(0, 0, 0); + } + } + } +#endif + } +} + +int mic_get_pass_stride(DEVICE_PTR kg) +{ + return ((KernelGlobals*) kg)->__data.film.pass_stride; +} + +size_t mic_get_data_size(DEVICE_PTR kg) +{ + return ((KernelGlobals*) kg)->__data_size; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernels/mic/kernel_mic.h b/intern/cycles/kernel/kernels/mic/kernel_mic.h new file mode 100644 index 0000000000000000000000000000000000000000..c4b5f96cbbec84417c3085e2447c8cc2dd0e249d --- /dev/null +++ b/intern/cycles/kernel/kernels/mic/kernel_mic.h @@ -0,0 +1,40 @@ +#ifndef __KERNEL_MIC_H__ +#define __KERNEL_MIC_H__ + +#include "client_api.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ +void mic_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu, int signal_value); + +/* Device memory */ +DEVICE_PTR mic_alloc_kg(int numDevice); +void mic_free_kg(int numDevice, DEVICE_PTR kg); + +void mic_mem_alloc(int numDevice, char* mem, size_t memSize); +void mic_mem_copy_to(int numDevice, char* mem, size_t memSize, char* signal_value); +void mic_mem_copy_from(int numDevice, char* mem, size_t offset, size_t memSize, char* signal_value); +void mic_mem_zero(int numDevice, char* mem, size_t memSize); +void mic_mem_free(int numDevice, char* mem, size_t memSize); +void mic_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name, char* mem, size_t memSize); + +void mic_const_copy(int numDevice, DEVICE_PTR kg, const char *name, char *host, size_t size); +void mic_tex_copy(int numDevice, DEVICE_PTR kg_bin, + const char *name, + char* mem, + size_t size, + size_t width, + size_t 
height,
+        size_t depth,
+        int interpolation,
+        int extension);
+
+void mic_wait(int numDevice, int signal_value);
+int mic_get_pass_stride(DEVICE_PTR kg);
+size_t mic_get_data_size(DEVICE_PTR kg);
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_MIC_H__ */
+
diff --git a/intern/cycles/kernel/kernels/mpi/CMakeLists.txt b/intern/cycles/kernel/kernels/mpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..037ab14da12ab17228268282d550b1fb47398899
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/CMakeLists.txt
@@ -0,0 +1,21 @@
+
+set(INC
+    .
+    ../../../kernel
+    ../../../util
+    ../../../kernel/osl
+    ../../../../../it4i/client/api
+    ${MPI_INCLUDE_DIR}
+)
+
+set(SRC
+    kernel_mpi.cpp
+)
+
+set(SRC_HEADERS
+    kernel_mpi.h
+)
+
+include_directories(${INC})
+add_library(cycles_kernel_mpi ${SRC} ${SRC_HEADERS})
+target_link_libraries (cycles_kernel_mpi ${MPI_LIB_FILE})
diff --git a/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp b/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5fbd820b83097d4a0906dd803386fea2c68e709
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp
@@ -0,0 +1,177 @@
+#include "kernel_mpi.h"
+
+#include <string.h>
+#include <mpi.h>
+
+CCL_NAMESPACE_BEGIN
+/* Every call below broadcasts one command struct from rank 0; some follow it with a second broadcast carrying the payload. */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int getCountOfDevices()
+{
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+    /* one rank is not counted as a device (presumably the broadcasting root, rank 0 -- confirm) */
+    return world_size - 1;
+}
+/* Zero the command struct and stamp it with mpi_tag; callers fill in the per-command fields afterwards. */
+void getMpiKernelData(mpi_kernel_struct *data, int mpi_tag)
+{
+    memset(data, 0, sizeof (mpi_kernel_struct));
+    data->mpi_tag = mpi_tag;
+}
+/* Broadcast the command struct as raw bytes from rank 0 to all ranks of MPI_COMM_WORLD. */
+void sendMpiKernelData(mpi_kernel_struct *data)
+{
+    MPI_Bcast(data, sizeof (mpi_kernel_struct), MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+
+////////////////////////////////////////////////////////////////////////////
+/* Announce and broadcast the "__data" constant block; any other name is silently ignored. */
+void mpi_const_copy(const char *name, char *host_bin, size_t size)
+{
+    if (strcmp(name, "__data") == 0)
+    {
+        //mpi_const_copy_struct s;
+        mpi_kernel_struct data;
+        getMpiKernelData(&data, MPI_TAG_mpi_const_copy);
+        /* bounded copy: the struct was zeroed above, so the final byte stays a terminator */
+        strncpy(data.mpi_const_copy_data.name, name, sizeof (data.mpi_const_copy_data.name) - 1);
+        data.mpi_const_copy_data.host = (DEVICE_PTR) host_bin;
+        data.mpi_const_copy_data.size = size;
+
+        sendMpiKernelData(&data);
+        /* NOTE(review): MPI_Bcast takes an int count, so a size above INT_MAX is truncated here */
+        MPI_Bcast(host_bin, size, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+}
+/* Announce a texture (name, dimensions, sampling modes) and then broadcast its size bytes starting at mem. */
+void mpi_tex_copy(
+        const char *name,
+        DEVICE_PTR mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_tex_copy);
+    /* bounded copy: the struct was zeroed above, so the final byte stays a terminator */
+    strncpy(data.mpi_tex_copy_data.name, name, sizeof (data.mpi_tex_copy_data.name) - 1);
+    data.mpi_tex_copy_data.mem = mem;
+    data.mpi_tex_copy_data.size = size;
+    data.mpi_tex_copy_data.width = width;
+    data.mpi_tex_copy_data.height = height;
+    data.mpi_tex_copy_data.depth = depth;
+    data.mpi_tex_copy_data.interpolation = interpolation;
+    data.mpi_tex_copy_data.extension = extension;
+
+    sendMpiKernelData(&data);
+    /* NOTE(review): MPI_Bcast takes an int count, so a size above INT_MAX is truncated here */
+    MPI_Bcast((char*) mem, size, MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+/* Broadcast the alloc_kg command; enable_mics is currently not transmitted (see the disabled line below). */
+void mpi_alloc_kg(bool enable_mics)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_alloc_kg);
+
+//    data.enable_mics = enable_mics;
+
+    sendMpiKernelData(&data);
+}
+/* Broadcast the free_kg command; it carries no arguments. */
+void mpi_free_kg()
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_free_kg);
+    sendMpiKernelData(&data);
+}
+/* Broadcast a mem_alloc command carrying the buffer name, the root's pointer value and the byte size. */
+void mpi_mem_alloc(const char* name, DEVICE_PTR mem, size_t memSize)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_alloc);
+    /* bounded copy: the struct was zeroed above, so the final byte stays a terminator */
+    strncpy(data.mpi_mem_data.name, name, sizeof (data.mpi_mem_data.name) - 1);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+
+    sendMpiKernelData(&data);
+}
+/* Broadcast a mem_copy_to command, then memSize bytes starting at mem; offset travels in the command only. */
+void mpi_mem_copy_to(DEVICE_PTR mem, size_t memSize, size_t offset)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_copy_to);
+
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+    data.mpi_mem_data.offset = offset;
+
+    sendMpiKernelData(&data);
+    MPI_Bcast((char*) mem, memSize, MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+/* Broadcast a mem_zero command for the buffer identified by mem; nothing is zeroed locally. */
+void mpi_mem_zero(DEVICE_PTR mem, size_t memSize, size_t offset)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_zero);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+    data.mpi_mem_data.offset = offset;
+
+    sendMpiKernelData(&data);
+}
+/* Broadcast a mem_free command for the buffer identified by mem; nothing is freed locally. */
+void mpi_mem_free(DEVICE_PTR mem, size_t memSize)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_free);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+
+    sendMpiKernelData(&data);
+}
+/* Broadcast a tex_free command carrying the texture name (bounded copy), pointer value and size. */
+void mpi_tex_free(const char* name, DEVICE_PTR mem, size_t memSize)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_tex_free);
+    strncpy(data.mpi_mem_data.name, name, sizeof (data.mpi_mem_data.name) - 1);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+
+    sendMpiKernelData(&data);
+}
+/////////////////////////////////////////////////////////////////////
+/* Broadcast a path_trace command describing the tile, sample range and buffers; no pixel data is sent from here. */
+void mpi_path_trace(size_t kg_data_size, char* rgba_pixels, bool half_float, char *buffer, char *rng_state, bool progressive,
+        int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_path_trace);
+
+    data.mpi_path_trace_data.buffer = (DEVICE_PTR) buffer;
+    data.mpi_path_trace_data.rng_state = (DEVICE_PTR) rng_state;
+    data.mpi_path_trace_data.start_sample = start_sample;
+    data.mpi_path_trace_data.num_samples = num_samples;
+    data.mpi_path_trace_data.progressive = progressive;
+    data.mpi_path_trace_data.tile_x = tile_x;
+    data.mpi_path_trace_data.tile_y = tile_y;
+    data.mpi_path_trace_data.offset = offset;
+    data.mpi_path_trace_data.stride = stride;
+    data.mpi_path_trace_data.tile_h = tile_h;
+    data.mpi_path_trace_data.tile_w = tile_w;
+    data.mpi_path_trace_data.kg_data_size = kg_data_size;
+    data.mpi_path_trace_data.rgba_pixels = (DEVICE_PTR) rgba_pixels;
+    data.mpi_path_trace_data.half_float = half_float;
+
+    sendMpiKernelData(&data);
+}
+/////////////////////////////////////////////////////////////////////
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/mpi/kernel_mpi.h b/intern/cycles/kernel/kernels/mpi/kernel_mpi.h
new file mode 100644
index 0000000000000000000000000000000000000000..1252ea6e5b5cf1db1e326e915ccbffd9d2ae0e0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/kernel_mpi.h
@@ -0,0 +1,35 @@
+#ifndef __KERNEL_MPI_H__
+#define __KERNEL_MPI_H__
+
+#include "client_api.h"
+
+CCL_NAMESPACE_BEGIN
+
+void mpi_path_trace(size_t kg_data_size, char* rgba_pixels, bool half_float, char *buffer, char *rng_state, bool progressive,
+        int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w);
+
+void mpi_alloc_kg(bool enable_mics);
+void mpi_free_kg();
+
+void mpi_mem_alloc(const char *name, DEVICE_PTR mem, size_t memSize);
+void mpi_mem_copy_to(DEVICE_PTR mem, size_t memSize, size_t offset);
+void mpi_mem_zero(DEVICE_PTR mem, size_t memSize, size_t offset);
+void mpi_mem_free(DEVICE_PTR mem, size_t memSize);
+void mpi_tex_free(const char *name, DEVICE_PTR mem, size_t memSize);
+
+void mpi_const_copy(const char *name, char *host, size_t size);
+void mpi_tex_copy(const char *name,
+        DEVICE_PTR mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension);
+
+int getCountOfDevices();
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_MPI_H__ */
+
diff --git a/intern/cycles/kernel/kernels/omp/CMakeLists.txt b/intern/cycles/kernel/kernels/omp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c31a53fe7f64018018fd4f8cd34fa2999fc5ec0f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/CMakeLists.txt
@@ -0,0 +1,26 @@
+
+set(INC
+    .
+    ../../../kernel
+    ../../../util
+    ../../../kernel/osl
+    ../../../../../it4i/client/api
+)
+
+set(SRC
+    kernel_omp.cpp
+)
+
+set(SRC_HEADERS
+    kernel_compat_omp.h
+    kernel_omp.h
+)
+
+if (WITH_IT4I_MIC_OFFLOAD)
+    add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") # -xCORE-AVX2 is an Intel compiler option; GCC/Clang reject it
+    set_source_files_properties(kernel_omp.cpp PROPERTIES COMPILE_FLAGS "-xCORE-AVX2")
+endif()
+include_directories(${INC})
+add_library(cycles_kernel_omp ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h b/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1145bdcc1f0cdbc6194a125cd52a05876c5658b7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h
@@ -0,0 +1,519 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_COMPAT_OMP_H__
+#define __KERNEL_COMPAT_OMP_H__
+
+#define __KERNEL_CPU__
+
+/* Release kernel has too many false-positive maybe-uninitialized warnings,
+ * which makes it possible to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+#include "util_debug.h"
+#include "util_math.h"
+#include "util_simd.h"
+#include "util_half.h"
+#include "util_types.h"
+
+#define ccl_addr_space
+
+/* On x86_64, versions of glibc < 2.16 have an issue where expf is
+ * much slower than the double version. This was fixed in glibc 2.16.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+    defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
+    (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap it in
+ * a macro which is empty for other devices */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays and after inlining fetch hopefully revert to being a simple
+ * pointer lookup. */
+
+template<typename T> struct texture {
+	ccl_always_inline T fetch(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return data[index];
+	}
+
+#ifdef __KERNEL_SSE2__
+	ccl_always_inline ssef fetch_ssef(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssef*)data)[index];
+	}
+
+	ccl_always_inline ssei fetch_ssei(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssei*)data)[index];
+	}
+#endif
+
+	T *data;
+	int width;
+};
+
+template<typename T> struct texture_image {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] = ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] = ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+	ccl_always_inline float4 read(float4 r)
+	{
+		return r;
+	}
+
+	ccl_always_inline float4 read(uchar4 r)
+	{
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+
+	ccl_always_inline int wrap_periodic(int x, int width)
+	{
+		x %= width;
+		if(x < 0)
+			x += width;
+		return x;
+	}
+
+	ccl_always_inline int wrap_clamp(int x, int width)
+	{
+		return clamp(x, 0, width-1);
+	}
+
+	ccl_always_inline float frac(float x, int *ix)
+	{
+		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
+		*ix = i;
+		return x - (float)i;
+	}
+
+	ccl_always_inline float4 interp(float x, float y)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, nix, niy;
+
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* no unwrapped indices when asserts are compiled out */
+			}
+			return read(data[ix + iy*width]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* nix/niy would be uninitialized */
+			}
+
+			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
+			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
+			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
+			r += ty*tx*read(data[nix + niy*width]);
+
+			return r;
+		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* neighbour indices would be uninitialized */
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Some helper macro to keep code reasonable size,
+			 * let compiler to inline all the matrix multiplications.
+			 */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+	(v[col] * (u[0] * DATA(0, col) + \
+	           u[1] * DATA(1, col) + \
+	           u[2] * DATA(2, col) + \
+	           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+
+#undef TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline float4 interp_3d(float x, float y, float z)
+	{
+		return interp_3d_ex(x, y, z, interpolation);
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, iz, nix, niy, niz;
+
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			frac(z*(float)depth, &iz);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* no unwrapped indices when asserts are compiled out */
+			}
+
+			return read(data[ix + iy*width + iz*width*height]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			float tz = frac(z*(float)depth - 0.5f, &iz);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* nix/niy/niz would be uninitialized */
+			}
+
+			float4 r;
+
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+
+			return r;
+		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+					piz = wrap_periodic(iz-1, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					nniz = wrap_periodic(iz+2, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+					piz = wrap_clamp(iz-1, depth);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+					nniz = wrap_clamp(iz+2, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* neighbour indices would be uninitialized */
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Some helper macro to keep code reasonable size,
+			 * let compiler to inline all the matrix multiplications.
+			 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+	(v[col] * (u[0] * DATA(0, col, row) + \
+	           u[1] * DATA(1, col, row) + \
+	           u[2] * DATA(2, col, row) + \
+	           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+	(w[row] * (COL_TERM(0, row) + \
+	           COL_TERM(1, row) + \
+	           COL_TERM(2, row) + \
+	           COL_TERM(3, row)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
+	{
+		width = width_;
+		height = height_;
+		depth = depth_;
+	}
+
+	T *data;
+	int interpolation;
+	ExtensionType extension;
+	int width, height, depth;
+#undef SET_CUBIC_SPLINE_WEIGHTS
+};
+
+typedef texture<float4> texture_float4;
+typedef texture<float2> texture_float2;
+typedef texture<float> texture_float;
+typedef texture<uint> texture_uint;
+typedef texture<int> texture_int;
+typedef texture<uint4> texture_uint4;
+typedef texture<uchar4> texture_uchar4;
+typedef texture_image<float4> texture_image_float4;
+typedef texture_image<uchar4> texture_image_uchar4;
+
+/* Macros to handle different memory storage on different devices */
+
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
+#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
+#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
+#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
+#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ?
kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
+
+#define kernel_data (kg->__data)
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b& a)
+{
+	print_sseb(label, a.x);
+	print_sseb(label, a.y);
+	print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f& a)
+{
+	print_ssef(label, a.x);
+	print_ssef(label, a.y);
+	print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i& a)
+{
+	print_ssei(label, a.x);
+	print_ssei(label, a.y);
+	print_ssei(label, a.z);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_COMPAT_OMP_H__ */
+
diff --git a/intern/cycles/kernel/kernels/omp/kernel_omp.cpp b/intern/cycles/kernel/kernels/omp/kernel_omp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6893281c8b0df87425db53dd87893b611cf95bcd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_omp.cpp
@@ -0,0 +1,302 @@
+#include "kernel_omp.h"
+
+#include "kernel_compat_omp.h"
+
+#include "kernel.h"
+
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+#include <omp.h>
+
+//#define NUM_THREADS 240
+#define SIZE_T long
+
+CCL_NAMESPACE_BEGIN
+/* Assertion reporter: prints the message and location; it does not abort. Only built without MIC offload. */
+#ifndef WITH_IT4I_MIC_OFFLOAD
+void cwassert(const char * _Message, const char *_File, unsigned _Line)
+{
+    printf("ASSERT: %s, %s, %d\n", _Message, _File, _Line);
+}
+#endif
+
+/* Memory Copy: copy size bytes of host_bin into kg->__data and record size; the caller must keep size within that struct. */
+void omp_const_copy_internal(DEVICE_PTR kg_bin, char *host_bin, size_t size)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    memcpy(&kg->__data, host_bin, size);
+    kg->__data_size = size;
+}
+/* Public entry for constant upload; only the "__data" block is supported, anything else asserts. numDevice is unused. */
+void omp_const_copy(int numDevice, DEVICE_PTR kg_bin, const char *name, char *host_bin, size_t size)
+{
+    if (strcmp(name, "__data") == 0)
+    {
+        omp_const_copy_internal(kg_bin, host_bin, size);
+    }
+    else
+        assert(0);
+}
+/* Bind the texture called name to mem without copying; image textures also get dimensions and sampling modes. */
+void omp_tex_copy_internal(DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+
+    if (0)
+    {
+    }
+#define KERNEL_TEX(type, ttype, tname) \
+    else if(strcmp(name, #tname) == 0) { \
+            kg->tname.data = (type*)mem; \
+            kg->tname.width = width; \
+    }
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+    /* Match the full prefix, underscore included, so the atoi() below always starts inside the string. */
+    else if (strncmp(name, "__tex_image_float_", strlen("__tex_image_float_")) == 0)
+    {
+        texture_image_float4 *tex = NULL;
+        int id = atoi(name + strlen("__tex_image_float_"));
+        int array_index = id;
+
+        if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES)
+        {
+            tex = &kg->texture_float_images[array_index];
+        }
+
+        if (tex)
+        {
+            tex->data = (float4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+    else if (strncmp(name, "__tex_image_", strlen("__tex_image_")) == 0)
+    {
+        texture_image_uchar4 *tex = NULL;
+        int id = atoi(name + strlen("__tex_image_"));
+        int array_index = id - MAX_FLOAT_IMAGES; /* byte image ids follow the float image ids */
+
+        if (array_index >= 0 && array_index < MAX_BYTE_IMAGES)
+        {
+            tex = &kg->texture_byte_images[array_index];
+        }
+
+        if (tex)
+        {
+            tex->data = (uchar4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+
+}
+/* Public entry for texture upload; ignores NULL name or data, otherwise forwards to omp_tex_copy_internal. */
+void omp_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name_bin,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    if (name_bin == NULL || mem == NULL)
+        return;
+
+    /* The internal routine takes a const name, so it is passed through directly; */
+    /* the previous unused length computation and const-dropping cast are gone. */
+
+    //printf("omp_tex_copy_internal: %d: %s, %d\n", numDevice, name, size);
+
+    omp_tex_copy_internal(kg_bin, name_bin, mem, size, width, height, depth, interpolation, extension);
+
+    //printf("omp_tex_copy: %s\n", name);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/* Nothing to wait for: every omp_* call in this file completes before it returns. */
+void omp_wait(int numDevice, char *signal_value)
+{
+}
+/* Tone-map the pixel at (x, y) from the float render buffer into the 8-bit RGBA output buffer. */
+void omp_film_convert_byte(DEVICE_PTR _kg,
+        char *_rgba_byte, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    KernelGlobals *kg = (KernelGlobals *)_kg;
+    uchar4 *rgba_byte = (uchar4 *)_rgba_byte;
+
+    /* buffer offset */
+    int index = offset + x + y*stride;
+
+    rgba_byte += index;
+    //rgba_float += index;
+    buffer += index * kernel_data.film.pass_stride;
+
+    /* map colors */
+    float4 irradiance = *((ccl_global float4*) buffer);
+    float4 float_result = film_map(kg, irradiance, sample_scale);
+    uchar4 byte_result = film_float_to_byte(float_result);
+
+    *rgba_byte = byte_result;
+}
+/* Convert the pixel at (x, y) to four half floats, applying film exposure to RGB (not alpha) first. */
+void omp_convert_to_half_float(DEVICE_PTR _kg,
+        char *_rgba, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    KernelGlobals *kg = (KernelGlobals *)_kg;
+    uchar4 *rgba = (uchar4 *)_rgba;
+
+    /* buffer offset */
+    int index = offset + x + y*stride;
+
+    float4 *in = (float4*) (buffer + index * kernel_data.film.pass_stride);
+    half *out = (half*) rgba + index * 4;
+
+    float exposure = kernel_data.film.exposure;
+
+    float4 rgba_in = *in;
+
+    if (exposure != 1.0f)
+    {
+        rgba_in.x *= exposure;
+        rgba_in.y *= exposure;
+        rgba_in.z *= exposure;
+    }
+
+    float4_store_half(out, rgba_in, sample_scale);
+}
+/* Trace samples [start_sample, end_sample) for each pixel of the tile, one pixel per OpenMP loop iteration. */
+void omp_path_trace_internal(DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu)
+{
+    int size = tile_h*tile_w;
+
+    int *sample_finished = (int*) sample_finished_omp;
+    int *reqFinished = (int*) reqFinished_omp; /* unused while the cancellation check below stays disabled */
+
+    //printf("exposure %f\n", ((KernelGlobals *) kg_bin)->__data.film.exposure);
+    //printf("sample_scale %f \n", 1.0f / (start_sample + 1.0f));
+
+    *sample_finished = start_sample;
+
+#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1)
+    for (int i = 0; i < size; i++)
+    {
+//        if (*reqFinished != 0)
+//            continue;
+
+        int y = i / tile_w;
+        int x = i - y * tile_w;
+
+        for (int sample = start_sample; sample < end_sample; sample++)
+        {
+
+            kernel_path_trace((KernelGlobals *) kg_bin, (float *) buffer_bin, (unsigned int*) rng_state_bin, sample, x + tile_x, y + tile_y, offset, stride);
+
+            if (rgba_byte_bin != NULL)
+            {
+                float sample_scale = 1.0f / (sample + 1.0f);
+                //printf("sample_scale %f\n", sample_scale);
+                //fflush(0);
+
+                if (is_rgba_float)
+                    omp_convert_to_half_float(kg_bin, rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride);
+                else
+                    omp_film_convert_byte(kg_bin, rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride);
+            }
+        }
+    }
+
+    *sample_finished = end_sample;
+
+    //printf("MIC: sample_finished %d\n", *sample_finished);
+    //fflush(0);
+}
+/* Public entry for tile rendering; numDevice and signal_value are accepted but unused. */
+void omp_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu, char* signal_value)
+{
+    omp_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_omp, reqFinished_omp, nprocs_cpu);
+}
+/* Allocate a host KernelGlobals and return it as an opaque handle; release it with omp_free_kg. */
+DEVICE_PTR omp_alloc_kg(int numDevice)
+{
+    DEVICE_PTR kg_bin;
+
+    KernelGlobals *kg = new KernelGlobals();
+    kg_bin = (DEVICE_PTR) kg;
+
+    return (DEVICE_PTR) kg_bin;
+}
+/* Destroy a KernelGlobals created by omp_alloc_kg. */
+void omp_free_kg(int numDevice, DEVICE_PTR kg_bin)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    delete kg;
+}
+/* The four empty omp_mem_* functions in this file do nothing; they only mirror the MIC API. */
+void omp_mem_alloc(int numDevice, char *mem, size_t memSize)
+{
+}
+
+void omp_mem_copy_to(int numDevice, char *mem, size_t memSize, char* signal_value)
+{
+}
+
+void omp_mem_copy_from(int numDevice, char *mem, size_t offset, size_t memSize, char* signal_value)
+{
+}
+/* Zero memSize bytes of the host buffer. */
+void omp_mem_zero(int numDevice, char *mem, size_t memSize)
+{
+    memset(mem, 0, memSize);
+}
+
+void omp_mem_free(int numDevice, char *mem, size_t memSize)
+{
+}
+/* No-op: omp_tex_copy only stores the caller's pointer, so nothing was allocated here. */
+void omp_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name_bin, char *mem, size_t memSize)
+{
+}
+/* Read film.pass_stride from the KernelGlobals behind kg. */
+int omp_get_pass_stride(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data.film.pass_stride;
+}
+/* Read the byte count recorded by the last omp_const_copy_internal call. */
+size_t omp_get_data_size(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data_size;
+}
+/* Trace a single sample of a single pixel; thin wrapper over kernel_path_trace. */
+void omp_kernel_path_trace(DEVICE_PTR _kg,
+        float *buffer, unsigned int *rng_state,
+        int sample, int x, int y, int offset, int stride)
+{
+    kernel_path_trace((KernelGlobals*) _kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/omp/kernel_omp.h b/intern/cycles/kernel/kernels/omp/kernel_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b00b30d6eb890a53ec5f9a057096315257135b23
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_omp.h
@@ -0,0 +1,53 @@
+#ifndef __KERNEL_OMP_H__
+#define __KERNEL_OMP_H__
+
+#include "client_api.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+void omp_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu, char* signal_value);
+
+/* Device memory */
+DEVICE_PTR omp_alloc_kg(int numDevice);
+void omp_free_kg(int numDevice, DEVICE_PTR kg);
+
+void omp_mem_alloc(int numDevice, char* mem, size_t memSize);
+void omp_mem_copy_to(int numDevice, char* mem, size_t memSize, char* signal_value);
+void omp_mem_copy_from(int numDevice, char* mem, size_t offset, size_t memSize, char* signal_value);
+void omp_mem_zero(int numDevice, char* mem, size_t memSize);
+void omp_mem_free(int numDevice, char* mem, size_t memSize);
+void omp_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name, char* mem, size_t memSize);
+
+void omp_const_copy(int numDevice, DEVICE_PTR kg, const char *name, char *host, size_t size);
+void omp_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension);
+
+void omp_wait(int numDevice, char *signal_value);
+
+void omp_film_convert_byte(DEVICE_PTR _kg,
+        char *_rgba_byte, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride);
+
+void omp_convert_to_half_float(DEVICE_PTR _kg,
+        char *_rgba, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride);
+
+int omp_get_pass_stride(DEVICE_PTR kg);
+size_t omp_get_data_size(DEVICE_PTR kg);
+
+void omp_kernel_path_trace(DEVICE_PTR _kg,
+        float *buffer, unsigned int *rng_state,
+        int sample, int x, int y, int offset, int stride);
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_OMP_H__ */
+
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 17ca6ce0f4848622e1f3fdf06365fca66a3ec4bf..c6a223b289eb5cbb13ec38ea7c88f8d86a7fb0e9 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -67,6 +67,18 @@ set(SRC_HEADERS
 	tile.h
 )
 
+if (WITH_IT4I_MPI)
+
add_definitions(-DWITH_IT4I_MPI) +endif() + +if (WITH_IT4I_MIC_OFFLOAD) + add_definitions(-DWITH_IT4I_MIC_OFFLOAD) +endif() + +if (WITH_OPENMP) + add_definitions(-DWITH_OPENMP) +endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}") include_directories(${INC}) diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index 5bf5e5113ef8798c8e2f2dc13890f5409a6f4e08..6eb6d75923cad2ad4415b7ec5a11f0a7b1238b36 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -175,9 +175,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("d_input", d_input, MEM_READ_ONLY); + device->mem_copy_to("d_input", d_input); + device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; @@ -195,15 +195,15 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre device->task_wait(); if(progress.get_cancel()) { - device->mem_free(d_input); - device->mem_free(d_output); + device->mem_free("d_input", d_input); + device->mem_free("d_output", d_output); m_is_baking = false; return false; } - device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); - device->mem_free(d_input); - device->mem_free(d_output); + device->mem_copy_from("d_output", d_output, 0, 1, d_output.size(), sizeof(float4)); + device->mem_free("d_input", d_input); + device->mem_free("d_output", d_output); /* read result */ int k = 0; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fab3f701757e3d320ec98d8433ae4f530ed2aeb5..a9918f440e1899ac6255305329b65b1554ac752a 100644 --- a/intern/cycles/render/buffers.cpp +++ 
b/intern/cycles/render/buffers.cpp @@ -85,6 +85,10 @@ RenderTile::RenderTile() start_sample = 0; num_samples = 0; resolution = 0; + + num_samples_orig = 0; + progressive = false; + half_float = false; offset = 0; stride = 0; @@ -110,12 +114,12 @@ RenderBuffers::~RenderBuffers() void RenderBuffers::device_free() { if(buffer.device_pointer) { - device->mem_free(buffer); + device->mem_free("buffer", buffer); buffer.clear(); } if(rng_state.device_pointer) { - device->mem_free(rng_state); + device->mem_free("rng_state", rng_state); rng_state.clear(); } } @@ -129,8 +133,8 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) /* allocate buffer */ buffer.resize(params.width*params.height*params.get_passes_size()); - device->mem_alloc(buffer, MEM_READ_WRITE); - device->mem_zero(buffer); + device->mem_alloc("buffer", buffer, MEM_READ_WRITE); + device->mem_zero("buffer", buffer); /* allocate rng state */ rng_state.resize(params.width, params.height); @@ -142,8 +146,8 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) for(y = 0; y < height; y++) init_state[x + y*width] = hash_int_2d(params.full_x+x, params.full_y+y); - device->mem_alloc(rng_state, MEM_READ_WRITE); - device->mem_copy_to(rng_state); + device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); + device->mem_copy_to("rng_state", rng_state); } bool RenderBuffers::copy_from_device() @@ -151,168 +155,14 @@ bool RenderBuffers::copy_from_device() if(!buffer.device_pointer) return false; - device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + device->mem_copy_from("buffer", buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); return true; } bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels) { - int pass_offset = 0; - - foreach(Pass& pass, params.passes) { - if(pass.type != type) { - pass_offset += pass.components; - continue; - } - - float *in = 
(float*)buffer.data_pointer + pass_offset; - int pass_stride = params.get_passes_size(); - - float scale = (pass.filter)? 1.0f/(float)sample: 1.0f; - float scale_exposure = (pass.exposure)? scale*exposure: scale; - - int size = params.width*params.height; - - if(components == 1) { - assert(pass.components == components); - - /* scalar */ - if(type == PASS_DEPTH) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = (f == 0.0f)? 1e10f: f*scale_exposure; - } - } - else if(type == PASS_MIST) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = saturate(f*scale_exposure); - } - } -#ifdef WITH_CYCLES_DEBUG - else if(type == PASS_BVH_TRAVERSAL_STEPS) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f; - } - } - else if(type == PASS_RAY_BOUNCES) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f; - } - } -#endif - else { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f*scale_exposure; - } - } - } - else if(components == 3) { - assert(pass.components == 4); - - /* RGBA */ - if(type == PASS_SHADOW) { - for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f)? 
1.0f/f.w: 1.0f; - - pixels[0] = f.x*invw; - pixels[1] = f.y*invw; - pixels[2] = f.z*invw; - } - } - else if(pass.divide_type != PASS_NONE) { - /* RGB lighting passes that need to divide out color */ - pass_offset = 0; - foreach(Pass& color_pass, params.passes) { - if(color_pass.type == pass.divide_type) - break; - pass_offset += color_pass.components; - } - - float *in_divide = (float*)buffer.data_pointer + pass_offset; - - for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); - - f = safe_divide_even_color(f*exposure, f_divide); - - pixels[0] = f.x; - pixels[1] = f.y; - pixels[2] = f.z; - } - } - else { - /* RGB/vector */ - for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - - pixels[0] = f.x*scale_exposure; - pixels[1] = f.y*scale_exposure; - pixels[2] = f.z*scale_exposure; - } - } - } - else if(components == 4) { - assert(pass.components == components); - - /* RGBA */ - if(type == PASS_SHADOW) { - for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f; - - pixels[0] = f.x*invw; - pixels[1] = f.y*invw; - pixels[2] = f.z*invw; - pixels[3] = 1.0f; - } - } - else if(type == PASS_MOTION) { - /* need to normalize by number of samples accumulated for motion */ - pass_offset = 0; - foreach(Pass& color_pass, params.passes) { - if(color_pass.type == PASS_MOTION_WEIGHT) - break; - pass_offset += color_pass.components; - } - - float *in_weight = (float*)buffer.data_pointer + pass_offset; - - for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float w = in_weight[0]; - float invw = (w > 0.0f)? 
1.0f/w: 0.0f; - - pixels[0] = f.x*invw; - pixels[1] = f.y*invw; - pixels[2] = f.z*invw; - pixels[3] = f.w*invw; - } - } - else { - for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - - pixels[0] = f.x*scale_exposure; - pixels[1] = f.y*scale_exposure; - pixels[2] = f.z*scale_exposure; - - /* clamp since alpha might be > 1.0 due to russian roulette */ - pixels[3] = saturate(f.w*scale); - } - } - } - - return true; - } - - return false; + return device->get_pass_rect(type, exposure, sample, components, pixels, params, (float*) buffer.data_pointer); } /* Display Buffer */ diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 4fa1c51d821515c4f2940523d816210bdde2471a..75d2cb90ac4b947c347cef36248bb99dd2e8638d 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -134,10 +134,16 @@ public: int x, y, w, h; int start_sample; int num_samples; - int sample; + + int num_samples_orig; + bool progressive; + + int sample; int resolution; int offset; int stride; + + bool half_float; device_ptr buffer; device_ptr rng_state; diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 0bebdaf8a6712ea5b39e4ca860f806a63f613f6e..001ff9141f219b033d40a137e657b2c958cb31ab 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -61,7 +61,7 @@ void ImageManager::set_osl_texture_system(void *texture_system) void ImageManager::set_extended_image_limits(const DeviceInfo& info) { - if(info.type == DEVICE_CPU) { + if (info.type == DEVICE_CPU || info.type == DEVICE_OMP || info.type == DEVICE_MPI) { tex_num_images = TEX_EXTENDED_NUM_IMAGES_CPU; tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES; tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START; @@ -700,9 +700,15 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl device_vector<float4>& tex_img = dscene->tex_float_image[slot]; + string 
name; + + if(slot >= 100) name = string_printf("__tex_image_float_%d", slot); + else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot); + else name = string_printf("__tex_image_float_00%d", slot); + if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + device->tex_free(name.c_str(), tex_img); } if(!file_load_float_image(img, tex_img)) { @@ -715,12 +721,6 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl pixels[3] = TEX_IMAGE_MISSING_A; } - string name; - - if(slot >= 100) name = string_printf("__tex_image_float_%d", slot); - else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot); - else name = string_printf("__tex_image_float_00%d", slot); - if(!pack_images) { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), @@ -735,9 +735,15 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl device_vector<uchar4>& tex_img = dscene->tex_image[slot - tex_image_byte_start]; + string name; + + if(slot >= 100) name = string_printf("__tex_image_%d", slot); + else if(slot >= 10) name = string_printf("__tex_image_0%d", slot); + else name = string_printf("__tex_image_00%d", slot); + if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + device->tex_free(name.c_str(), tex_img); } if(!file_load_image(img, tex_img)) { @@ -750,12 +756,6 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl pixels[3] = (TEX_IMAGE_MISSING_A * 255); } - string name; - - if(slot >= 100) name = string_printf("__tex_image_%d", slot); - else if(slot >= 10) name = string_printf("__tex_image_0%d", slot); - else name = string_printf("__tex_image_00%d", slot); - if(!pack_images) { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), @@ -794,7 +794,7 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, int sl 
if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + device->tex_free("tex_float_image", tex_img); } tex_img.clear(); @@ -807,7 +807,7 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, int sl if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + device->tex_free("tex_image", tex_img); } tex_img.clear(); @@ -928,14 +928,14 @@ void ImageManager::device_pack_images(Device *device, if(dscene->tex_image_packed.size()) { if(dscene->tex_image_packed.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_packed); + device->tex_free("__tex_image_packed", dscene->tex_image_packed); } device->tex_alloc("__tex_image_packed", dscene->tex_image_packed); } if(dscene->tex_image_packed_info.size()) { if(dscene->tex_image_packed_info.device_pointer) { thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_packed_info); + device->tex_free("__tex_image_packed_info", dscene->tex_image_packed_info); } device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info); } @@ -959,8 +959,8 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene) for(size_t slot = 0; slot < float_images.size(); slot++) device_free_image(device, dscene, slot); - device->tex_free(dscene->tex_image_packed); - device->tex_free(dscene->tex_image_packed_info); + device->tex_free("__tex_image_packed", dscene->tex_image_packed); + device->tex_free("__tex_image_packed_info", dscene->tex_image_packed_info); dscene->tex_image_packed.clear(); dscene->tex_image_packed_info.clear(); diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 47489f6e007cd0e03de1f5a4132ca6ff2bc520a6..9923ac6e171727c9c4e47bf1658687a644988c0c 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -179,7 +179,7 @@ void 
Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene void Integrator::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->sobol_directions); + device->tex_free("__sobol_directions", dscene->sobol_directions); dscene->sobol_directions.clear(); } diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 1637045ce84bf496099778937c34693bf1a7f756..bf7ba6a56d47401c8677e632034f85e58b0054c4 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("d_input", d_input, MEM_READ_ONLY); + device->mem_copy_to("d_input", d_input); + device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY); DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; @@ -77,11 +77,11 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res foreach(DeviceTask& task, split_tasks) { device->task_add(task); device->task_wait(); - device->mem_copy_from(d_output, task.shader_x, 1, task.shader_w, sizeof(float4)); + device->mem_copy_from("d_output", d_output, task.shader_x, 1, task.shader_w, sizeof(float4)); } - device->mem_free(d_input); - device->mem_free(d_output); + device->mem_free("d_input", d_input); + device->mem_free("d_output", d_output); d_input.clear(); @@ -801,10 +801,10 @@ void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *sce void LightManager::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->light_distribution); - device->tex_free(dscene->light_data); - device->tex_free(dscene->light_background_marginal_cdf); - device->tex_free(dscene->light_background_conditional_cdf); + 
device->tex_free("__light_distribution", dscene->light_distribution); + device->tex_free("__light_data", dscene->light_data); + device->tex_free("__light_background_marginal_cdf", dscene->light_background_marginal_cdf); + device->tex_free("__light_background_conditional_cdf", dscene->light_background_conditional_cdf); dscene->light_distribution.clear(); dscene->light_data.clear(); diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 705483112a17fbdcff0063c1ca723f94aaa572a4..df0956f127f1292e75546bc6f8effc8b05ab5a33 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -1332,24 +1332,24 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen void MeshManager::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->bvh_nodes); - device->tex_free(dscene->bvh_leaf_nodes); - device->tex_free(dscene->object_node); - device->tex_free(dscene->tri_woop); - device->tex_free(dscene->prim_type); - device->tex_free(dscene->prim_visibility); - device->tex_free(dscene->prim_index); - device->tex_free(dscene->prim_object); - device->tex_free(dscene->tri_shader); - device->tex_free(dscene->tri_vnormal); - device->tex_free(dscene->tri_vindex); - device->tex_free(dscene->tri_verts); - device->tex_free(dscene->curves); - device->tex_free(dscene->curve_keys); - device->tex_free(dscene->attributes_map); - device->tex_free(dscene->attributes_float); - device->tex_free(dscene->attributes_float3); - device->tex_free(dscene->attributes_uchar4); + device->tex_free("__bvh_nodes", dscene->bvh_nodes); + device->tex_free("__bvh_leaf_nodes", dscene->bvh_leaf_nodes); + device->tex_free("__object_node", dscene->object_node); + device->tex_free("__tri_woop", dscene->tri_woop); + device->tex_free("__prim_type", dscene->prim_type); + device->tex_free("__prim_visibility", dscene->prim_visibility); + device->tex_free("__prim_index", dscene->prim_index); + device->tex_free("__prim_object", 
dscene->prim_object); + device->tex_free("__tri_shader", dscene->tri_shader); + device->tex_free("__tri_vnormal", dscene->tri_vnormal); + device->tex_free("__tri_vindex", dscene->tri_vindex); + device->tex_free("__tri_verts", dscene->tri_verts); + device->tex_free("__curves", dscene->curves); + device->tex_free("__curve_keys", dscene->curve_keys); + device->tex_free("__attributes_map", dscene->attributes_map); + device->tex_free("__attributes_float", dscene->attributes_float); + device->tex_free("__attributes_float3", dscene->attributes_float3); + device->tex_free("__attributes_uchar4", dscene->attributes_uchar4); dscene->bvh_nodes.clear(); dscene->object_node.clear(); diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index dccfd74f17a81a326340406875511654156b5a81..184b4b7e910fee9ef0e5e8a30159ab35bdd8897f 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -110,9 +110,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("d_input", d_input, MEM_READ_ONLY); + device->mem_copy_to("d_input", d_input); + device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; @@ -127,14 +127,14 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me device->task_wait(); if(progress.get_cancel()) { - device->mem_free(d_input); - device->mem_free(d_output); + device->mem_free("d_input", d_input); + device->mem_free("d_output", d_output); return false; } - device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); - device->mem_free(d_input); - device->mem_free(d_output); + 
device->mem_copy_from("d_output", d_output, 0, 1, d_output.size(), sizeof(float4)); + device->mem_free("d_input", d_input); + device->mem_free("d_output", d_output); /* read result */ done.clear(); diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index ec85aa8f80bd5f9eca9b887754d600e040a6bfb7..230ba7675ec5a096f80db3453a9b330c458267dd 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -471,13 +471,13 @@ void ObjectManager::device_update_flags(Device *device, void ObjectManager::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->objects); + device->tex_free("__objects", dscene->objects); dscene->objects.clear(); - device->tex_free(dscene->objects_vector); + device->tex_free("__objects_vector", dscene->objects_vector); dscene->objects_vector.clear(); - device->tex_free(dscene->object_flag); + device->tex_free("__object_flag", dscene->object_flag); dscene->object_flag.clear(); } diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp index 8f9e8c6d6391cf24a21f8b5a02001704a2a0ed3f..9a3751ae21616bc7f18be7414be5ec1354cd820a 100644 --- a/intern/cycles/render/particles.cpp +++ b/intern/cycles/render/particles.cpp @@ -111,7 +111,7 @@ void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, S void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->particles); + device->tex_free("__particles", dscene->particles); dscene->particles.clear(); } diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 9842cd10a1e170fe7a4869546eefbde6079ef930..5efae1ec40d59ed0abf9a25a0a7f42a8d48b20f7 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -60,7 +60,7 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_) bake_manager = new BakeManager(); /* OSL only works on the CPU */ - if(device_info_.type == DEVICE_CPU) + 
if(device_info_.type == DEVICE_CPU ||device_info_.type == DEVICE_MPI || device_info_.type == DEVICE_OMP) shader_manager = ShaderManager::create(this, params.shadingsystem); else shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM); diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 84a420ce9b68229883386de6c2cdc14fbb0073d7..1a36f3e4fd39601f0d3d9fd2db66fd0a875846cf 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -49,7 +49,7 @@ Session::Session(const SessionParams& params_) max(params.device.multi_devices.size(), 1)), stats() { - device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background); + device_use_gl = ((params.device.type != DEVICE_CPU && params.device.type != DEVICE_MPI && params.device.type != DEVICE_OMP) && !params.background); TaskScheduler::init(params.threads); @@ -376,6 +376,11 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.h = tile.h; rtile.start_sample = tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; + + rtile.num_samples_orig = tile_manager.num_samples; + rtile.progressive = tile_manager.progressive; + rtile.half_float = (display != NULL) ? 
display->half_float : false; + rtile.resolution = tile_manager.state.resolution_divider; tile_lock.unlock(); @@ -830,7 +835,7 @@ void Session::update_status_time(bool show_pause, bool show_done) /* update status */ string status, substatus; - if(!params.progressive) { + if (!params.progressive) { const int progress_sample = progress.get_sample(), num_samples = tile_manager.num_samples; const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL; const bool is_multidevice = params.device.multi_devices.size() > 1; @@ -839,8 +844,8 @@ void Session::update_status_time(bool show_pause, bool show_done) substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); - if((is_gpu && !is_multidevice && !device->info.use_split_kernel) || - (is_cpu && (num_tiles == 1 || is_last_tile))) + if ((is_gpu && !is_multidevice && !device->info.use_split_kernel) || + (is_cpu && (num_tiles == 1 || is_last_tile))) { /* When using split-kernel (OpenCL) each thread in a tile will be working on a different * sample. 
Can't display sample number when device uses split-kernel @@ -854,12 +859,12 @@ void Session::update_status_time(bool show_pause, bool show_done) */ int status_sample = progress_sample; - if(tile > 1) { + if (tile > 1) { /* sample counter is global for all tiles, subtract samples * from already finished tiles to get sample counter for * current tile only */ - if(is_cpu && is_last_tile && num_tiles > 1) { + if (is_cpu && is_last_tile && num_tiles > 1) { status_sample = num_samples - (num_samples * num_tiles - progress_sample); } else { @@ -869,6 +874,10 @@ void Session::update_status_time(bool show_pause, bool show_done) substatus += string_printf(", Sample %d/%d", status_sample, num_samples); } + else if (params.device.type == DEVICE_MPI || params.device.type == DEVICE_OMP) + { + substatus = string_printf("Path Tracing Tile %d/%d", device->get_tile_id(), device->get_num_tiles()); + } } else if(tile_manager.num_samples == INT_MAX) substatus = string_printf("Path Tracing Sample %d", sample+1); @@ -900,9 +909,10 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_tile(tile, tile_time); } -void Session::update_progress_sample() +void Session::update_progress_sample(int s) { - progress.increment_sample(); + //progress.increment_sample(); + progress.set_sample(s); } void Session::path_trace() @@ -914,7 +924,7 @@ void Session::path_trace() task.release_tile = function_bind(&Session::release_tile, this, _1); task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); - task.update_progress_sample = function_bind(&Session::update_progress_sample, this); + task.update_progress_sample = function_bind(&Session::update_progress_sample, this, _1); task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; task.requested_tile_size = params.tile_size; diff --git 
a/intern/cycles/render/session.h b/intern/cycles/render/session.h index c669bccd34bf028f5ce675fd4a7d9c51d6a0ed85..0ac6fffc49df000ecb4d94116f980570811f42b8 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -174,7 +174,7 @@ protected: void update_tile_sample(RenderTile& tile); void release_tile(RenderTile& tile); - void update_progress_sample(); + void update_progress_sample(int s); bool device_use_gl; diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 09a6061abea8561b4f784835deef3fb3f54fff5b..6b8b82cee36d434dcaee4bb7ea173150f01d34a5 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -331,7 +331,7 @@ void ShaderManager::device_update_common(Device *device, Scene *scene, Progress& /*progress*/) { - device->tex_free(dscene->shader_flag); + device->tex_free("__shader_flag", dscene->shader_flag); dscene->shader_flag.clear(); if(scene->shaders.size() == 0) @@ -423,7 +423,7 @@ void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scen beckmann_table_offset = TABLE_OFFSET_INVALID; } - device->tex_free(dscene->shader_flag); + device->tex_free("__shader_flag", dscene->shader_flag); dscene->shader_flag.clear(); } diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index f3d39c1bd72ddd01923213c90106ae1c50a6d290..6228d0b60bf5f5846586830438de625d32667859 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -103,7 +103,7 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s { device_free_common(device, dscene, scene); - device->tex_free(dscene->svm_nodes); + device->tex_free("__svm_nodes", dscene->svm_nodes); dscene->svm_nodes.clear(); } diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp index ad3f486607229e81349754232ef5ad531c3af16d..f0c915d3851c0c45a4f085adde5888550d471c99 100644 --- a/intern/cycles/render/tables.cpp +++ 
b/intern/cycles/render/tables.cpp @@ -42,7 +42,7 @@ void LookupTables::device_update(Device *device, DeviceScene *dscene) if(!need_update) return; - device->tex_free(dscene->lookup_table); + device->tex_free("__lookup_table", dscene->lookup_table); if(lookup_tables.size() > 0) device->tex_alloc("__lookup_table", dscene->lookup_table); @@ -52,7 +52,7 @@ void LookupTables::device_update(Device *device, DeviceScene *dscene) void LookupTables::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->lookup_table); + device->tex_free("__lookup_table", dscene->lookup_table); dscene->lookup_table.clear(); } diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 3fb60735b65a8fde0cd0cf45281dc84d0f142039..a6a764d6e1461001809a299676deb6919e602e13 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -110,6 +110,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_) params = params_; int divider = 1; + +#if !defined(WITH_IT4I_MIC_OFFLOAD) && !defined(WITH_IT4I_MPI) && !defined(WITH_OPENMP) int w = params.width, h = params.height; if(start_resolution != INT_MAX) { @@ -120,7 +122,7 @@ void TileManager::reset(BufferParams& params_, int num_samples_) divider *= 2; } } - +#endif num_samples = num_samples_; state.buffer = BufferParams(); diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 700e00c9e0ad3ef0daa4bd0ee0444ac46f4347fe..48ce0cd891c8d95ee340dd2f3849ac75988c60b3 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -70,6 +70,7 @@ public: } state; int num_samples; + bool progressive; TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution, bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); @@ -85,8 +86,7 @@ public: protected: void set_tiles(); - - bool progressive; + int2 tile_size; TileOrder tile_order; int start_resolution; diff --git 
a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 1fef0bd044e97cf131c971b34e08f4418bd235ed..2bff9a1322952c884614600901ad077b40bb08c3 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -18,6 +18,7 @@ #define __UTIL_OPTIMIZATION_H__ #ifndef __KERNEL_GPU__ +#ifndef __KERNEL_MIC__ /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) || \ @@ -117,5 +118,7 @@ #endif +#endif + #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 0b35142ddb36d9a29250b32a71f538387b9d0c4b..d21594f7833cb69addbbffd81ec4944f43bd2127 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -192,6 +192,13 @@ public: sample++; } + + void set_sample(int s) + { + thread_scoped_lock lock(progress_mutex); + + sample = s; + } void increment_sample_update() { diff --git a/it4i/client/CMakeLists.txt b/it4i/client/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b009608692c434ce76350d8dc4db020a50bcf84 --- /dev/null +++ b/it4i/client/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 2.8) + +project (blender_client) +#set( CMAKE_VERBOSE_MAKEFILE on ) + +option(WITH_IT4I_MIC_OFFLOAD "Enable MIC (has to be supported by the compiler)" OFF) +option(WITH_IT4I_MIC_NATIVE "Enable MIC_NATIVE (has to be supported by the compiler)" OFF) + + +if(WITH_IT4I_MIC_NATIVE) + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -mmic" CACHE STRING "CMAKE_CXX_FLAGS_RELEASE" FORCE) + set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -mmic" CACHE STRING "CMAKE_CXX_FLAGS_DEBUG" FORCE) + + set(MIC_FLAG "-mic") +else() + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "CMAKE_CXX_FLAGS_RELEASE" FORCE) + set(CMAKE_CXX_FLAGS_DEBUG "-g -O0" CACHE STRING "CMAKE_CXX_FLAGS_DEBUG" FORCE) +endif() + + +set(CMAKE_CXX_FLAGS "-qopenmp") + +add_definitions( + -DCCL_NAMESPACE_BEGIN= + -DCCL_NAMESPACE_END= +) + 
+# Native MIC and offload MIC builds are mutually exclusive: stop configuring if both are requested.
+if(WITH_IT4I_MIC_NATIVE AND WITH_IT4I_MIC_OFFLOAD)
+    message(FATAL_ERROR "The flags WITH_IT4I_MIC_NATIVE and WITH_IT4I_MIC_OFFLOAD are not compatible.")
+endif()
+
+# Subdirectories: main and cycles_mpi are always built.
+add_subdirectory(main)
+add_subdirectory(cycles_mpi)
+
+if(WITH_IT4I_MIC_NATIVE)
+    add_subdirectory(cycles_mic) # native build: MIC kernel instead of the OpenMP kernel
+else()
+    add_subdirectory(cycles_omp)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+    add_subdirectory(cycles_mic) # offload build: MIC kernel in addition to the OpenMP kernel
+endif()
+
diff --git a/it4i/client/api/client_api.h b/it4i/client/api/client_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..180eca0949613d5fee2a47c74aeccf7d5af72b7e
--- /dev/null
+++ b/it4i/client/api/client_api.h
@@ -0,0 +1,148 @@
+#ifndef __client_api_H__
+#define __client_api_H__
+
+#include <cstdio>
+
+// Tag values reserved for the Cycles MPI protocol (1000-1999); presumably stored in mpi_kernel_struct::mpi_tag - confirm against the dispatcher.
+#define MPI_TAG_mpi_cycles_start 1000
+
+#define MPI_TAG_mpi_const_copy 1001
+#define MPI_TAG_mpi_tex_copy 1002
+#define MPI_TAG_mpi_path_trace 1003
+#define MPI_TAG_mpi_branched_path_trace 1004
+#define MPI_TAG_mpi_film_convert_half 1005
+#define MPI_TAG_mpi_film_convert_byte 1006
+#define MPI_TAG_mpi_bake 1007
+#define MPI_TAG_mpi_shader 1008
+
+#define MPI_TAG_mpi_alloc_kg 1010
+#define MPI_TAG_mpi_free_kg 1011
+#define MPI_TAG_mpi_mem_alloc 1012
+#define MPI_TAG_mpi_mem_copy_to 1013
+#define MPI_TAG_mpi_mem_copy_from 1014
+#define MPI_TAG_mpi_mem_zero 1015
+#define MPI_TAG_mpi_mem_free 1016
+
+#define MPI_TAG_mpi_path_trace_buffer 1017
+#define MPI_TAG_mpi_path_trace_rng_state 1018
+//#define MPI_TAG_mpi_path_trace_buffer_sample 1019
+#define MPI_TAG_mpi_path_trace_rgba 1020
+#define MPI_TAG_mpi_path_trace_rng 1021
+#define MPI_TAG_mpi_tex_copy_data 1022
+#define MPI_TAG_mpi_mem_copy_to_data 1023
+#define MPI_TAG_mpi_mem_copy_from_data 1024
+#define MPI_TAG_mpi_tex_free 1025
+
+#define MPI_TAG_mpi_cycles_end 1999
+
+// Shared limits and the integer type used to carry memory handles in the structs below.
+#define MPI_NAME_MAX_LENGTH 256 // capacity of the fixed-size name[] fields below
+#define DEVICE_PTR unsigned long long // memory handle; cycles_mpi.cpp casts buffer pointers to and from it
+//#define MAX_NODE_DEVICES 3
+//#define TILE_STEP 4
+// Per-command parameter blocks; each one is embedded by value in mpi_kernel_struct.
+/* Path tracing: buffer handles, sample range and tile geometry. */
+struct mpi_path_trace_struct
+{
+    DEVICE_PTR buffer; // handle of the render buffer
+    DEVICE_PTR rng_state; // handle of the RNG state buffer
+    int start_sample; // first sample to render
+    int num_samples; // the client renders up to start_sample + num_samples
+    bool progressive;
+    int tile_x;
+    int tile_y;
+    int offset; // the client addresses pixels as offset + x + y * stride
+    int stride;
+    int tile_h;
+    int tile_w;
+
+    DEVICE_PTR rgba_pixels; // handle of the RGBA output; when 0 the client returns `buffer` data instead
+    bool half_float; // forwarded unchanged to the path trace kernel call
+    size_t kg_data_size;
+
+    //bool enable_mics;
+};
+
+/* Film conversion parameters. */
+struct mpi_film_convert_struct
+{
+    DEVICE_PTR rgba;
+    DEVICE_PTR buffer;
+    float sample_scale;
+    int offset;
+    int stride;
+    int task_x;
+    int task_y;
+    int task_h;
+    int task_w;
+};
+
+/* Shader evaluation: bake parameters. */
+struct mpi_bake_struct
+{
+    DEVICE_PTR input;
+    DEVICE_PTR output;
+    int type;
+    int task_shader_x;
+    int task_shader_w;
+    int offset;
+    int sample;
+};
+
+struct mpi_shader_struct
+{
+    DEVICE_PTR input;
+    DEVICE_PTR output;
+    int type;
+    int task_shader_x;
+    int task_shader_w;
+    int sample;
+};
+
+struct mpi_mem_struct
+{
+    DEVICE_PTR mem; // handle; the client uses it as the key of its handle-to-buffer map
+    char name[MPI_NAME_MAX_LENGTH];
+    size_t offset; // NOTE(review): not applied by the visible client code (its use is commented out there)
+    size_t memSize; // buffer size in bytes
+};
+
+struct mpi_const_copy_struct
+{
+    char name[MPI_NAME_MAX_LENGTH];
+    DEVICE_PTR host;
+    size_t size; // number of bytes broadcast from rank 0
+};
+
+struct mpi_tex_copy_struct
+{
+    char name[MPI_NAME_MAX_LENGTH];
+    DEVICE_PTR mem; // handle; key of the client-side copy of the texture data
+    size_t size; // texture data size in bytes
+    size_t width;
+    size_t height;
+    size_t depth;
+    int interpolation;
+    int extension;
+};
+
+// Envelope exchanged between ranks: tag, rank information and one parameter block per command.
+struct mpi_kernel_struct
+{
+    int mpi_tag;
+    int world_size;
+    int world_rank; // the client derives its worker index as world_rank - 1
+    //bool enable_mics;
+
+    // Parameter blocks; presumably only the one matching mpi_tag is meaningful - confirm with the sender.
+    mpi_path_trace_struct mpi_path_trace_data;
+    mpi_film_convert_struct mpi_film_convert_data;
+    mpi_bake_struct mpi_bake_data;
+    mpi_shader_struct mpi_shader_data;
+    mpi_mem_struct mpi_mem_data;
+    mpi_const_copy_struct mpi_const_copy_data;
+    mpi_tex_copy_struct mpi_tex_copy_data;
+};
+
+#endif /* __client_api_H__ */
+
diff --git 
a/it4i/client/cycles_mic/CMakeLists.txt b/it4i/client/cycles_mic/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..102e0bd8a871fe270aa4112d95ebecb8539c9857
--- /dev/null
+++ b/it4i/client/cycles_mic/CMakeLists.txt
@@ -0,0 +1,34 @@
+set(INC
+    .
+    ../../../intern/cycles/util
+    ../../../intern/cycles/kernel
+    ../../../intern/cycles/kernel/kernels/mic
+    ../../../intern/cycles/kernel/kernels/mpi
+    ../../../intern/cycles/kernel/kernels/omp
+    ../api
+    ${MPI_INCLUDE_DIR} # NOTE(review): not set in the visible CMake files; presumably passed on the command line - confirm
+)
+
+set(SRC
+    ../../../intern/cycles/kernel/kernels/mic/kernel_mic.cpp
+)
+
+add_definitions(-DBLENDER_CLIENT)
+
+if(WITH_IT4I_MIC_NATIVE)
+    add_definitions(-DWITH_IT4I_MIC_NATIVE)
+    set_source_files_properties(../../../intern/cycles/kernel/kernels/mic/kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-qoffload=none") # native build: compile the kernel without offload
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+    add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+    #-ip -fp-model fast=2
+    #set_source_files_properties(../../intern/cycles/kernel/kernels/mic/kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-g -O0 -qoffload-attribute-target=mic")
+    set_source_files_properties(../../../intern/cycles/kernel/kernels/mic/kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-qoffload-attribute-target=mic") # offload build: target the MIC for the kernel source
+endif()
+
+
+include_directories(${INC})
+add_library(cycles_mic${MIC_FLAG} SHARED ${SRC}) # MIC_FLAG is "-mic" for native builds (set by the parent CMakeLists.txt), empty otherwise
+
+install (TARGETS cycles_mic${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/cycles_mpi/CMakeLists.txt b/it4i/client/cycles_mpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a44ae8d66f80a73d07d54059a0009649a67717ec
--- /dev/null
+++ b/it4i/client/cycles_mpi/CMakeLists.txt
@@ -0,0 +1,47 @@
+set(INC
+    .
+    ../../../intern/cycles/util
+    ../../../intern/cycles/kernel
+    ../../../intern/cycles/kernel/kernels/mic
+    ../../../intern/cycles/kernel/kernels/mpi
+    ../../../intern/cycles/kernel/kernels/omp
+    ../api
+    ${MPI_INCLUDE_DIR} # NOTE(review): not set in the visible CMake files; presumably passed on the command line - confirm
+)
+
+set(SRC
+    cycles_mpi.cpp
+)
+
+set(SRC_HEADERS
+    cycles_mpi.h
+)
+
+add_definitions(-DBLENDER_CLIENT)
+
+if(WITH_IT4I_MIC_NATIVE)
+    add_definitions(-DWITH_IT4I_MIC_NATIVE)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+    add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+
+include_directories(${INC})
+add_library(cycles_mpi${MIC_FLAG} SHARED ${SRC} ${SRC_HEADERS})
+target_link_libraries(cycles_mpi${MIC_FLAG} ${MPI_LIB_FILE}) # NOTE(review): MPI_LIB_FILE is not set in the visible CMake files - confirm where it comes from
+
+if(WITH_IT4I_MIC_NATIVE)
+    add_dependencies(cycles_mpi${MIC_FLAG} cycles_mic${MIC_FLAG}) # native build: the MIC kernel library is the CPU-side kernel
+    target_link_libraries(cycles_mpi${MIC_FLAG} cycles_mic${MIC_FLAG})
+else()
+    add_dependencies(cycles_mpi${MIC_FLAG} cycles_omp${MIC_FLAG}) # otherwise the OpenMP kernel library is used
+    target_link_libraries(cycles_mpi${MIC_FLAG} cycles_omp${MIC_FLAG})
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+    add_dependencies(cycles_mpi${MIC_FLAG} cycles_mic${MIC_FLAG}) # offload build: link the MIC kernel library as well
+    target_link_libraries(cycles_mpi${MIC_FLAG} cycles_mic${MIC_FLAG})
+endif()
+
+install (TARGETS cycles_mpi${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/cycles_mpi/cycles_mpi.cpp b/it4i/client/cycles_mpi/cycles_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..073d09979f1ae2bc1e510b24e2ecaabaf723d385
--- /dev/null
+++ b/it4i/client/cycles_mpi/cycles_mpi.cpp
@@ -0,0 +1,1169 @@
+#include "cycles_mpi.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include <map>
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#include "kernel_mic.h" // mic_* entry points used for the offload devices
+#endif
+
+#ifdef WITH_IT4I_MIC_NATIVE
+#include "kernel_mic.h" // native build: mic_* entry points are also used for the host-side kernel
+#else
+#include "kernel_omp.h" // omp_* entry points used for the host-side kernel
+#endif
+
+#include <omp.h>
+#include <mpi.h>
+
+#include <unistd.h>
+
+#define SIZEOF_UCHAR4 (sizeof(unsigned char)*4) // bytes per RGBA byte pixel
+
+CCL_NAMESPACE_BEGIN
+
+struct sMpiData // per-session client state, recreated by mpi_alloc_kg()
+{
+    DEVICE_PTR kernel_globals_cpu; // kernel globals handle returned by omp_alloc_kg / mic_alloc_kg for device -1
+    std::vector<DEVICE_PTR> kernel_globals_mics; // one kernel globals handle per offload MIC device
+    
std::map<DEVICE_PTR, DEVICE_PTR> ptr_map; // host handle -> address of the client-side buffer (allocated with new char[])
+};
+
+sMpiData *mpiData = NULL;
+
+// Look up the client-side buffer registered for the host handle `key`.
+// Returns NULL when no session exists or the handle is unknown. find() is
+// used instead of operator[] so that a failed lookup never inserts an entry.
+static char *mpi_find_buffer(DEVICE_PTR key)
+{
+    if (mpiData == NULL)
+        return NULL;
+
+    std::map<DEVICE_PTR, DEVICE_PTR>::iterator it = mpiData->ptr_map.find(key);
+    return (it != mpiData->ptr_map.end()) ? (char*) it->second : NULL;
+}
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+
+// Append to `result` every token of `s` that is terminated by `delimiter`.
+// Text after the last delimiter is deliberately not appended (the original
+// code had that push_back disabled); presumably because the parsed tool
+// output always ends with a newline.
+void split(std::vector<std::string> &result, std::string s, std::string delimiter)
+{
+    // An empty delimiter matches at every position and would never advance.
+    if (delimiter.empty())
+        return;
+
+    size_t start = 0;
+    size_t pos = 0;
+    while ((pos = s.find(delimiter, start)) != std::string::npos)
+    {
+        result.push_back(s.substr(start, pos - start));
+        start = pos + delimiter.length();
+    }
+}
+
+// Count the MIC devices on this node as the number of "Device Name" lines
+// printed by `micinfo`. Returns 0 when the command cannot be run or prints
+// nothing (and always on Apple); Windows builds report one "fake" device.
+int micFindDevices()
+{
+#if !defined(_WIN32) && !defined(__APPLE__)
+    FILE *handle = popen("micinfo -group Versions | grep 'Device Name'", "r");
+    if (!handle)
+        return 0;
+
+    char buffer[4096] = {0};
+    size_t len = fread(buffer, 1, sizeof (buffer) - 1, handle);
+    buffer[len] = '\0';
+    pclose(handle);
+
+    if (!buffer[0])
+        return 0;
+
+    std::vector<std::string> strDevices;
+    split(strDevices, std::string(buffer), "\n");
+
+    return (int) strDevices.size();
+#elif defined(_WIN32)
+    return 1; //"fakeMIC"
+#else
+    return 0;
+#endif
+}
+#endif
+
+// ---------------------------------------------------------------------------
+// Command handlers. Each one mirrors a command issued by rank 0; handlers that
+// call MPI_Bcast must always reach that call because it is a collective.
+// ---------------------------------------------------------------------------
+
+// Receive the constant data broadcast by rank 0 and copy it into the host
+// kernel globals and, for offload builds, into every MIC's kernel globals.
+void mpi_const_copy(mpi_kernel_struct &data)
+{
+    const size_t size = data.mpi_const_copy_data.size;
+
+    // Keep at least one element so &kg_data[0] is valid for an empty payload;
+    // the broadcast still transfers exactly `size` bytes.
+    std::vector<char> kg_data(size > 0 ? size : 1);
+    MPI_Bcast(&kg_data[0], (int) size, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_const_copy(-1, mpiData->kernel_globals_cpu, data.mpi_const_copy_data.name, &kg_data[0], size);
+#else
+    omp_const_copy(-1, mpiData->kernel_globals_cpu, data.mpi_const_copy_data.name, &kg_data[0], size);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_const_copy(dev, mpiData->kernel_globals_mics[dev], data.mpi_const_copy_data.name, &kg_data[0], size);
+    }
+#endif
+}
+
+// Allocate a client-side copy of a texture, receive its contents from rank 0
+// and bind it to the kernel globals under the texture's name.
+void mpi_tex_copy(mpi_kernel_struct &data)
+{
+    mpi_tex_copy_struct &tex = data.mpi_tex_copy_data;
+
+    // NOTE(review): an existing mapping for the same handle is overwritten
+    // without being freed; presumably the host always sends tex_free first.
+    char *tex_mem = new char[tex.size];
+    mpiData->ptr_map[tex.mem] = (DEVICE_PTR) tex_mem;
+    MPI_Bcast(tex_mem, (int) tex.size, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_tex_copy(-1, mpiData->kernel_globals_cpu,
+            tex.name,
+            tex_mem,
+            tex.size,
+            tex.width,
+            tex.height,
+            tex.depth,
+            tex.interpolation,
+            tex.extension);
+#else
+    omp_tex_copy(-1, mpiData->kernel_globals_cpu,
+            tex.name,
+            tex_mem,
+            tex.size,
+            tex.width,
+            tex.height,
+            tex.depth,
+            tex.interpolation,
+            tex.extension);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_tex_copy(dev, mpiData->kernel_globals_mics[dev],
+                tex.name,
+                tex_mem,
+                tex.size,
+                tex.width,
+                tex.height,
+                tex.depth,
+                tex.interpolation,
+                tex.extension);
+    }
+#endif
+}
+
+// Start a new session: replace the global state and allocate kernel globals
+// for the host and, for offload builds, for every detected MIC.
+void mpi_alloc_kg(mpi_kernel_struct &data)
+{
+    // NOTE(review): buffers still registered in the previous session's
+    // ptr_map are not released here; presumably the host frees them first.
+    delete mpiData; // deleting NULL is a no-op
+    mpiData = new sMpiData();
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mpiData->kernel_globals_cpu = mic_alloc_kg(-1);
+#else
+    mpiData->kernel_globals_cpu = omp_alloc_kg(-1);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    int mics = micFindDevices();
+    if (mics > 0/* && data.enable_mics*/)
+    {
+        mpiData->kernel_globals_mics.resize(mics);
+    }
+
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mpiData->kernel_globals_mics[dev] = mic_alloc_kg(dev);
+    }
+#endif
+}
+
+// Release the kernel globals of the current session. Does nothing when no
+// session has been created yet.
+void mpi_free_kg(mpi_kernel_struct &data)
+{
+    if (mpiData == NULL)
+        return;
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_free_kg(-1, mpiData->kernel_globals_cpu);
+#else
+    omp_free_kg(-1, mpiData->kernel_globals_cpu);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_free_kg(dev, mpiData->kernel_globals_mics[dev]);
+    }
+#endif
+
+    mpiData->kernel_globals_mics.clear();
+}
+
+// Allocate a zero-filled client-side buffer for the host handle and, for
+// offload builds, allocate its counterpart on every MIC.
+void mpi_mem_alloc(mpi_kernel_struct &data)
+{
+    const size_t size = data.mpi_mem_data.memSize;
+
+    char *mem = new char[size];
+    memset(mem, 0, size);
+    mpiData->ptr_map[data.mpi_mem_data.mem] = (DEVICE_PTR) mem;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_alloc(dev, mem, size);
+    }
+#endif
+}
+
+// Receive the contents of a buffer from rank 0 and, for offload builds,
+// forward them to every MIC.
+void mpi_mem_copy_to(mpi_kernel_struct &data)
+{
+    const size_t size = data.mpi_mem_data.memSize;
+    char *mem = mpi_find_buffer(data.mpi_mem_data.mem);
+
+    if (mem == NULL)
+    {
+        // Unknown handle: the broadcast is collective, so still take part in
+        // it and discard the payload instead of receiving into NULL.
+        std::vector<char> scratch(size > 0 ? size : 1);
+        MPI_Bcast(&scratch[0], (int) size, MPI_BYTE, 0, MPI_COMM_WORLD);
+        return;
+    }
+
+    MPI_Bcast(mem, (int) size, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_copy_to(dev, mem, size, NULL);
+    }
+#endif
+}
+
+// Zero a buffer on the client and, for offload builds, on every MIC.
+// Unknown handles are ignored.
+void mpi_mem_zero(mpi_kernel_struct &data)
+{
+    const size_t size = data.mpi_mem_data.memSize;
+    char *mem = mpi_find_buffer(data.mpi_mem_data.mem);
+    if (mem == NULL)
+        return;
+
+    memset(mem, 0, size);
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_zero(dev, mem, size);
+    }
+#endif
+}
+
+// Release a buffer created by mpi_mem_alloc. Unknown handles are ignored.
+void mpi_mem_free(mpi_kernel_struct &data)
+{
+    char *mem = mpi_find_buffer(data.mpi_mem_data.mem);
+    if (mem == NULL)
+        return;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_free(dev, mem, data.mpi_mem_data.memSize);
+    }
+#endif
+
+    // The buffer was allocated with new char[], so it needs the array form.
+    delete[] mem;
+    mpiData->ptr_map.erase(data.mpi_mem_data.mem);
+}
+
+// Release a texture created by mpi_tex_copy. The handle, name and size are
+// read from mpi_mem_data (not mpi_tex_copy_data). Unknown handles are ignored.
+void mpi_tex_free(mpi_kernel_struct &data)
+{
+    char *mem = mpi_find_buffer(data.mpi_mem_data.mem);
+    if (mem == NULL)
+        return;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < (int) mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_tex_free(dev, mpiData->kernel_globals_mics[dev], data.mpi_mem_data.name, mem, data.mpi_mem_data.memSize);
+    }
+#endif
+
+    // The buffer was allocated with new char[], so it needs the array form.
+    delete[] mem;
+    mpiData->ptr_map.erase(data.mpi_mem_data.mem);
+}
+
+//offline rendering - native, cpu, cpu+offload
+//#ifdef WITH_IT4I_MIC_OFFLOAD
+
+void mpi_path_trace_offline(mpi_kernel_struct &data)
+{
+    ///////////////////////////share nodes////////////////////////////////////
+
+    size_t offsetSample = 0;
+    size_t sizeSample = sizeof (int);
+
+    int reqFinished = 0;
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+    }
+#endif
+    int reqJob = -1;
+    size_t sizeJob = sizeof (int);
+
+    int start_sample = data.mpi_path_trace_data.start_sample;
+    int end_sample = 
data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples; + +#ifdef WITH_IT4I_MIC_NATIVE + int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu); +#else + int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu); +#endif + + int offset = data.mpi_path_trace_data.offset; + int stride = data.mpi_path_trace_data.stride; + + int tile_x = data.mpi_path_trace_data.tile_x; + int tile_w = data.mpi_path_trace_data.tile_w; + + ////////////////////////////one node/////////////////////////////////// + omp_set_nested(1); + + int tile_step_node = 1; //TILE_STEP; + if (getenv("IT4I_OMP_TILE_STEP")) + { + tile_step_node = atoi(getenv("IT4I_OMP_TILE_STEP")); + printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node); + } + + int nprocs_mic = 240; + int nprocs_cpu = omp_get_max_threads() - 1; + + if (getenv("IT4I_OMP_CPU_NUM_THREADS")) + { + nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")) - 1; + } + + if (getenv("IT4I_OMP_MIC_NUM_THREADS")) + { + nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS")); + } + + int dev_node = data.world_rank - 1; + int devices_size_node = data.world_size - 1; + + int tile_h_node = tile_step_node; + +#ifdef WITH_IT4I_MIC_OFFLOAD + int tile_h_cpu = tile_step_node / 2.0; + if (tile_h_cpu < 1) + tile_h_cpu = 1; + int tile_h_mic = tile_step_node * 1.0 / 4.0; +#else + int tile_h_cpu = tile_step_node; + int tile_h_mic = 0; +#endif + int omp_path_trace_req = 0; + + int size_node = tile_h_node * tile_w; + int size_cpu = tile_h_cpu * tile_w; + int size_mic = tile_h_mic * tile_w; + + //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); + size_t sizeBuf_node = size_node * pass_stride * sizeof (float); + size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float); + size_t sizeBuf_mic = size_mic * pass_stride * sizeof (float); + + //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; + size_t sizeByte_node = size_node * SIZEOF_UCHAR4; + size_t 
sizeByte_cpu = size_cpu * SIZEOF_UCHAR4; + size_t sizeByte_mic = size_mic * SIZEOF_UCHAR4; + + //int sample_finished_node = 0; + ////////////////////////////MICS////////////////////////////////////// + int signal1, signal2, signal3, signal4; + + const int num_devices_cpu_mics = mpiData->kernel_globals_mics.size() + 1; + //const int num_devices_mics = mpiData->kernel_globals_mics.size(); + + std::vector<int> sample_finished_devices(num_devices_cpu_mics); + + reqJob = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node; + std::vector<int> tile_y_devices(num_devices_cpu_mics); + + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { + sample_finished_devices[dev] = end_sample; + tile_y_devices[dev] = 0; + +#ifdef WITH_IT4I_MIC_OFFLOAD + if (dev > 0) + { + mic_mem_alloc(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int)); + } +#endif + } + + //tile_y_devices[0] = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node; + + //int signal1, signal2, signal3, signal4; + + ////////////////////////////////////////////////////////////////// + +#pragma omp parallel num_threads(2) + { +#pragma omp single nowait + { +#pragma omp task + { + while (reqFinished == 0) + { +#pragma omp flush + if (omp_path_trace_req != 0) + { + //mic_path_trace(0, mpiData->kernel_globals_mics[0], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices + tile_h_cpu, offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[1], (char*) &reqFinished, nprocs_mic, signal1); + //mic_path_trace(1, mpiData->kernel_globals_mics[1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices + tile_h_cpu + tile_h_mic, offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[2], (char*) &reqFinished, nprocs_mic, signal2); + printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev_node, sample_finished_devices[0], end_sample, tile_y_devices[0], tile_h_cpu); + fflush(0); +#ifdef WITH_IT4I_MIC_NATIVE + mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_mic, NULL); +#else + omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_cpu, NULL); +#endif + //mic_wait(0, signal1); + //mic_wait(1, signal2); + + omp_path_trace_req = 0; + } + usleep(100); + } + } + +#pragma omp task + { + while (true) + { + int min_count = end_sample; + + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { + if (reqJob >= 0) + { + sample_finished_devices[dev] = start_sample; + + if (dev == 0) + { + tile_y_devices[dev] = reqJob; + omp_path_trace_req = 1; + } + else + { +#ifdef WITH_IT4I_MIC_OFFLOAD + if (dev == 1) + { + tile_y_devices[dev] = reqJob + tile_h_cpu; + + //printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h_mic); + //fflush(0); + mic_path_trace(0, mpiData->kernel_globals_mics[dev - 1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal1); + } + if (dev == 2) + { + tile_y_devices[dev] = reqJob + tile_h_cpu + tile_h_mic; + + //printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h_mic); + //fflush(0); + mic_path_trace(1, mpiData->kernel_globals_mics[dev - 1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal2); + } +#endif + } + } +#ifdef WITH_IT4I_MIC_OFFLOAD + if (dev > 0) + { + if (tile_y_devices[dev] != 0) + { + if (data.mpi_path_trace_data.rgba_pixels != NULL) + { + size_t offsetByte_mic = (offset + tile_x + (tile_y_devices[dev]) * stride) * SIZEOF_UCHAR4; + mic_mem_copy_from(dev - 1, (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels], offsetByte_mic, sizeByte_mic, NULL); //(char*) &data.mpi_path_trace_data.rgba_pixels); + } + else + { + size_t offsetBuf_mic = (offset + tile_x + (tile_y_devices[dev]) * stride) * pass_stride * sizeof (float); + mic_mem_copy_from(dev - 1, (char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer], offsetBuf_mic, sizeBuf_mic, NULL); // (char*) &data.mpi_path_trace_data.buffer); + } + } + } +#endif +#pragma omp flush + if (min_count > sample_finished_devices[dev]) + min_count = sample_finished_devices[dev]; + } + + int req = 
(min_count == end_sample) ? 0 : 1; + +#pragma omp flush + MPI_Gatherv(&req, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Gatherv(&tile_y_devices[0], sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + + if (data.mpi_path_trace_data.rgba_pixels != NULL) + { + size_t offsetByte_node = (offset + tile_x + tile_y_devices[0] * stride) * SIZEOF_UCHAR4; + MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + } + else + { + size_t offsetBuf_node = (offset + tile_x + tile_y_devices[0] * stride) * pass_stride * sizeof (float); + MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + } + + MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD); + + if (reqFinished != 0) + { + break; + } + } + } + } + +#pragma omp taskwait + } +#ifdef WITH_IT4I_MIC_OFFLOAD + for (int dev = 0; dev < num_devices_cpu_mics; dev++) + { + if (dev > 0) + { + //mic_wait(dev, (char*)&reqFinished); + //mic_wait(dev - 1, (char*) &data.mpi_path_trace_data.rng_state); + if (dev == 1) + mic_wait(dev - 1, signal1); + + if (dev == 2) + mic_wait(dev - 1, signal2); + + mic_mem_free(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int)); + mic_mem_free(dev - 1, (char*) &reqFinished, sizeof (int)); + } + } +#endif + + MPI_Gatherv(&tile_y_devices[0], sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + + if (data.mpi_path_trace_data.rgba_pixels != NULL) + { + size_t offsetByte_node = (offset + tile_x + tile_y_devices[0] * stride) * SIZEOF_UCHAR4; + MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + } + else + { + size_t offsetBuf_node = 
(offset + tile_x + tile_y_devices[0] * stride) * pass_stride * sizeof (float); + MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); + } +} +//#endif + +//void mpi_path_trace_offline(mpi_kernel_struct &data) +//{ +// //printf("CLIENT: mpi_path_trace_offline\n"); +// ///////////////////////////share nodes//////////////////////////////////// +// +// size_t offsetSample = 0; +// size_t sizeSample = sizeof (int); +// +// int reqFinished = 0; +// int reqJob = -1; +// size_t sizeJob = sizeof (int); +// +// int start_sample = data.mpi_path_trace_data.start_sample; +// int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples; +// +//#ifdef WITH_IT4I_MIC_NATIVE +// int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu); +//#else +// int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu); +//#endif +// +// int offset = data.mpi_path_trace_data.offset; +// int stride = data.mpi_path_trace_data.stride; +// +// int tile_x = data.mpi_path_trace_data.tile_x; +// int tile_w = data.mpi_path_trace_data.tile_w; +// +// ////////////////////////////one node/////////////////////////////////// +// omp_set_nested(1); +// +// int nprocs_mic = 240; +// int nprocs_cpu = omp_get_max_threads() - 1; +// +// if (getenv("IT4I_OMP_CPU_NUM_THREADS")) +// { +// nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")) - 1; +// } +// +// if (getenv("IT4I_OMP_MIC_NUM_THREADS")) +// { +// nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS")); +// } +// +// int dev_node = data.world_rank - 1; +// int devices_size_node = data.world_size - 1; +// +// int tile_step_node = 1; //TILE_STEP; +// if (getenv("IT4I_OMP_TILE_STEP")) +// { +// tile_step_node = atoi(getenv("IT4I_OMP_TILE_STEP")); +// printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node); +// } +// +// int tile_h_node = tile_step_node; +// int omp_path_trace_req = 1; +// +// int size_node = 
tile_h_node * tile_w; +// +// //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); +// size_t sizeBuf_node = size_node * pass_stride * sizeof (float); +// +// //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; +// size_t sizeByte_node = size_node * SIZEOF_UCHAR4; +// +// //int sample_finished_node = 0; +// ////////////////////////////MICS////////////////////////////////////// +// +// //const int num_devices_cpu_mics = 1; +// //const int num_devices_mics = mpiData->kernel_globals_mics.size(); +// +// int sample_finished_devices = 0; +// int tile_y_devices = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node; +// +// ////////////////////////////////////////////////////////////////// +// +//#pragma omp parallel shared(omp_path_trace_req) num_threads(2) +// { +//#pragma omp single nowait +// { +//#pragma omp task shared(omp_path_trace_req) +// { +// while (reqFinished == 0) +// { +//#pragma omp flush +// if (omp_path_trace_req != 0) +// { +// //printf("CLIENT: omp_path_trace: %d, %d, %f\n", 0, tile_y_devices, omp_get_wtime()); +// +//#ifdef WITH_IT4I_MIC_NATIVE +// mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices, (char*) &reqFinished, nprocs_cpu, NULL); +//#else +// omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices, (char*) &reqFinished, nprocs_cpu, NULL); +//#endif +// //usleep(100); +// omp_path_trace_req = 0; +// } +// usleep(100); +// } +// } +// +//#pragma omp task shared(omp_path_trace_req) +// { +// while (true) +// { +//#pragma omp flush +// MPI_Gatherv(&omp_path_trace_req, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// //printf("CLIENT: tile_y_devices: %d\n", tile_y_devices); +// MPI_Gatherv(&tile_y_devices, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// if (data.mpi_path_trace_data.rgba_pixels != NULL) +// { +// size_t offsetByte_node = (offset + tile_x + tile_y_devices * stride) * SIZEOF_UCHAR4; +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// else +// { +// size_t offsetBuf_node = (offset + tile_x + tile_y_devices * stride) * pass_stride * sizeof (float); +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// +// MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// if (reqJob >= 0) +// { +// sample_finished_devices = start_sample; +// tile_y_devices = reqJob; +// omp_path_trace_req = 1; +// } +// +// +// MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD); +// if (reqFinished != 0) +// { +// //printf("CLIENT: finished %f\n", omp_get_wtime()); +// //fflush(0); +// break; +// } +// +// +// +// } +// } +// } +// +//#pragma omp taskwait +// } +// +// MPI_Gatherv(&tile_y_devices, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// if 
(data.mpi_path_trace_data.rgba_pixels != NULL) +// { +// size_t offsetByte_node = (offset + tile_x + tile_y_devices * stride) * SIZEOF_UCHAR4; +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// else +// { +// size_t offsetBuf_node = (offset + tile_x + tile_y_devices * stride) * pass_stride * sizeof (float); +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +//} + +//#endif + + +//void mpi_path_trace_progressive(mpi_kernel_struct &data) +//{ +// printf("CLIENT: mpi_path_trace_progressive\n"); +// ///////////////////////////share nodes//////////////////////////////////// +// +// size_t offsetSample = 0; +// size_t sizeSample = sizeof (int); +// +// int reqFinished = -1; +// int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples; +// +//#ifdef WITH_IT4I_MIC_NATIVE +// int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu); +//#else +// int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu); +//#endif +// +// int offset = data.mpi_path_trace_data.offset; +// int stride = data.mpi_path_trace_data.stride; +// +// int tile_x = data.mpi_path_trace_data.tile_x; +// int tile_w = data.mpi_path_trace_data.tile_w; +// +// ////////////////////////////one node/////////////////////////////////// +// omp_set_nested(1); +// int nprocs_cpu = omp_get_max_threads() - 1; +// //printf("nprocs_cpu: %d\n", nprocs_cpu); +// +// int dev_node = data.world_rank - 1; +// int devices_size_node = data.world_size - 1; +// +// int tile_step_node = data.mpi_path_trace_data.tile_h / devices_size_node; +// int tile_last_node = data.mpi_path_trace_data.tile_h - (devices_size_node - 1) * tile_step_node; +// +// int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node; +// int 
tile_h_node = (devices_size_node - 1 == dev_node) ? tile_last_node : tile_step_node; +// +// int size_node = tile_h_node * tile_w; +// +// size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); +// size_t sizeBuf_node = size_node * pass_stride * sizeof (float); +// +// size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; +// size_t sizeByte_node = size_node * SIZEOF_UCHAR4; +// +// int sample_finished_node = 0; +// ////////////////////////////////////////////////////////////////// +// +//#ifdef WITH_IT4I_MIC_NATIVE +// mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, omp_get_max_threads(), NULL); +//#else +// omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, omp_get_max_threads(), NULL); +//#endif +// +// if (data.mpi_path_trace_data.rgba_pixels != NULL) +// { +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// else +// { +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +//} +// +//void mpi_path_trace(mpi_kernel_struct &data) +//{ +// printf("CLIENT: mpi_path_trace\n"); +// ///////////////////////////share nodes//////////////////////////////////// +// +// size_t offsetSample = 0; +// size_t sizeSample = sizeof (int); +// +// int reqFinished = 0; +// int reqJob = -1; +// size_t sizeJob = sizeof (int); +// +// int start_sample = data.mpi_path_trace_data.start_sample; +// int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples; +// +//#ifdef WITH_IT4I_MIC_NATIVE +// int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu); +//#else +// int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu); +//#endif +// +// int offset = data.mpi_path_trace_data.offset; +// int stride = data.mpi_path_trace_data.stride; +// +// int tile_x = data.mpi_path_trace_data.tile_x; +// int tile_w = data.mpi_path_trace_data.tile_w; +// +// ////////////////////////////one node/////////////////////////////////// +// omp_set_nested(1); +// int nprocs_cpu = omp_get_max_threads() - 1; +// +// int dev_node = data.world_rank - 1; +// int devices_size_node = data.world_size - 1; +// +// int tile_step_node = 1; +// +// int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * 
dev_node; +// int tile_h_node = tile_step_node; +// int mic_path_trace_req = 1; +// +// int size_node = tile_h_node * tile_w; +// +// //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); +// size_t sizeBuf_node = size_node * pass_stride * sizeof (float); +// +// //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4); +// size_t sizeByte_node = size_node * SIZEOF_UCHAR4; +// +// int sample_finished_node = 0; +// ////////////////////////////////////////////////////////////////// +// +//#pragma omp parallel num_threads(2) +// { +//#pragma omp single nowait +// { +//#pragma omp task +// { +// while (reqFinished == 0) +// { +//#pragma omp flush +// if (mic_path_trace_req != 0) +// { +// //printf("mic_path_trace - tile_y_node: %d\n", tile_y_node); +//#ifdef WITH_IT4I_MIC_NATIVE +// mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, nprocs_cpu, NULL); +//#else +// omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? 
(char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, nprocs_cpu, NULL); +//#endif +// +// mic_path_trace_req = 0; +// } +// //usleep(1000); +// //#pragma omp wait +// } +// } +// +//#pragma omp task +// { +// while (true) +// { +// //MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD); +// //MPI_Bcast(&reqJob, 1, MPI_INT, 0, MPI_COMM_WORLD); +// +// //printf("mic_path_trace - reqJob: %d\n", reqJob); +// +// MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD); +// //MPI_Gatherv(&sample_finished_node, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// if (reqJob >= 0) +// { +// sample_finished_node = start_sample; +// } +// else if (reqJob == -2) +// { +// reqFinished = 1; +// } +// +//#pragma omp flush +// int sample_finished = sample_finished_node; +// +// MPI_Gatherv(&sample_finished, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// +// if (data.mpi_path_trace_data.rgba_pixels != NULL) +// { +// size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4; +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// else +// { +// size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float); +// MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD); +// } +// +// if (reqJob >= 0) +// { +// tile_y_node = reqJob; +// mic_path_trace_req = 1; +// +// } +// +// if (reqFinished != 0) +// { +// +// break; +// } +// +// } +// } +// } +// +//#pragma omp taskwait +// } +//} + +//interactive rendering - native, 
cpu, cpu+offload
+//
+// Renders this rank's share of a progressive (interactive) tile and hands the
+// pixels of that share to rank 0 through MPI_Gatherv.
+//
+// Work split as computed below:
+//  - the tile rows are divided between the world_size - 1 worker ranks
+//    (rank r acts as node r - 1); the last node also takes the remainder rows;
+//  - inside a node the rows are divided into kernel_globals_mics.size() + 2
+//    shares: in WITH_IT4I_MIC_OFFLOAD builds every offload device is assigned
+//    one share, the host kernel renders all remaining rows.
+//
+// Environment overrides read here: IT4I_OMP_NUM_SAMPLES_SCALE,
+// IT4I_OMP_CPU_NUM_THREADS, IT4I_OMP_MIC_NUM_THREADS.
+//
+// NOTE(review): devices_size_node is world_size - 1, so a run with a single
+// rank divides by zero below - confirm the launcher always starts >= 2 ranks.
+void mpi_path_trace_progressive(mpi_kernel_struct &data)
+{
+    /////////////////////////////share nodes////////////////////////////////////
+
+    size_t offsetSample = 0; // not used in this function
+    size_t sizeSample = sizeof (int); // not used in this function
+
+    int reqFinished = 0; // passed by address to the kernels; never modified in this function (presumably a stop-request flag - confirm)
+
+    //#ifdef WITH_IT4I_MIC_OFFLOAD
+    //    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    //    {
+    //        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+    //    }
+    //#endif
+
+    // Optional multiplier for the number of samples rendered per call.
+    int num_samples_scale = 1;
+    if (getenv("IT4I_OMP_NUM_SAMPLES_SCALE"))
+    {
+        num_samples_scale = atoi(getenv("IT4I_OMP_NUM_SAMPLES_SCALE"));
+    }
+
+    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples * num_samples_scale;
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+#else
+    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+#endif
+
+    int offset = data.mpi_path_trace_data.offset;
+    int stride = data.mpi_path_trace_data.stride;
+
+    int tile_x = data.mpi_path_trace_data.tile_x;
+    int tile_w = data.mpi_path_trace_data.tile_w;
+
+    ////////////////////////////one node///////////////////////////////////
+    //omp_set_nested(1);
+    int nprocs_mic = 240; // default thread count handed to the MIC kernels, see IT4I_OMP_MIC_NUM_THREADS
+    int nprocs_cpu = omp_get_max_threads(); // default thread count for the host kernel, see IT4I_OMP_CPU_NUM_THREADS
+
+    if (getenv("IT4I_OMP_CPU_NUM_THREADS"))
+    {
+        nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS"));
+    }
+
+    if (getenv("IT4I_OMP_MIC_NUM_THREADS"))
+    {
+        nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS"));
+    }
+
+    // Rank 0 is excluded from the split: node index and node count are both shifted by one.
+    int dev_node = data.world_rank - 1;
+    int devices_size_node = data.world_size - 1;
+
+    // Rows per node; the last node gets whatever the integer division leaves over.
+    int tile_step_node = data.mpi_path_trace_data.tile_h / devices_size_node;
+    int tile_last_node = data.mpi_path_trace_data.tile_h - (devices_size_node - 1) * tile_step_node;
+
+    int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+    int tile_h_node = (devices_size_node - 1 == dev_node) ? tile_last_node : tile_step_node;
+
+    int size_node = tile_h_node * tile_w;
+
+    // Byte offset/size of this node's rows in the float pass buffer ...
+    size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+
+    // ... and in the rgba_pixels buffer (SIZEOF_UCHAR4 bytes per pixel).
+    size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+
+    // Number of shares inside the node: one per offload device plus two for the host.
+    int devices_size_cpu_mics = mpiData->kernel_globals_mics.size() + 2;
+
+    int tile_step_cpu_mics = tile_h_node / devices_size_cpu_mics;
+    //int tile_last_cpu_mics = tile_h_node - (devices_size_cpu_mics - 1) * tile_step_cpu_mics;
+
+    int dev_cpu_mics = 0;
+
+    //////////////////////////mic0////////////////////////////////////
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    // NOTE(review): only signal1 and signal2 are used below (devices 0 and 1); signal3/signal4 are unused.
+    int signal1, signal2, signal3, signal4;
+
+    // Per-device bookkeeping, indexed by offload device number.
+    std::vector<int> sample_finished_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> tile_y_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> tile_h_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> size_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> offsetBuf_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> sizeBuf_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> offsetByte_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> sizeByte_mic0(mpiData->kernel_globals_mics.size());
+
+    //sync
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        sample_finished_mic0[dev] = 0;
+
+        // Released again by the mic_mem_free calls at the end of this function.
+        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+        mic_mem_alloc(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+    }
+
+    //async
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        dev_cpu_mics = dev;
+
+        //sample_finished_mic0[dev] = 0;
+        //mic_mem_alloc(dev, (char*)&sample_finished_mic0[dev], sizeof(int));
+
+        // Device dev gets the dev-th share of tile_step_cpu_mics rows of this node.
+        tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+        tile_h_mic0[dev] = tile_step_cpu_mics;
+
+        size_mic0[dev] = tile_h_mic0[dev] * tile_w;
+
+        offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float);
+        sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float);
+
+        offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * SIZEOF_UCHAR4;
+        sizeByte_mic0[dev] = size_mic0[dev] * SIZEOF_UCHAR4;
+
+        //mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, (char *) rng_state);
+        //mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/ (char*) &sample_finished_mic0[dev]);
+
+        // NOTE(review): a kernel is launched for devices 0 and 1 only; a device with dev >= 2
+        // gets rows assigned above and is copied back below, but nothing renders them here.
+        if (dev == 0)
+            mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/signal1);
+        if (dev == 1)
+            mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/signal2);
+    }
+#endif
+    //////////////////////////cpu/////////////////////////////////////
+
+    // The host share starts right after the rows assigned to the offload devices.
+    dev_cpu_mics = mpiData->kernel_globals_mics.size();
+
+    int sample_finished_cpu = 0;
+
+    int tile_y_cpu = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+    // Everything of this node that was not assigned to an offload device.
+    int tile_h_cpu = tile_h_node - (devices_size_cpu_mics - 2) * tile_step_cpu_mics;
+
+    int size_cpu = tile_h_cpu * tile_w;
+
+    // The four values below are computed but not read again in this function.
+    size_t offsetBuf_cpu = (offset + tile_x + tile_y_cpu * stride) * pass_stride * sizeof (float);
+    size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float);
+
+    size_t offsetByte_cpu = (offset + tile_x + tile_y_cpu * stride) * SIZEOF_UCHAR4;
+    size_t sizeByte_cpu = size_cpu * SIZEOF_UCHAR4;
+    //////////////////////////////////////////////////////////////////
+
+    // Host rendering; device index -1 and a NULL last argument are used for this call.
+    // The native MIC build passes nprocs_mic as thread count, the OpenMP build nprocs_cpu.
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_mic, NULL);
+#else
+    omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    // Wait for the offload kernels started above, then copy each device's rows back to the host buffer.
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        //mic_wait(dev, (char*) &data.mpi_path_trace_data.rng_state);
+        //mic_wait(dev, (char*) &sample_finished_mic0[dev]);
+
+        if (dev == 0)
+            mic_wait(dev, signal1);
+
+        if (dev == 1)
+            mic_wait(dev, signal2);
+
+        if (data.mpi_path_trace_data.rgba_pixels != NULL)
+        {
+            mic_mem_copy_from(dev, (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels], offsetByte_mic0[dev], sizeByte_mic0[dev], NULL);
+        }
+        else
+        {
+            mic_mem_copy_from(dev, (char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer], offsetBuf_mic0[dev], sizeBuf_mic0[dev], NULL);
+        }
+    }
+#endif
+
+    // Contribute this node's rows to the gather rooted at rank 0. The receive arguments are
+    // NULL/0 here, so this code presumably never runs on the root itself - confirm.
+    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+    {
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else
+    {
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    // Release the per-call device allocations made in the "sync" loop.
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_free(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+        mic_mem_free(dev, (char*) &reqFinished, sizeof (int));
+    }
+#endif
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Dispatches one received command to its handler, selected by data.mpi_tag.
+// Tags that match none of the branches are ignored.
+void mpi_render(mpi_kernel_struct &data)
+{
+    int action = data.mpi_tag;
+    if (action == MPI_TAG_mpi_const_copy)
+    {
+        mpi_const_copy(data);
+    }
+    else if (action == MPI_TAG_mpi_tex_copy)
+    {
+        mpi_tex_copy(data);
+    }
+    else if (action == MPI_TAG_mpi_path_trace)
+    {
+        // Progressive requests use the static row split; all others use the offline path.
+        if (data.mpi_path_trace_data.progressive)
+        {
+            mpi_path_trace_progressive(data);
+        }
+        else
+        {
+            //#ifdef WITH_IT4I_MIC_OFFLOAD
+            //            mpi_path_trace_offline_offload(data);
+            //#else
+            mpi_path_trace_offline(data);
+            //#endif
+
+        }
+    }
+    else if (action == MPI_TAG_mpi_alloc_kg)
+    {
+        mpi_alloc_kg(data);
+    }
+    else if (action == MPI_TAG_mpi_free_kg)
+    {
+        mpi_free_kg(data);
+    }
+    else if (action == MPI_TAG_mpi_mem_alloc)
+    {
+        mpi_mem_alloc(data);
+    }
+    else if (action == MPI_TAG_mpi_mem_copy_to)
+    {
+        mpi_mem_copy_to(data);
+    }
+    else if (action == MPI_TAG_mpi_mem_zero)
+    {
+        mpi_mem_zero(data);
+    }
+    else if (action == MPI_TAG_mpi_mem_free)
+    {
+        mpi_mem_free(data);
+    }
+    else if (action == MPI_TAG_mpi_tex_free)
+    {
+        mpi_tex_free(data);
+    }
+}
+CCL_NAMESPACE_END
diff --git a/it4i/client/cycles_mpi/cycles_mpi.h b/it4i/client/cycles_mpi/cycles_mpi.h
new file mode 100644
index 0000000000000000000000000000000000000000..31707c15833578ce57851744236c4e66ad45c66f
--- /dev/null
+++ b/it4i/client/cycles_mpi/cycles_mpi.h
@@ -0,0 +1,48 @@
+#ifndef __CYCLES_MPI_H__
+#define __CYCLES_MPI_H__
+
+//#define CCL_NAMESPACE_BEGIN
+//#define CCL_NAMESPACE_END
+
+//#include "kernel_compat_mic.h"
+
+//#include "kernel_mpi.h"
+//#include "util_types.h"
+
+#include "client_api.h"
+
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+void mpi_path_trace(mpi_kernel_struct &data); // NOTE(review): confirm a live definition exists; mpi_render calls the _progressive/_offline variants
+//void mpi_branched_path_trace(mpi_kernel_struct &data);
+//
+///* Film */
+//void mpi_film_convert_half();
+//void mpi_film_convert_byte();
+//
+///* Shader Evaluation */
+//void mpi_bake();
+//void mpi_shader();
+
+/* Device memory: one handler per command tag, each takes the received command struct */
+void mpi_alloc_kg(mpi_kernel_struct &data);
+void mpi_free_kg(mpi_kernel_struct &data);
+
+void mpi_mem_alloc(mpi_kernel_struct &data);
+void mpi_mem_copy_to(mpi_kernel_struct &data);
+//void mpi_mem_copy_from(mpi_kernel_struct &data);
+void mpi_mem_zero(mpi_kernel_struct &data);
+void mpi_mem_free(mpi_kernel_struct &data);
+void mpi_tex_free(mpi_kernel_struct &data);
+
+void mpi_const_copy(mpi_kernel_struct &data);
+void mpi_tex_copy(mpi_kernel_struct &data);
+/* Command dispatcher: runs the handler selected by data.mpi_tag */
+void mpi_render(mpi_kernel_struct &data);
+CCL_NAMESPACE_END
+
+
+#endif /* __CYCLES_MPI_H__ */
+
diff --git a/it4i/client/cycles_omp/CMakeLists.txt b/it4i/client/cycles_omp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..604a6b00502abc0e710494bcf44535b0bd36f194
--- /dev/null
+++ b/it4i/client/cycles_omp/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(INC
+    .
+    ../../../intern/cycles/util
+    ../../../intern/cycles/kernel
+    ../../../intern/cycles/kernel/kernels/mic
+    ../../../intern/cycles/kernel/kernels/mpi
+    ../../../intern/cycles/kernel/kernels/omp
+    ../api
+    ${MPI_INCLUDE_DIR}
+)
+# The library is built from a single source: the OpenMP kernel of the Cycles tree.
+set(SRC
+    ../../../intern/cycles/kernel/kernels/omp/kernel_omp.cpp
+)
+
+add_definitions(-DWITH_OPENMP)
+# NOTE(review): -xCORE-AVX2 looks like an Intel-compiler-specific option - verify before using another compiler.
+set_source_files_properties(../../../intern/cycles/kernel/kernels/omp/kernel_omp.cpp PROPERTIES COMPILE_FLAGS "-xCORE-AVX2")
+# MIC_FLAG is appended to the target name; it is not set in this file.
+include_directories(${INC})
+add_library(cycles_omp${MIC_FLAG} SHARED ${SRC})
+
+install (TARGETS cycles_omp${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/main/CMakeLists.txt b/it4i/client/main/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d56ef917f1bcb04d50422d3c5617fe0cd1afb9f6
--- /dev/null
+++ b/it4i/client/main/CMakeLists.txt
@@ -0,0 +1,49 @@
+
+set(INC
+    .
+ ../../../intern/cycles/util + ../../../intern/cycles/kernel + ../../../intern/cycles/kernel/kernels/mpi + ../api + ../cycles_mpi + ${MPI_INCLUDE_DIR} +) + +set(SRC + main.cpp +) + +set(SRC_HEADERS + main.h +) + +include_directories(${INC}) + +add_executable(blender_client${MIC_FLAG} ${SRC} ${SRC_HEADERS}) +target_link_libraries (blender_client${MIC_FLAG} cycles_mpi${MIC_FLAG} ${MPI_LIB_FILE}) +add_dependencies(blender_client${MIC_FLAG} cycles_mpi${MIC_FLAG}) + +#if(WITH_IT4I_MIC_NATIVE) +# add_dependencies(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) +#else() +# add_dependencies(blender_client${MIC_FLAG} cycles_omp${MIC_FLAG}) +#endif() +# +#if(WITH_IT4I_MIC_OFFLOAD) +# add_dependencies(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) +#endif() + +if(WITH_IT4I_MIC_NATIVE) + add_dependencies(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) + target_link_libraries(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) +else() + add_dependencies(blender_client${MIC_FLAG} cycles_omp${MIC_FLAG}) + target_link_libraries(blender_client${MIC_FLAG} cycles_omp${MIC_FLAG}) +endif() + +if(WITH_IT4I_MIC_OFFLOAD) + add_dependencies(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) + target_link_libraries(blender_client${MIC_FLAG} cycles_mic${MIC_FLAG}) +endif() + +install (TARGETS blender_client${MIC_FLAG} DESTINATION bin) diff --git a/it4i/client/main/main.cpp b/it4i/client/main/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..161407cda9b1f79b34a9fef963ea5b5f876849ad --- /dev/null +++ b/it4i/client/main/main.cpp @@ -0,0 +1,54 @@ +#include "main.h" +#include "cycles_mpi.h" + +#include <mpi.h> + +int main(int argc, char** argv) +{ + // Initialize the MPI environment + //MPI_Init(&argc, &argv); + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + 
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + // Print off a hello world message + printf("Start from processor %s, rank %d" + " out of %d processors\n", + processor_name, world_rank, world_size); + + while (true) + { + mpi_kernel_struct data; + MPI_Bcast(&data, sizeof (mpi_kernel_struct), MPI_BYTE, 0, MPI_COMM_WORLD); + data.world_rank = world_rank; + data.world_size = world_size; + + if (MPI_TAG_mpi_cycles_start <= data.mpi_tag && data.mpi_tag <= MPI_TAG_mpi_cycles_end) + { + mpi_render(data); + } + } + + printf("End from processor %s, rank %d" + " out of %d processors\n", + processor_name, world_rank, world_size); + + // Finalize the MPI environment. + MPI_Finalize(); + + + return 0; +} + diff --git a/it4i/client/main/main.h b/it4i/client/main/main.h new file mode 100644 index 0000000000000000000000000000000000000000..8511f08b5caec0c6852025cc50b5787b2e3a9546 --- /dev/null +++ b/it4i/client/main/main.h @@ -0,0 +1,5 @@ +#ifndef MAIN_H +#define MAIN_H + +#endif /* MAIN_H */ + diff --git a/it4i/scripts/build_blender.sh b/it4i/scripts/build_blender.sh new file mode 100644 index 0000000000000000000000000000000000000000..ee034a816f485055f3dd83e5e03776e8b2b491df --- /dev/null +++ b/it4i/scripts/build_blender.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +lib_dir=${ROOT_DIR}/install +output=${ROOT_DIR}/install/blender +src=${ROOT_DIR}/src + +#boost_1_60_0 +#ilmbase-2.2.0 +#openexr-2.2.0 +#tiff-4.0.6 +#gdcm-2.6.2 +#oiio +#Python-3.4.4 +#zlib-1.2.8 + +export CC=mpiicc +export CXX=mpiicpc + +#-----------blender-------------- +mkdir ${ROOT_DIR}/build/blender +cd ${ROOT_DIR}/build/blender + +make_d="${src}/blender" +make_d="${make_d} -DPYTHON_LIBRARY=${lib_dir}/Python-3.5.2/lib/libpython3.5m.a" +make_d="${make_d} 
-DPYTHON_LIBPATH=${lib_dir}/Python-3.5.2/lib" +make_d="${make_d} -DPYTHON_INCLUDE_DIR=${lib_dir}/Python-3.5.2/include/python3.5m" +make_d="${make_d} -DPYTHON_INCLUDE_CONFIG_DIR=${lib_dir}/Python-3.5.2/include/python3.5m" +make_d="${make_d} -DWITH_OPENIMAGEIO=ON" +make_d="${make_d} -DWITH_CYCLES=ON" +make_d="${make_d} -DOPENEXR_INCLUDE_DIR=${lib_dir}/openexr-2.2.0/include" +make_d="${make_d} -DOPENEXR_ILMIMF_LIBRARIES=${lib_dir}/openexr-2.2.0/lib/libIlmImf.so" +make_d="${make_d} -DOPENEXR_ILMTHREAD_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIlmThread.so" +make_d="${make_d} -DOPENEXR_IMATH_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libImath.so" +make_d="${make_d} -DOPENEXR_ILMIMF_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIlmImf.so" +make_d="${make_d} -DOPENEXR_HALF_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libHalf.so" +make_d="${make_d} -DOPENEXR_IEX_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIex.so" +make_d="${make_d} -DBOOST_ROOT=${lib_dir}/boost_1_60_0" +make_d="${make_d} -DTIFF_LIBRARY=${lib_dir}/tiff-4.0.6/lib/libtiff.so" +make_d="${make_d} -DTIFF_INCLUDE_DIR=${lib_dir}/tiff-4.0.6/include" +make_d="${make_d} -DOPENIMAGEIO_LIBRARY=${lib_dir}/oiio/lib/libOpenImageIO.so" +make_d="${make_d} -DOPENIMAGEIO_INCLUDE_DIR=${lib_dir}/oiio/include" +make_d="${make_d} -DWITH_SYSTEM_GLEW=OFF" +make_d="${make_d} -DZLIB_INCLUDE_DIR=${lib_dir}/zlib-1.2.8/include" +make_d="${make_d} -DZLIB_LIBRARY=${lib_dir}/zlib-1.2.8/lib/libz.so" +make_d="${make_d} -DCMAKE_CXX_FLAGS='-wd47,177,858,1875,2621,1011,780,1292'" +make_d="${make_d} -DCMAKE_C_FLAGS='-wd47,177,858,1875,2621,1011,780,1292'" +make_d="${make_d} -DJPEG_INCLUDE_DIR:PATH=${lib_dir}/libjpeg-turbo-1.4.2/include" +make_d="${make_d} -DJPEG_LIBRARY:FILEPATH=${lib_dir}/libjpeg-turbo-1.4.2/lib/libjpeg.a" +make_d="${make_d} -DFREETYPE_INCLUDE_DIR_freetype2=${lib_dir}/freetype-2.6.3/include" +make_d="${make_d} -DFREETYPE_INCLUDE_DIR_ft2build=${lib_dir}/freetype-2.6.3/include/freetype2" +make_d="${make_d} 
-DFREETYPE_LIBRARY=${lib_dir}/freetype-2.6.3/lib/libfreetype.so" +make_d="${make_d} -DWITH_OPENMP=ON" +make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=ON" +make_d="${make_d} -DWITH_IT4I_MPI:BOOL=ON" +make_d="${make_d} -DWITH_CYCLES_DEVICE_OPENCL=OFF" +make_d="${make_d} -DWITH_GAMEENGINE=OFF" +make_d="${make_d} -DWITH_AUDASPACE=OFF" +make_d="${make_d} -DWITH_OPENAL=OFF" +make_d="${make_d} -DX11_Xi_LIB=/usr/lib64/libXi.so.6" +make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}" +make_d="${make_d} -DCMAKE_BUILD_TYPE=Debug" + +cmake ${make_d} + +make -j24 install diff --git a/it4i/scripts/build_client.sh b/it4i/scripts/build_client.sh new file mode 100644 index 0000000000000000000000000000000000000000..03e6b2c5cea6bc1e0ad8707e5d5e3d9fb26c8c3e --- /dev/null +++ b/it4i/scripts/build_client.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +lib_dir=${ROOT_DIR}/install +output=${ROOT_DIR}/install/blender_client_offload +src=${ROOT_DIR}/src + +#boost_1_60_0 +#ilmbase-2.2.0 +#openexr-2.2.0 +#tiff-4.0.6 +#gdcm-2.6.2 +#oiio +#Python-3.4.4 +#zlib-1.2.8 + +export CC=mpiicc +export CXX=mpiicpc + +#-----------blender_client-------------- +mkdir ${ROOT_DIR}/build/blender_client_offload +cd ${ROOT_DIR}/build/blender_client_offload + +make_d="${src}/blender/it4i/client" +make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=ON" +make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=OFF" +make_d="${make_d} -DCMAKE_BUILD_TYPE=Release" +make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}" + +cmake ${make_d} +make -j24 install diff --git a/it4i/scripts/build_client_mic.sh b/it4i/scripts/build_client_mic.sh new file mode 100644 index 0000000000000000000000000000000000000000..062860c2bcef9ff18c66fa377fdc3a05828a9da5 --- /dev/null +++ b/it4i/scripts/build_client_mic.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +lib_dir=${ROOT_DIR}/install 
+output=${ROOT_DIR}/install/blender_client_symmetric
+src=${ROOT_DIR}/src
+
+#boost_1_60_0
+#ilmbase-2.2.0
+#openexr-2.2.0
+#tiff-4.0.6
+#gdcm-2.6.2
+#oiio
+#Python-3.5.2
+#zlib-1.2.8
+
+export CC=mpiicc
+export CXX=mpiicpc
+
+#-----------blender_client cpu--------------
+mkdir -p ${ROOT_DIR}/build/blender_client_symmetric
+cd ${ROOT_DIR}/build/blender_client_symmetric || exit 1  # never configure/build in the wrong directory
+
+make_d="${src}/blender/it4i/client"
+make_d="${make_d} -DWITH_IT4I_MPI=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=OFF"
+make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=OFF"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Release"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+
+cmake ${make_d}
+make -j24 install
+
+#-----------blender_client mic--------------
+make_d="${src}/blender/it4i/client"  # same source tree as the cpu pass: the build directory (and its CMake cache) is reused
+make_d="${make_d} -DWITH_IT4I_MPI=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=OFF"
+make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=ON"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Release"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+
+cmake ${make_d}
+make -j24 install
diff --git a/it4i/scripts/build_lib.sh b/it4i/scripts/build_lib.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8cb038ca52f6810dc74b5321d8a567b2445bace4
--- /dev/null
+++ b/it4i/scripts/build_lib.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+path_main=${ROOT_DIR}
+path_lib=${path_main}/lib
+path_src=${path_main}/src
+path_build=${path_main}/build
+path_install=${path_main}/install
+num_cores_flag=-j24
+
+cd ${path_main}
+
+mkdir -p ${path_src}
+mkdir -p ${path_build}
+mkdir -p ${path_install}
+mkdir -p ${path_lib}
+
+#-----boost
+
+tar -xvf boost_1_60_0.tar.gz -C ${path_src}
+
+path_boost=${path_install}/boost_1_60_0
+cd ${path_src}/boost_1_60_0
+
+./bootstrap.sh
+./bjam install ${num_cores_flag} -a --prefix=${path_boost}
+
+cd ${path_main}
+
+#--------------ilmbase
+
+tar -xvf ilmbase-2.2.0.tar.gz -C ${path_src}
+
+path_ilmbase=${path_install}/ilmbase-2.2.0 +cd ${path_src}/ilmbase-2.2.0 + +./configure --prefix=${path_ilmbase} +make ${num_cores_flag} +make install + +cd ${path_main} + + +#-----------openexr----------------- +tar -xvf openexr-2.2.0.tar.gz -C ${path_src} + +path_openexr=${path_install}/openexr-2.2.0 +cd ${path_src}/openexr-2.2.0 + +./configure --disable-ilmbasetest --with-ilmbase-prefix=${path_ilmbase} --prefix=${path_openexr} +make ${num_cores_flag} +make install + +cd ${path_main} + +#-----------tiff----------------- +tar -xvf tiff-4.0.6.tar.gz -C ${path_src} + +path_tiff=${path_install}/tiff-4.0.6 +cd ${path_src}/tiff-4.0.6 + +./configure --prefix=${path_tiff} +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------openimageio----------------- +tar -xvf oiio.tar.gz -C ${path_src} + +path_oiio=${path_install}/oiio +mkdir ${path_build}/oiio +cd ${path_build}/oiio + +cmake ${path_src}/oiio -DILMBASE_HOME=${path_ilmbase} -DOPENEXR_HOME=${path_openexr} -DBOOST_ROOT=${path_boost} -DTIFF_LIBRARY=${path_tiff}/lib/libtiff.so -DTIFF_INCLUDE_DIR=${path_tiff}/include -DCMAKE_INSTALL_PREFIX=${path_oiio} + +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------zlib----------------- +tar -xvf zlib-1.2.8.tar.gz -C ${path_src} + +path_zlib=${path_install}/zlib-1.2.8 +cd ${path_src}/zlib-1.2.8 + +./configure --prefix=${path_zlib} +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------png----------------- +tar -xvf libpng-1.6.21.tar.gz -C ${path_src} + +path_png=${path_install}/libpng-1.6.21 +cd ${path_src}/libpng-1.6.21 + +./configure --prefix=${path_png} +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------jpeg----------------- +tar -xvf libjpeg-turbo-1.4.2.tar.gz -C ${path_src} + +path_jpeg=${path_install}/libjpeg-turbo-1.4.2 +cd ${path_src}/libjpeg-turbo-1.4.2 + +./configure --prefix=${path_jpeg} --without-simd +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------freetype----------------- +tar 
-xvf freetype-2.6.3.tar.gz -C ${path_src} + +path_freetype=${path_install}/freetype-2.6.3 +cd ${path_src}/freetype-2.6.3 + +./configure --prefix=${path_freetype} +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------python----------------- +tar -xvf Python-3.5.2.tgz -C ${path_src} + +path_python=${path_install}/Python-3.5.2 +cd ${path_src}/Python-3.5.2 + +./configure --prefix=${path_python} + +make ${num_cores_flag} +make install + +cd ${path_main} +#-----------blender-------------- +cp -r ${path_ilmbase}/include/* ${path_oiio}/include/. +cp -r ${path_ilmbase}/include/* ${path_openexr}/include/. +cp -r ${path_ilmbase}/lib/* ${path_openexr}/lib/. + + +cp -r ${path_boost}/lib/*.so* ${path_lib}/. +cp -r ${path_ilmbase}/lib/*.so* ${path_lib}/. +cp -r ${path_openexr}/lib/*.so* ${path_lib}/. +cp -r ${path_tiff}/lib/*.so* ${path_lib}/. +cp -r ${path_oiio}/lib/*.so* ${path_lib}/. +cp -r ${path_zlib}/lib/*.so* ${path_lib}/. +cp -r ${path_png}/lib/*.so* ${path_lib}/. +cp -r ${path_jpeg}/lib/*.so* ${path_lib}/. +cp -r ${path_freetype}/lib/*.so* ${path_lib}/. +cp -r ${path_gdcm}/lib/*.so* ${path_lib}/. +cp -r ${path_python}/lib/*.so* ${path_lib}/. 
+ + + diff --git a/it4i/scripts/run_blender.sh b/it4i/scripts/run_blender.sh new file mode 100644 index 0000000000000000000000000000000000000000..1d0f490f12a82458d91fcadd07282d22fe530949 --- /dev/null +++ b/it4i/scripts/run_blender.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:/opt/intel/opencl/lib64 +export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/opt/intel/opencl/lib64 + +#export LD_LIBRARY_PATH=/apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/compilers_and_libraries_2016.3.210/linux/tbb/lib/mic:$LD_LIBRARY_PATH +#export LD_PRELOAD=libtbbmalloc_proxy.so.2:libtbbmalloc.so.2:$LD_PRELOAD + +#export MIC_LD_LIBRARY_PATH=/apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/compilers_and_libraries_2016.3.210/linux/tbb/lib/mic:$MIC_LD_LIBRARY_PATH +#export MIC_LD_PRELOAD=libtbbmalloc_proxy.so.2:libtbbmalloc.so.2:$MIC_LD_PRELOAD + +export MIC_USE_2MB_BUFFERS=100k + +export IT4I_OMP_TILE_STEP=2 +export IT4I_OMP_CPU_NUM_THREADS=24 +export IT4I_OMP_MIC_NUM_THREADS=240 + +cd ${ROOT_DIR}/install/blender +./blender diff --git a/it4i/scripts/run_ddt.sh b/it4i/scripts/run_ddt.sh new file mode 100644 index 0000000000000000000000000000000000000000..2d9b14f4a3928f26b3f81050c233955b419b9822 --- /dev/null +++ b/it4i/scripts/run_ddt.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +module load Forge/6.0.6 +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 +#module load DDT/5.0.1 +#module load OpenCL-runtime/15.1 + +ROOT_DIR=${PWD} + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib +#/home/milanjaros/intel/opencl-1.2-4.5.0.8/lib64 +export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib +#export 
ALLINEA_DISABLE_THREAD_SPARKLINES=1 + +cd ${ROOT_DIR}/install/blender +ddt + diff --git a/it4i/scripts/run_mpi_offload.sh b/it4i/scripts/run_mpi_offload.sh new file mode 100644 index 0000000000000000000000000000000000000000..b5c6c3107261a1ecdaafb1a3db93d0e937c59e06 --- /dev/null +++ b/it4i/scripts/run_mpi_offload.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client_offload/lib + +JOBID=$(ls "/lscratch/") +NODEFILECN="/lscratch/$JOBID/nodefile-cn-sn" +TEMP=$(wc -l < "$NODEFILECN") +NUMOFCN=128 #$((TEMP-1)) + +export IT4I_OMP_NUM_SAMPLES_SCALE=8 +export IT4I_OMP_TILE_STEP=4 +export IT4I_OMP_CPU_NUM_THREADS=24 +export IT4I_OMP_MIC_NUM_THREADS=240 +export I_MPI_DEBUG=2 + +export MIC_USE_2MB_BUFFERS=100k + +#mpirun -n 1 ${ROOT_DIR}/install/blender/blender : -n 1 ${ROOT_DIR}/install/blender_client_offload/bin/blender_client + +mpirun -n 1 -machinefile $NODEFILECN ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN ${ROOT_DIR}/install/blender_client_offload/bin/blender_client + diff --git a/it4i/scripts/run_mpi_symmetric.sh b/it4i/scripts/run_mpi_symmetric.sh new file mode 100644 index 0000000000000000000000000000000000000000..065718aec3d2356088d31dd97c80e8d4c17dde78 --- /dev/null +++ b/it4i/scripts/run_mpi_symmetric.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +export MIC_ENV_PREFIX=MIC + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client_symmetric/lib +export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client_symmetric/lib + +export I_MPI_MIC=1 +export I_MPI_FABRICS=shm:dapl +export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1u,ofa-v2-scif0,ofa-v2-mcm-1 +export 
I_MPI_MIC_POSTFIX=-mic +export I_MPI_DEBUG=1 + +export IT4I_OMP_NUM_SAMPLES_SCALE=1 +export IT4I_OMP_TILE_STEP=4 +export IT4I_OMP_CPU_NUM_THREADS=24 +export IT4I_OMP_MIC_NUM_THREADS=240 + +export MIC_USE_2MB_BUFFERS=100k + +JOBID=$(ls "/lscratch/") +NODEFILECN="/lscratch/$JOBID/nodefile-cn-sn" +#NODEFILEMIC="/lscratch/$JOBID/nodefile-mic-sn" +NODEFILECN2="/home/milanjaros/nodes.txt" +NUMOFCN=64 #number of clients + +let I=0 +let zero=0 +while read -r line +do + if ((I == 0)) + then + #hosts="${line}" + echo "${line}" > ${NODEFILECN2} + else + #hosts="${hosts};${line};${line}-mic0;${line}-mic1" + echo "${line}" >> ${NODEFILECN2} + #echo "${line}-mic0" >> ${NODEFILECN2} + #echo "${line}-mic1" >> ${NODEFILECN2} + fi + + if ((I == NUMOFCN)) + then + break + fi + + I=$((I+1)) +done < "$NODEFILECN" + +#let I=0 +#let zero=0 +#while read -r line +#do +# if ((I == 0)) +# then +# #hosts="${line}" +# echo "${line}" #> ${NODEFILECN2} +# else +# #hosts="${hosts};${line};${line}-mic0;${line}-mic1" +# #echo "${line}" >> ${NODEFILECN2} +# echo "${line}-mic0" >> ${NODEFILECN2} +# echo "${line}-mic1" >> ${NODEFILECN2} +# fi +# +# if ((I == NUMOFCN)) +# then +# break +# fi +# +# I=$((I+1)) +#done < "$NODEFILECN" + +TEMP=$(wc -l < "$NODEFILECN2") +NUMOFCN2=$((TEMP-1)) + +cp ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client-mic ${ROOT_DIR}/install/blender/blender-mic + +#mpirun -n 1 ${ROOT_DIR}/install/blender/blender : -n 1 ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client + +#mpirun -n 1 -machinefile $NODEFILECN ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client + +mpirun -genv LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH -machine $NODEFILECN2 -n 1 ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN2 ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client + +#mpirun -genv LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH -hosts $hosts -n 1 ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN 
${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client diff --git a/it4i/scripts/run_netbeans.sh b/it4i/scripts/run_netbeans.sh new file mode 100644 index 0000000000000000000000000000000000000000..c56384d4e9e6ae5b7014557cce62f60ceffe62b5 --- /dev/null +++ b/it4i/scripts/run_netbeans.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib +export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib + +source /apps/all/icc/2016.1.150-GCC-4.9.3-2.25/debugger_2016/bin/debuggervars.sh +alias gdb="gdb-ia" + +/home/milanjaros/netbeans-8.1/bin/netbeans diff --git a/it4i/scripts/run_vtune.sh b/it4i/scripts/run_vtune.sh new file mode 100644 index 0000000000000000000000000000000000000000..27d0d8ea983f6304a9af0c69a9422c0083ebdc01 --- /dev/null +++ b/it4i/scripts/run_vtune.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +module load intel/2016.03-GCC-5.3 +module load CMake/3.3.1-GCC-5.3.0-2.25 + +ROOT_DIR=${PWD} + +source /apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/vtune_amplifier_xe_2016.3.0.463186/amplxe-vars.sh + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib +export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib + +cd ${ROOT_DIR}/install/blender +amplxe-gui + diff --git a/source/blender/blenlib/BLI_utildefines.h b/source/blender/blenlib/BLI_utildefines.h index d504e503c686441e77e6bf4aafcae6f642da8541..d6c743760db3778c81ff6fc098b0a180aa4e4cdf 100644 --- a/source/blender/blenlib/BLI_utildefines.h +++ b/source/blender/blenlib/BLI_utildefines.h @@ -435,7 +435,7 @@ extern "C" { } (void)0 /* assuming a 
static array */ -#if defined(__GNUC__) && !defined(__cplusplus) && !defined(__clang__) +#if defined(__GNUC__) && !defined(__cplusplus) && !defined(__clang__) && !defined(__INTEL_COMPILER) # define ARRAY_SIZE(arr) \ ((sizeof(struct {int isnt_array : ((const void *)&(arr) == &(arr)[0]);}) * 0) + \ (sizeof(arr) / sizeof(*(arr)))) diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c index fa14ca96fe2cca6b9a599298c6622edded1fa23d..8ab1de95b4137b41fba3dd5608a17e969a3059e5 100644 --- a/source/blender/editors/space_view3d/space_view3d.c +++ b/source/blender/editors/space_view3d/space_view3d.c @@ -329,7 +329,7 @@ static SpaceLink *view3d_new(const bContext *C) v3d->grid = 1.0f; v3d->gridlines = 16; v3d->gridsubdiv = 10; - v3d->drawtype = OB_SOLID; + v3d->drawtype = OB_BOUNDBOX; v3d->gridflag = V3D_SHOW_X | V3D_SHOW_Y | V3D_SHOW_FLOOR; diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h index af1dfc62894fff9d4871a03c684699872202bb59..8444a03cb86759fc242a42ba508cad94b8ff676b 100644 --- a/source/blender/makesdna/DNA_userdef_types.h +++ b/source/blender/makesdna/DNA_userdef_types.h @@ -869,6 +869,8 @@ typedef enum eCompute_Device_Type { USER_COMPUTE_DEVICE_NONE = 0, USER_COMPUTE_DEVICE_OPENCL = 1, USER_COMPUTE_DEVICE_CUDA = 2, + USER_COMPUTE_DEVICE_OMP = 4, + USER_COMPUTE_DEVICE_MPI = 5, } eCompute_Device_Type; diff --git a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c index f4c6fdf42f5c5da470ecafee7451f872ebf594fb..00dce381f9678777719b37da25e5cf3397e55f0a 100644 --- a/source/blender/makesrna/intern/rna_userdef.c +++ b/source/blender/makesrna/intern/rna_userdef.c @@ -57,6 +57,8 @@ static EnumPropertyItem compute_device_type_items[] = { {USER_COMPUTE_DEVICE_NONE, "NONE", 0, "None", "Don't use compute device"}, {USER_COMPUTE_DEVICE_CUDA, "CUDA", 0, "CUDA", "Use CUDA for GPU acceleration"}, {USER_COMPUTE_DEVICE_OPENCL, 
"OPENCL", 0, "OpenCL", "Use OpenCL for GPU acceleration"}, + {USER_COMPUTE_DEVICE_OMP, "OMP", 0, "OMP", "Use OMP/MIC for acceleration"}, + {USER_COMPUTE_DEVICE_MPI, "MPI", 0, "MPI", "Use MPI for acceleration"}, { 0, NULL, 0, NULL, NULL} }; #endif @@ -473,6 +475,10 @@ static EnumPropertyItem *rna_userdef_compute_device_type_itemf(bContext *UNUSED( RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_CUDA); if (CCL_compute_device_list(1)) RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_OPENCL); + if (CCL_compute_device_list(4)) + RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_OMP); + if (CCL_compute_device_list(5)) + RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_MPI); RNA_enum_item_end(&item, &totitem); *r_free = true; @@ -504,11 +510,33 @@ static EnumPropertyItem *rna_userdef_compute_device_itemf(bContext *UNUSED(C), P } else { /* get device list from cycles. 
it would be good to make this generic - * once we have more subsystems using opencl, for now this is easiest */ - int opencl = (U.compute_device_type == USER_COMPUTE_DEVICE_OPENCL); - CCLDeviceInfo *devices = CCL_compute_device_list(opencl); + * once we have more subsystems using opencl, for now this is easiest */ + + //int opencl = (U.compute_device_type == USER_COMPUTE_DEVICE_OPENCL); + //CCLDeviceInfo *devices = CCL_compute_device_list(opencl); + CCLDeviceInfo *devices = NULL; + int deviceType = -1; int a; + switch(U.compute_device_type) + { + case USER_COMPUTE_DEVICE_CUDA: + deviceType = 0; + break; + case USER_COMPUTE_DEVICE_OPENCL: + deviceType = 1; + break; + case USER_COMPUTE_DEVICE_OMP: + deviceType = 4; + break; + case USER_COMPUTE_DEVICE_MPI: + deviceType = 5; + break; + } + + devices = CCL_compute_device_list(deviceType); + + if (devices) { for (a = 0; devices[a].identifier[0]; a++) { tmp.value = devices[a].value; diff --git a/source/creator/CMakeLists.txt b/source/creator/CMakeLists.txt index ff6544cf0e36e2b2ffb2b86469cacbe5163230e2..11eabf8e45ca810d26176d1e810a7a5422ffd5e1 100644 --- a/source/creator/CMakeLists.txt +++ b/source/creator/CMakeLists.txt @@ -48,6 +48,10 @@ if(WIN32) blender_include_dirs(../../intern/utfconv) endif() +if(WITH_IT4I_MPI) + add_definitions(-DWITH_IT4I_MPI) +endif() + if(WITH_LIBMV) blender_include_dirs(../../intern/libmv) add_definitions(-DWITH_LIBMV) diff --git a/source/creator/creator.c b/source/creator/creator.c index bf8347d59bba29af1a70bbd192c40aa9ac93d429..24d51d65c6d4e4b4ede7a64ddd14c3502e66d87e 100644 --- a/source/creator/creator.c +++ b/source/creator/creator.c @@ -123,6 +123,10 @@ # define BUILD_DATE #endif +#ifdef WITH_IT4I_MPI + #include <mpi.h> +#endif + /* for passing information between creator and gameengine */ #ifdef WITH_GAMEENGINE # include "BL_System.h" @@ -1796,7 +1800,8 @@ int main( #ifdef WIN32 const char **UNUSED(argv_c) #else - const char **argv + //const char **argv + char **argv #endif ) { @@ 
-1814,6 +1819,12 @@ int main( /* --- end declarations --- */ +#ifdef WITH_IT4I_MPI + // Initialize the MPI environment + //MPI_Init(&argc, &argv); + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); +#endif #ifdef WIN32 /* We delay loading of openmp so we can set the policy here. */ @@ -2100,6 +2111,11 @@ int main( } WM_main(C); + +#ifdef WITH_IT4I_MPI + // Finalize the MPI environment. + MPI_Finalize(); +#endif return 0; } /* end of int main(argc, argv) */