diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1efaa14262566108d5c472ff84e7c259b0614a2d..0e08ef23032702bb0b04b835109121545ffd9d19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,6 +309,10 @@ option(WITH_IMAGE_CINEON        "Enable CINEON and DPX Image Support" ON)
 option(WITH_IMAGE_HDR           "Enable HDR Image Support" ON)
 option(WITH_IMAGE_FRAMESERVER   "Enable image FrameServer Support for rendering" ON)
 
+# IT4I: optional MPI and MIC (offload) support
+option(WITH_IT4I_MPI        "Enable MPI (has to be supported by the compiler)" OFF)
+option(WITH_IT4I_MIC_OFFLOAD        "Enable MIC (has to be supported by the compiler)" OFF)
+
 # Audio/Video format support
 option(WITH_CODEC_AVI           "Enable Blenders own AVI file support (raw/jpeg)" ON)
 option(WITH_CODEC_FFMPEG        "Enable FFMPeg Support (http://ffmpeg.org)" ${_init_CODEC_FFMPEG})
@@ -3035,12 +3039,14 @@ if(FIRST_RUN)
 	info_cfg_option(WITH_CYCLES)
 	info_cfg_option(WITH_FREESTYLE)
 	info_cfg_option(WITH_OPENCOLORIO)
-	info_cfg_option(WITH_OPENVDB)
+	info_cfg_option(WITH_OPENVDB)  
 
 	info_cfg_text("Compiler Options:")
 	info_cfg_option(WITH_BUILDINFO)
 	info_cfg_option(WITH_OPENMP)
 	info_cfg_option(WITH_RAYOPTIMIZATION)
+	info_cfg_option(WITH_IT4I_MPI)
+  info_cfg_option(WITH_IT4I_MIC_OFFLOAD)  
 
 	info_cfg_text("System Options:")
 	info_cfg_option(WITH_INSTALL_PORTABLE)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake
index 7bdf79098258e5d4d2473bf81e0c4674f1325f5d..99546d3e6aa7b817443ff2497d9c70e677e5646d 100644
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -502,6 +502,21 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		endif()
 	endif()
 
+	# Optional Cycles kernel libraries (MPI, MIC offload, OpenMP).
+	# Each one is appended to the link list only when its build option
+	# is enabled.
+	if(WITH_IT4I_MPI)
+		list(APPEND BLENDER_LINK_LIBS cycles_kernel_mpi)
+	endif()
+
+	if(WITH_IT4I_MIC_OFFLOAD)
+		list(APPEND BLENDER_LINK_LIBS cycles_kernel_mic)
+	endif()
+
+	if(WITH_OPENMP)
+		list(APPEND BLENDER_LINK_LIBS cycles_kernel_omp)
+	endif()
+
 	# Sort libraries
 	set(BLENDER_SORTED_LIBS
 		bf_windowmanager
@@ -607,6 +622,9 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		cycles_bvh
 		cycles_device
 		cycles_kernel
+		cycles_kernel_mic
+		cycles_kernel_omp
+		cycles_kernel_mpi
 		cycles_util
 		cycles_subd
 		bf_intern_opencolorio
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index a8cc4907cbf56ac14a7c4f47bb82dee57ed387f7..fe9b21ec1079c62487bb068d86c97c17b8497ebd 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -56,6 +56,18 @@ if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()
 
+if(WITH_IT4I_MPI)
+	add_definitions(-DWITH_IT4I_MPI)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+
+if(WITH_OPENMP)
+	add_definitions(-DWITH_OPENMP)
+endif()
+
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}")
 
 # avoid link failure with clang 3.4 debug
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 96dc3a59ef2df73319241f85a5f5ad8635e3934b..aa399ac27c3def588e898c898b369ef78d260db9 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -155,6 +155,18 @@ def with_osl():
 def with_network():
     import _cycles
     return _cycles.with_network
+
+
+def with_openmp():
+    """Return the ``with_openmp`` flag exported by the ``_cycles`` module."""
+    import _cycles
+    return _cycles.with_openmp
+
+
+def with_it4i_mpi():
+    """Return the ``with_it4i_mpi`` flag exported by the ``_cycles`` module."""
+    import _cycles
+    return _cycles.with_it4i_mpi
 
 
 def system_info():
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 01aa619b3068a265d3827868c8aed39aa1018814..da0d8e59dd6f8c0afae8ba12da637e3445fb1690 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -29,12 +29,13 @@ import _cycles
 
 enum_devices = (
     ('CPU', "CPU", "Use CPU for rendering"),
-    ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in user preferences"),
+    # Identifier stays 'GPU' (only label/description change) so code comparing against 'GPU' is unaffected.
+    ('GPU', "Acc Compute", "Use GPU/MIC/MPI compute device for rendering, configured in user preferences"),
     )
 
 if _cycles.with_network:
     enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),)
 
 enum_feature_set = (
     ('SUPPORTED', "Supported", "Only use finished and supported features"),
     ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index b9e51dfddd4483a45cc232194f808d0dfbed13ba..ef561252f686af3a6277f7c7b5bb74acbad93c08 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -57,7 +57,7 @@ def use_cpu(context):
     cscene = context.scene.cycles
     device_type = context.user_preferences.system.compute_device_type
 
-    return (device_type == 'NONE' or cscene.device == 'CPU')
+    return (device_type == 'NONE' or cscene.device == 'CPU' or cscene.device == 'OMP' or cscene.device == 'MPI')
 
 
 def use_opencl(context):
@@ -1609,7 +1609,7 @@ def draw_device(self, context):
         layout.prop(cscene, "feature_set")
 
         device_type = context.user_preferences.system.compute_device_type
-        if device_type in {'CUDA', 'OPENCL', 'NETWORK'}:
+        if device_type in {'CUDA', 'OPENCL', 'NETWORK', 'OMP', 'MPI'}:
             layout.prop(cscene, "device")
 
         if engine.with_osl() and use_cpu(context):
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 27eab0c7f681184675534ef1f4f7678be18fcace..3600a1ffd9eed1a59fba7f049b779a52957ec5fd 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -738,6 +738,22 @@ void *CCL_python_module_init()
 	Py_INCREF(Py_False);
 #endif /* WITH_NETWORK */
 
+#ifdef WITH_OPENMP
+	PyModule_AddObject(mod, "with_openmp", Py_True);
+	Py_INCREF(Py_True);
+#else /* WITH_OPENMP */
+	PyModule_AddObject(mod, "with_openmp", Py_False);
+	Py_INCREF(Py_False);
+#endif /* WITH_OPENMP */
+
+#ifdef WITH_IT4I_MPI
+	PyModule_AddObject(mod, "with_it4i_mpi", Py_True);
+	Py_INCREF(Py_True);
+#else /* WITH_IT4I_MPI */
+	PyModule_AddObject(mod, "with_it4i_mpi", Py_False);
+	Py_INCREF(Py_False);
+#endif /* WITH_IT4I_MPI */
+
 	return (void*)mod;
 }
 
@@ -754,6 +770,12 @@ CCLDeviceInfo *CCL_compute_device_list(int device_type)
 		case 2:
 			type = ccl::DEVICE_NETWORK;
 			break;
+		case 4:
+			type = ccl::DEVICE_OMP;
+			break;
+		case 5:
+			type = ccl::DEVICE_MPI;
+			break;
 		default:
 			type = ccl::DEVICE_NONE;
 			break;
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index f1b524f7b447ddea7b2e80dca6e7db094f284fcd..80f08c4e027fef783c7f7e56f4b5528406fe6360 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -108,7 +108,23 @@ void BlenderSession::create()
 void BlenderSession::create_session()
 {
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	bool is_cpu = session_params.device.type == DEVICE_CPU;
+        
+//	if(session_params.device.type == DEVICE_OMP || session_params.device.type == DEVICE_MPI)
+//        {
+//            if (!background)
+//            {
+//                session_params.tile_size = make_int2(width, height);
+//            }
+//            else
+//            {            
+//		int tile_x = b_engine.resolution_x();
+//		int tile_y = b_engine.resolution_y();
+//
+//		session_params.tile_size = make_int2(tile_x, tile_y);
+//            }
+//	}         
+//        
+	bool is_cpu = session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP;
 	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
@@ -170,7 +186,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_)
 	b_scene = b_scene_;
 
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
+	const bool is_cpu = session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP;
 	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 
 	width = render_resolution_x(b_render);
@@ -763,7 +779,7 @@ void BlenderSession::synchronize()
 
 	/* on session/scene parameter changes, we recreate session entirely */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
+	const bool is_cpu = session_params.device.type == DEVICE_CPU;// || session_params.device.type == DEVICE_MPI || session_params.device.type == DEVICE_OMP;
 	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
@@ -913,6 +929,12 @@ void BlenderSession::get_progress(float& progress, double& total_time, double& r
 
 	session->progress.get_tile(tile, total_time, render_time, tile_time);
 
+    if (background && (session->params.device.type == DEVICE_MPI || session->params.device.type == DEVICE_OMP)) {
+        const int num_tiles = session->device->get_num_tiles(); /* Device's default implementation returns 0 */
+        progress = (num_tiles > 0) ? ((float) session->device->get_tile_id()) / num_tiles : 0.0f;
+    }
+    else
+    {      
 	sample = session->progress.get_sample();
 	samples_per_tile = session->tile_manager.num_samples;
 
@@ -922,6 +944,7 @@ void BlenderSession::get_progress(float& progress, double& total_time, double& r
 		progress = ((float)samples) / total_samples;
 	else
 		progress = 0.0;
+    }
 }
 
 void BlenderSession::update_bake_progress()
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 749b8c0319bb7c3255cf22c1389fdabde0085ffc..c3c97fca6a170036d0716de37dd391b10be65802 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -509,14 +509,18 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	/* device default CPU */
 	params.device = devices[0];
 
+//#if !defined(WITH_IT4I_MIC_OFFLOAD) && !defined(WITH_IT4I_MPI) && !defined(WITH_OPENMP)       
 	if(get_enum(cscene, "device") == 2) {
 		/* find network device */
 		foreach(DeviceInfo& info, devices)
 			if(info.type == DEVICE_NETWORK)
 				params.device = info;
 	}
-	else if(get_enum(cscene, "device") == 1) {
-		/* find GPU device with given id */
+        //GPU, MIC, MPI
+	else if(get_enum(cscene, "device") == 1) 
+//#endif            
+        {
+		/* find device with given id */
 		PointerRNA systemptr = b_userpref.system().ptr;
 		PropertyRNA *deviceprop = RNA_struct_find_property(&systemptr, "compute_device");
 		int device_id = b_userpref.system().compute_device();
@@ -569,7 +573,13 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	}
 
 	/* tiles */
-	if(params.device.type != DEVICE_CPU && !background) {
+	if((params.device.type == DEVICE_OPENCL || 
+            params.device.type == DEVICE_CUDA ||
+            params.device.type == DEVICE_NETWORK || 
+            params.device.type == DEVICE_MULTI) && 
+            !background
+         )                
+	{
 		/* currently GPU could be much slower than CPU when using tiles,
 		 * still need to be investigated, but meanwhile make it possible
 		 * to work in viewport smoothly
@@ -578,7 +588,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 
 		params.tile_size = make_int2(debug_tile_size, debug_tile_size);
 	}
-	else {
+	else 
+        {
 		int tile_x = b_engine.tile_x();
 		int tile_y = b_engine.tile_y();
 
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 2a9ec0c38182eec2d6c74dc72514f1a9a10a5a3c..5d23c75378880596c6f3f08e9b089284fbcf9b30 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -4,9 +4,14 @@ set(INC
 	../kernel
 	../kernel/svm
 	../kernel/osl
+	../kernel/kernels/mic
+	../kernel/kernels/mpi
+	../kernel/kernels/omp
 	../util
 	../render
 	../../glew-mx
+	../../../it4i/client/api
+	${MPI_INCLUDE_DIR}
 )
 
 set(INC_SYS
@@ -63,7 +68,26 @@ if(WITH_CYCLES_DEVICE_MULTI)
 	add_definitions(-DWITH_MULTI)
 endif()
 
+if(WITH_IT4I_MPI)
+	add_definitions(-DWITH_IT4I_MPI)
+	list(APPEND SRC
+		device_mpi.cpp
+	)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+
+if(WITH_OPENMP)
+	add_definitions(-DWITH_OPENMP)
+	list(APPEND SRC
+		device_omp.cpp
+	)
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
 add_library(cycles_device ${SRC} ${SRC_HEADERS})
+target_link_libraries (cycles_device ${MPI_LIB_FILE})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 8c01bcb116fe9edf2081b53e6d28e1ecd44d8896..0b58619168189f0b8fe5b7ea1404c9fa3de6068f 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -70,20 +70,20 @@ Device::~Device()
 
 void Device::pixels_alloc(device_memory& mem)
 {
-	mem_alloc(mem, MEM_READ_WRITE);
+	mem_alloc("pixel", mem, MEM_READ_WRITE);
 }
 
 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
 {
 	if(mem.data_type == TYPE_HALF)
-		mem_copy_from(mem, y, w, h, sizeof(half4));
+		mem_copy_from("pixel", mem, y, w, h, sizeof(half4));
 	else
-		mem_copy_from(mem, y, w, h, sizeof(uchar4));
+		mem_copy_from("pixel", mem, y, w, h, sizeof(uchar4));
 }
 
 void Device::pixels_free(device_memory& mem)
 {
-	mem_free(mem);
+	mem_free("pixel", mem);
 }
 
 void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
@@ -214,7 +214,23 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
 	switch(info.type) {
 		case DEVICE_CPU:
 			device = device_cpu_create(info, stats, background);
+			break;  
+#ifdef WITH_OPENMP
+		case DEVICE_OMP:
+			if (device_omp_init())
+				device = device_omp_create(info, stats, background);
+			else
+				device = NULL;
+			break;  
+#endif
+#ifdef WITH_IT4I_MPI
+		case DEVICE_MPI:
+			if (device_mpi_init())
+				device = device_mpi_create(info, stats, background);
+			else
+				device = NULL;
 			break;
+#endif
 #ifdef WITH_CUDA
 		case DEVICE_CUDA:
 			if(device_cuda_init())
@@ -276,6 +292,10 @@ string Device::string_from_type(DeviceType type)
 		return "network";
 	else if(type == DEVICE_MULTI)
 		return "multi";
+	else if (type == DEVICE_OMP)
+		return "omp";
+	else if (type == DEVICE_MPI)
+		return "mpi";
 	
 	return "";
 }
@@ -286,6 +306,16 @@ vector<DeviceType>& Device::available_types()
 		types.clear();
 		types.push_back(DEVICE_CPU);
 
+#ifdef WITH_OPENMP
+		if (device_omp_init())
+			types.push_back(DEVICE_OMP);
+#endif
+
+#ifdef WITH_IT4I_MPI
+		if (device_mpi_init())
+			types.push_back(DEVICE_MPI);
+#endif
+
 #ifdef WITH_CUDA
 		if(device_cuda_init())
 			types.push_back(DEVICE_CUDA);
@@ -313,6 +343,17 @@ vector<DeviceInfo>& Device::available_devices()
 {
 	if(need_devices_update) {
 		devices.clear();
+
+#ifdef WITH_OPENMP
+		if (device_omp_init())
+			device_omp_info(devices);
+#endif
+
+#ifdef WITH_IT4I_MPI
+		if (device_mpi_init())
+			device_mpi_info(devices);
+#endif
+
 #ifdef WITH_CUDA
 		if(device_cuda_init())
 			device_cuda_info(devices);
@@ -374,4 +415,177 @@ void Device::free_memory()
 	devices.free_memory();
 }
 
+/* Convert one render pass of an interleaved float pass buffer into `pixels`.
+ *
+ * The first entry of `params.passes` whose type equals `type` selects the
+ * source; `params.width * params.height` pixels are read, stepping
+ * `params.get_passes_size()` floats per pixel. `components` (1, 3 or 4) is the
+ * number of floats written per output pixel. Passes flagged `filter` are
+ * scaled by 1/sample and passes flagged `exposure` additionally by `exposure`;
+ * depth, mist, shadow, motion and divided lighting passes are special-cased.
+ *
+ * Returns false when a buffer pointer is NULL or no pass of `type` exists,
+ * true otherwise. */
+bool Device::get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer_data_pointer)
+{
+	/* nothing to read from or write to */
+	if(buffer_data_pointer == NULL || pixels == NULL)
+		return false;
+
+	int pass_offset = 0;
+
+	foreach(Pass& pass, params.passes) {
+		if(pass.type != type) {
+			pass_offset += pass.components;
+			continue;
+		}
+
+		float *in = (float*)buffer_data_pointer + pass_offset;
+		int pass_stride = params.get_passes_size();
+
+		float scale = (pass.filter)? 1.0f/(float)sample: 1.0f;
+		float scale_exposure = (pass.exposure)? scale*exposure: scale;
+
+		int size = params.width*params.height;
+
+		if(components == 1) {
+			assert(pass.components == components);
+
+			/* scalar */
+			if(type == PASS_DEPTH) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = (f == 0.0f)? 1e10f: f*scale_exposure;
+				}
+			}
+			else if(type == PASS_MIST) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = saturate(f*scale_exposure);
+				}
+			}
+#ifdef WITH_CYCLES_DEBUG
+			else if(type == PASS_BVH_TRAVERSAL_STEPS) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f;
+				}
+			}
+			else if(type == PASS_RAY_BOUNCES) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f;
+				}
+			}
+#endif
+			else {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+					float f = *in;
+					pixels[0] = f*scale_exposure;
+				}
+			}
+		}
+		else if(components == 3) {
+			assert(pass.components == 4);
+
+			/* RGBA */
+			if(type == PASS_SHADOW) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+					float4 f = make_float4(in[0], in[1], in[2], in[3]);
+					float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f;
+
+					pixels[0] = f.x*invw;
+					pixels[1] = f.y*invw;
+					pixels[2] = f.z*invw;
+				}
+			}
+			else if(pass.divide_type != PASS_NONE) {
+				/* RGB lighting passes that need to divide out color */
+				pass_offset = 0;
+				foreach(Pass& color_pass, params.passes) {
+					if(color_pass.type == pass.divide_type)
+						break;
+					pass_offset += color_pass.components;
+				}
+
+				float *in_divide = (float*)buffer_data_pointer + pass_offset;
+
+				for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
+					float3 f = make_float3(in[0], in[1], in[2]);
+					float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+
+					f = safe_divide_even_color(f*exposure, f_divide);
+
+					pixels[0] = f.x;
+					pixels[1] = f.y;
+					pixels[2] = f.z;
+				}
+			}
+			else {
+				/* RGB/vector */
+				for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+					float3 f = make_float3(in[0], in[1], in[2]);
+
+					pixels[0] = f.x*scale_exposure;
+					pixels[1] = f.y*scale_exposure;
+					pixels[2] = f.z*scale_exposure;
+				}
+			}
+		}
+		else if(components == 4) {
+			assert(pass.components == components);
+
+			/* RGBA */
+			if(type == PASS_SHADOW) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+					float4 f = make_float4(in[0], in[1], in[2], in[3]);
+					float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f;
+
+					pixels[0] = f.x*invw;
+					pixels[1] = f.y*invw;
+					pixels[2] = f.z*invw;
+					pixels[3] = 1.0f;
+				}
+			}
+			else if(type == PASS_MOTION) {
+				/* need to normalize by number of samples accumulated for motion */
+				pass_offset = 0;
+				foreach(Pass& color_pass, params.passes) {
+					if(color_pass.type == PASS_MOTION_WEIGHT)
+						break;
+					pass_offset += color_pass.components;
+				}
+
+				float *in_weight = (float*)buffer_data_pointer + pass_offset;
+
+				for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
+					float4 f = make_float4(in[0], in[1], in[2], in[3]);
+					float w = in_weight[0];
+					float invw = (w > 0.0f)? 1.0f/w: 0.0f;
+
+					pixels[0] = f.x*invw;
+					pixels[1] = f.y*invw;
+					pixels[2] = f.z*invw;
+					pixels[3] = f.w*invw;
+				}
+			}
+			else {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+					float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+					pixels[0] = f.x*scale_exposure;
+					pixels[1] = f.y*scale_exposure;
+					pixels[2] = f.z*scale_exposure;
+
+					/* clamp since alpha might be > 1.0 due to russian roulette */
+					pixels[3] = saturate(f.w*scale);
+				}
+			}
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 30d0003b94070f45e1768f9b0fac50c362e09cb4..2a772b4897f6aaecfe8a33d2dbdbd73f112ba9d3 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,8 @@
 #include "util_types.h"
 #include "util_vector.h"
 
+#include "buffers.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Progress;
@@ -42,7 +44,9 @@ enum DeviceType {
 	DEVICE_OPENCL,
 	DEVICE_CUDA,
 	DEVICE_NETWORK,
-	DEVICE_MULTI
+	DEVICE_MULTI,
+	DEVICE_OMP,
+	DEVICE_MPI
 };
 
 class DeviceInfo {
@@ -201,14 +205,16 @@ public:
 
 	/* statistics */
 	Stats &stats;
+        
+        
 
 	/* regular memory */
-	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
-	virtual void mem_copy_to(device_memory& mem) = 0;
-	virtual void mem_copy_from(device_memory& mem,
+	virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
+	virtual void mem_copy_to(const char *name, device_memory& mem) = 0;
+	virtual void mem_copy_from(const char *name, device_memory& mem,
 		int y, int w, int h, int elem) = 0;
-	virtual void mem_zero(device_memory& mem) = 0;
-	virtual void mem_free(device_memory& mem) = 0;
+	virtual void mem_zero(const char *name, device_memory& mem) = 0;
+	virtual void mem_free(const char *name, device_memory& mem) = 0;
 
 	/* constant memory */
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@@ -222,7 +228,7 @@ public:
 		(void)interpolation;  /* Ignored. */
 		(void)extension;  /* Ignored. */
 	};
-	virtual void tex_free(device_memory& /*mem*/) {};
+	virtual void tex_free(const char * /*name*/, device_memory& /*mem*/) {};
 
 	/* pixel memory */
 	virtual void pixels_alloc(device_memory& mem);
@@ -242,6 +248,13 @@ public:
 	virtual void task_add(DeviceTask& task) = 0;
 	virtual void task_wait() = 0;
 	virtual void task_cancel() = 0;
+
+	/* tiles (read for progress reporting); both default to 0 */
+	virtual int get_tile_id() { return 0; }
+	virtual int get_num_tiles() { return 0; }
+
+	/* pass: convert one pass of buffer_data_pointer into pixels */
+	virtual bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer_data_pointer);
 	
 	/* opengl drawing */
 	virtual void draw_pixels(device_memory& mem, int y, int w, int h,
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 676b1279a80ce3c80212bf69e37b4dfc5e79f346..eec6a061998fca3a06db57759f6e3cf5354e2c9f 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -46,6 +46,10 @@
 #include "util_system.h"
 #include "util_thread.h"
 
+#ifdef WITH_OPENMP
+    #include <omp.h>
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 class CPUDevice : public Device
@@ -112,31 +116,31 @@ public:
 		task_pool.stop();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char * /*name*/, device_memory& mem, MemoryType /*type*/)
 	{
 		mem.device_pointer = mem.data_pointer;
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
 
-	void mem_copy_to(device_memory& /*mem*/)
+	void mem_copy_to(const char * /*name*/, device_memory& /*mem*/)
 	{
 		/* no-op */
 	}
 
-	void mem_copy_from(device_memory& /*mem*/,
+	void mem_copy_from(const char * /*name*/, device_memory& /*mem*/,
 	                   int /*y*/, int /*w*/, int /*h*/,
 	                   int /*elem*/)
 	{
 		/* no-op */
 	}
 
-	void mem_zero(device_memory& mem)
+	void mem_zero(const char * /*name*/, device_memory& mem)
 	{
 		memset((void*)mem.device_pointer, 0, mem.memory_size());
 	}
 
-	void mem_free(device_memory& mem)
+	void mem_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			mem.device_pointer = 0;
@@ -155,6 +159,8 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
+                printf("tex_alloc: %s\n", name);
+            
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 		kernel_tex_copy(&kernel_globals,
 		                name,
@@ -169,8 +175,10 @@ public:
 		stats.mem_alloc(mem.device_size);
 	}
 
-	void tex_free(device_memory& mem)
+	void tex_free(const char *name, device_memory& mem)
 	{
+                printf("tex_free: %s\n", name);
+            
 		if(mem.device_pointer) {
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
@@ -188,13 +196,22 @@ public:
 	}
 
 	void thread_run(DeviceTask *task)
 	{
+#ifdef WITH_OPENMP
+		const double t_start = omp_get_wtime();
+#endif
+
 		if(task->type == DeviceTask::PATH_TRACE)
 			thread_path_trace(*task);
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
 			thread_shader(*task);
+
+#ifdef WITH_OPENMP
+		/* elapsed wall-clock time of this task; logged rather than printed to stdout */
+		VLOG(1) << "DEVICE: CPU, " << (omp_get_wtime() - t_start);
+#endif
 	}
 
 	class CPUDeviceTask : public DeviceTask {
@@ -460,6 +477,7 @@ public:
 
 	void task_add(DeviceTask& task)
 	{
+                
 		/* split task into smaller ones */
 		list<DeviceTask> tasks;
 
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 80c8cb1e59229b7f75fa765b00f45257d231b375..734b8100fe31b7da82eddcd31dfa5ead77d810c0 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -397,7 +397,7 @@ public:
 		return (result == CUDA_SUCCESS);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
 		cuda_push_context();
 		CUdeviceptr device_pointer;
@@ -409,7 +409,7 @@ public:
 		cuda_pop_context();
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(const char *name, device_memory& mem)
 	{
 		cuda_push_context();
 		if(mem.device_pointer)
@@ -417,7 +417,7 @@ public:
 		cuda_pop_context();
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem)
 	{
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
@@ -433,7 +433,7 @@ public:
 		cuda_pop_context();
 	}
 
-	void mem_zero(device_memory& mem)
+	void mem_zero(const char *name, device_memory& mem)
 	{
 		memset((void*)mem.data_pointer, 0, mem.memory_size());
 
@@ -443,7 +443,7 @@ public:
 		cuda_pop_context();
 	}
 
-	void mem_free(device_memory& mem)
+	void mem_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			cuda_push_context();
@@ -596,8 +596,8 @@ public:
 			else {
 				cuda_pop_context();
 
-				mem_alloc(mem, MEM_READ_ONLY);
-				mem_copy_to(mem);
+				mem_alloc(name, mem, MEM_READ_ONLY);
+				mem_copy_to(name, mem);
 
 				cuda_push_context();
 
@@ -627,8 +627,8 @@ public:
 			cuda_pop_context();
 		}
 		else {
-			mem_alloc(mem, MEM_READ_ONLY);
-			mem_copy_to(mem);
+			mem_alloc(name, mem, MEM_READ_ONLY);
+			mem_copy_to(name, mem);
 
 			cuda_push_context();
 
@@ -654,7 +654,7 @@ public:
 		tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
 	}
 
-	void tex_free(device_memory& mem)
+	void tex_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			if(tex_interp_map[mem.device_pointer]) {
@@ -670,7 +670,7 @@ public:
 			}
 			else {
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
-				mem_free(mem);
+				mem_free(name, mem);
 			}
 		}
 	}
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 47584ae6d226714f1d24610d2790ffddfb5b772e..d4622ce0acd9608bf12eb0962325d9deddb4ad13 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -39,6 +39,20 @@ string device_cpu_capabilities(void);
 string device_opencl_capabilities(void);
 string device_cuda_capabilities(void);
 
+#ifdef WITH_OPENMP
+string device_omp_capabilities(void);
+bool device_omp_init(void);
+Device *device_omp_create(DeviceInfo& info, Stats &stats, bool background);
+void device_omp_info(vector<DeviceInfo>& devices);
+#endif
+
+#ifdef WITH_IT4I_MPI
+string device_mpi_capabilities(void);
+void device_mpi_info(vector<DeviceInfo>& devices);
+bool device_mpi_init(void);
+Device *device_mpi_create(DeviceInfo& info, Stats &stats, bool background);
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_mpi.cpp b/intern/cycles/device/device_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1adf5f2e353eb9dc7ddd45bd9d0aa66eac586381
--- /dev/null
+++ b/intern/cycles/device/device_mpi.cpp
@@ -0,0 +1,1034 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "device.h"
+#include "device_intern.h"
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+#include "util_foreach.h"
+
+#include "kernel_mpi.h"
+
+#include <mpi.h>
+#include <omp.h>
+
+CCL_NAMESPACE_BEGIN
+
+class MultiMPIDevice : public Device
+{
+public:
+    DedicatedTaskPool task_pool;  /* runs the tasks queued by task_add() */
+    KernelGlobals kernel_globals; /* local kernel state, written by const_copy_to() and tex_alloc() */
+    device_ptr rgba_pixels;       /* buffer registered under the name "pixel"; NULL while none is registered */
+
+    int tile_id;   /* value returned by get_tile_id(); advanced as rows are handed out in offline mode */
+    int num_tiles; /* value returned by get_num_tiles(); set to the tile height in offline mode */
+
+    /* Start with no pixel buffer and zeroed tile counters, then call mpi_alloc_kg(info.num == 1). */
+    MultiMPIDevice(DeviceInfo& info, Stats &stats, bool background_)
+    : Device(info, stats, background_),
+      rgba_pixels(0), tile_id(0), num_tiles(0)
+    {
+        /* "info" is the constructor argument here; it shadows the member of the same name. */
+        mpi_alloc_kg(info.num == 1);
+    }
+
+    ~MultiMPIDevice() /* Calls mpi_free_kg(); by name the counterpart of the constructor's mpi_alloc_kg() - TODO confirm in kernel_mpi.h. */
+    {
+        mpi_free_kg();
+    }
+
+    /* Expose host memory as device memory (the device pointer aliases the host data) and forward it to mpi_mem_alloc(). */
+    void mem_alloc(const char *name, device_memory& mem, MemoryType type)
+    {
+        mem.device_size = mem.memory_size();
+        mem.device_pointer = mem.data_pointer;
+        stats.mem_alloc(mem.device_size);
+
+        /* The allocation named "pixel" is remembered in rgba_pixels. */
+        if (strcmp(name, "pixel") == 0)
+            rgba_pixels = mem.device_pointer;
+
+        mpi_mem_alloc(name, mem.device_pointer, mem.device_size);
+    }
+
+    void mem_copy_to(const char *name, device_memory& mem) /* Forwards the device pointer and size to mpi_mem_copy_to(); name is unused. */
+    {
+        mpi_mem_copy_to(mem.device_pointer, mem.device_size, 0); /* NOTE(review): the meaning of the trailing 0 is not visible here - confirm in kernel_mpi.h. */
+    }
+
+    void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem) /* No-op: the body is empty and every argument is ignored. */
+    {
+    }
+
+    /* Zero the memory behind mem.device_pointer, then forward the request to mpi_mem_zero(); no-op without a device pointer. */
+    void mem_zero(const char *name, device_memory& mem)
+    {
+        if (!mem.device_pointer)
+            return;
+
+        memset((void*) mem.device_pointer, 0, mem.memory_size());
+        mpi_mem_zero(mem.device_pointer, mem.device_size, 0);
+    }
+
+    /* Release a buffer: notify mpi_mem_free(), update the stats and clear mem's device fields. */
+    void mem_free(const char *name, device_memory& mem)
+    {
+        /* Nothing to do when no device pointer is set. */
+        if (!mem.device_pointer)
+            return;
+
+        /* Releasing the buffer named "pixel" also drops the cached rgba_pixels pointer. */
+        if (strcmp(name, "pixel") == 0)
+            rgba_pixels = 0;
+
+        mpi_mem_free(mem.device_pointer, mem.device_size);
+        stats.mem_free(mem.device_size); /* must run before device_size is reset */
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+    }
+
+    void const_copy_to(const char *name, void *host, size_t size) /* Copies the named constant into the local kernel_globals and passes the same data to mpi_const_copy(). */
+    {
+        kernel_const_copy(&kernel_globals, name, host, size);
+        mpi_const_copy(name, (char*) host, size);
+    }
+
+    /*
+     * Register a texture. The device pointer simply aliases the host data; the
+     * texture is copied into the local kernel_globals and the same description
+     * (with the extension passed as an int) is forwarded to mpi_tex_copy().
+     */
+    void tex_alloc(const char *name,
+            device_memory& mem,
+            InterpolationType interpolation,
+            ExtensionType extension)
+    {
+        mem.device_pointer = mem.data_pointer;
+        mem.device_size = mem.memory_size();
+        stats.mem_alloc(mem.device_size);
+
+        /* Copy into this process' own kernel globals. */
+        kernel_tex_copy(&kernel_globals, name, mem.data_pointer,
+                mem.data_width, mem.data_height, mem.data_depth,
+                interpolation, extension);
+
+        /* Hand the same texture to the MPI layer. */
+        mpi_tex_copy(name,
+                mem.device_pointer,
+                mem.device_size,
+                mem.data_width,
+                mem.data_height,
+                mem.data_depth,
+                interpolation,
+                (int) extension);
+    }
+
+    /* Unregister a texture: notify mpi_tex_free(), update the stats and clear mem's device fields. */
+    void tex_free(const char *name, device_memory& mem)
+    {
+        if (!mem.device_pointer)
+            return;
+
+        mpi_tex_free(name, mem.device_pointer, mem.device_size);
+        stats.mem_free(mem.device_size);
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+    }
+
+    int get_split_task_count(DeviceTask& task) /* Always returns 1; the task argument is not inspected. */
+    {
+        return 1;
+    }
+
+    class MultiMPIDeviceTask : public DeviceTask /* Copy of a DeviceTask whose run callback executes it on a MultiMPIDevice. */
+    {
+    public:
+        /* Copies task and binds run to device->thread_run(this), i.e. to this copy rather than to the caller's task. */
+        MultiMPIDeviceTask(MultiMPIDevice *device, DeviceTask& task)
+        : DeviceTask(task)
+        {
+            run = function_bind(&MultiMPIDevice::thread_run, device, this);
+        }
+    };
+
+    void task_add(DeviceTask& task) /* Queues a heap-allocated copy of task on task_pool; presumably the pool takes ownership - TODO confirm in DedicatedTaskPool. */
+    {
+        task_pool.push(new MultiMPIDeviceTask(this, task));
+    }
+
+    void task_wait() /* Forwards to task_pool.wait(). */
+    {
+        task_pool.wait();
+    }
+
+    void task_cancel() /* Forwards to task_pool.cancel(). */
+    {
+        task_pool.cancel();
+    }
+
+    /* Task-pool entry point: dispatch on the task type; in OpenMP builds also print the elapsed wall time. */
+    void thread_run(DeviceTask *task)
+    {
+#ifdef WITH_OPENMP
+        const double start = omp_get_wtime();
+#endif
+
+        switch (task->type) {
+            case DeviceTask::PATH_TRACE:   thread_path_trace(*task);   break;
+            case DeviceTask::FILM_CONVERT: thread_film_convert(*task); break;
+            case DeviceTask::SHADER:       thread_shader(*task);       break;
+            default: break; /* any other type is ignored, as before */
+        }
+
+#ifdef WITH_OPENMP
+        printf("DEVICE: MPI, %f\n", omp_get_wtime() - start);
+#endif
+    }
+
+    /*
+     * Root side of the progressive-mode gather: one MPI_Gatherv collects a
+     * horizontal band of the tile from every entry of info.multi_devices.
+     *
+     * The tile height is divided evenly between the devices and the last one also
+     * takes the remainder rows. Counts and displacements are in bytes (MPI_BYTE);
+     * slot 0 belongs to the root, which contributes nothing (it sends 0 bytes).
+     *
+     * Data is received into rgba_pixels when a "pixel" buffer is registered
+     * (uchar4 per pixel) and into tile.buffer otherwise (pass_stride floats per
+     * pixel). Afterwards tile.sample is set to the tile's end sample and progress
+     * is reported through task.update_progress().
+     */
+    void receive_path_buffer_progressive(DeviceTask& task, RenderTile &tile, int offset, int stride)
+    {
+        const int dev_count = (int) info.multi_devices.size();
+
+        const int tile_x = tile.buffers->params.full_x;
+        const int tile_y = tile.buffers->params.full_y;
+        const int tile_h = tile.buffers->params.height;
+        const int tile_w = tile.buffers->params.width;
+
+        const int pass_stride = tile.buffers->params.get_passes_size();
+        const int end_sample = tile.start_sample + tile.num_samples;
+
+        /* Rows per device; an empty device list must not divide by zero. */
+        const int tile_step = (dev_count > 0) ? tile_h / dev_count : 0;
+        const int tile_last = tile_h - (dev_count - 1) * tile_step;
+
+        /* One leading slot for the root plus one slot per device, all zeroed. */
+        std::vector<int> displsBuf(dev_count + 1, 0);
+        std::vector<int> recvcountsBuf(dev_count + 1, 0);
+        std::vector<int> displsByte(dev_count + 1, 0);
+        std::vector<int> recvcountsByte(dev_count + 1, 0);
+
+        for (int dev = 0; dev < dev_count; dev++)
+        {
+            const int band_y = tile_y + tile_step * dev;
+            const int band_h = (dev == dev_count - 1) ? tile_last : tile_step;
+            const int first_pixel = offset + tile_x + band_y * stride;
+
+            displsBuf[dev + 1] = first_pixel * pass_stride * sizeof (float);
+            recvcountsBuf[dev + 1] = tile_w * band_h * pass_stride * sizeof (float);
+
+            displsByte[dev + 1] = first_pixel * sizeof (uchar4);
+            recvcountsByte[dev + 1] = tile_w * band_h * sizeof (uchar4);
+        }
+
+        /* The root passes an empty send buffer (NULL, 0) and only receives. */
+        if (rgba_pixels != NULL)
+            MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+        else
+            MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+        tile.sample = end_sample;
+        task.update_progress(&tile);
+    }
+
+    virtual int get_tile_id() /* Current value of tile_id, the row counter advanced while offline rendering hands out rows. */
+    {
+        return tile_id;
+    }
+
+    virtual int get_num_tiles() /* Current value of num_tiles, which offline rendering sets to the tile height. */
+    {
+        return num_tiles;
+    }
+
+    /*
+     * Root side of offline (non-progressive) rendering: hand out blocks of
+     * tile_step rows to the devices on demand and gather the finished rows.
+     *
+     * Every loop iteration performs the same sequence of collectives, which the
+     * other ranks presumably mirror (their side is not part of this file):
+     *   1. gather one "sample_finished" int and one "row_finished" int per device,
+     *   2. gather the pixel data of the reported rows,
+     *   3. scatter the next start row per device (-1: no new job, -2: finished),
+     *   4. broadcast reqFinished.
+     * After the loop one more row/pixel gather collects the last results.
+     *
+     * All counts and displacements are in bytes (MPI_BYTE); slot 0 of every array
+     * belongs to the root itself, which neither sends nor receives payload.
+     * The offset and stride arguments are replaced by the tile's own values.
+     *
+     * NOTE(review): tile_y_node starts at tile.y but is compared against the tile
+     * height, which looks right only for tiles starting at row 0 - confirm.
+     */
+    void receive_path_buffer_offline(DeviceTask& task, RenderTile &tile, int offset, int stride)
+    {
+        const int dev_count = info.multi_devices.size();
+
+        const int tile_x = tile.x;
+        const int tile_y = tile.y;
+        const int tile_h = tile.h;
+        const int tile_w = tile.w;
+
+        offset = tile.offset;
+        stride = tile.stride;
+
+        num_tiles = tile_h;
+        tile_id = 0;
+
+        const int pass_stride = tile.buffers->params.get_passes_size();
+
+        /* Rows handed out per job; IT4I_OMP_TILE_STEP may override the default of 1. */
+        int tile_step = 1;
+        if (const char *env_step = getenv("IT4I_OMP_TILE_STEP"))
+        {
+            tile_step = atoi(env_step);
+            printf("IT4I_OMP_TILE_STEP: %d\n", tile_step);
+
+            /* A step below 1 would never advance tile_y_node, so the loop below could not finish. */
+            if (tile_step < 1)
+                tile_step = 1;
+        }
+
+        const int dev_countAll = dev_count + 1;
+        std::vector<int> displsBuf(dev_countAll);
+        std::vector<int> recvcountsBuf(dev_countAll);
+        displsBuf[0] = 0;
+        recvcountsBuf[0] = 0;
+
+        std::vector<int> displsByte(dev_countAll);
+        std::vector<int> recvcountsByte(dev_countAll);
+        displsByte[0] = 0;
+        recvcountsByte[0] = 0;
+
+        std::vector<int> sample_finished(dev_count);
+        std::vector<int> displsSample(dev_countAll);
+        std::vector<int> recvcountsSample(dev_countAll);
+        displsSample[0] = 0;
+        recvcountsSample[0] = 0;
+
+        std::vector<int> row_finished(dev_count);
+        std::vector<int> displsRow(dev_countAll);
+        std::vector<int> recvcountsRow(dev_countAll);
+        displsRow[0] = 0;
+        recvcountsRow[0] = 0;
+
+        std::vector<int> reqJob(dev_count);
+        std::vector<int> displsJob(dev_countAll);
+        std::vector<int> sendcountsJob(dev_countAll);
+        displsJob[0] = 0;
+        sendcountsJob[0] = 0;
+
+        int tile_y_node = tile_y + tile_step*dev_count;
+
+        for (int dev = 0; dev < dev_count; dev++)
+        {
+            int tile_y2 = tile_y + tile_step * dev;
+            int tile_h2 = tile_step;
+
+            displsBuf[dev + 1] = (offset + tile_x + tile_y2 * stride) * pass_stride * sizeof (float);
+            recvcountsBuf[dev + 1] = tile_w * tile_h2 * pass_stride * sizeof (float);
+
+            displsByte[dev + 1] = (offset + tile_x + tile_y2 * stride) * sizeof (uchar4);
+            recvcountsByte[dev + 1] = tile_w * tile_h2 * sizeof (uchar4);
+
+            displsSample[dev + 1] = dev * sizeof (int);
+            recvcountsSample[dev + 1] = sizeof (int);
+
+            displsRow[dev + 1] = dev * sizeof (int);
+            recvcountsRow[dev + 1] = sizeof (int);
+
+            displsJob[dev + 1] = dev * sizeof (int);
+            sendcountsJob[dev + 1] = sizeof (int);
+            reqJob[dev] = -1;
+        }
+
+        int reqFinished = 0;
+
+        while (true)
+        {
+            MPI_Gatherv(NULL, 0, MPI_BYTE, &sample_finished[0], &recvcountsSample[0], &displsSample[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+            MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+            for (int i = 0; i < dev_count; i++)
+            {
+                /* row_finished[i] is the destination row of device i's payload, */
+                /* so the displacements are recomputed before every pixel gather. */
+                const int first_pixel = offset + tile_x + row_finished[i] * stride;
+                displsBuf[i + 1] = first_pixel * pass_stride * sizeof (float);
+                displsByte[i + 1] = first_pixel * sizeof (uchar4);
+            }
+
+            if (rgba_pixels != NULL)
+                MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+            else
+                MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+            /* min_count drops to 0 only when a device reported 0 and got no new job. */
+            int min_count = 1;
+            for (int i = 0; i < dev_count; i++)
+            {
+                if (sample_finished[i] == 0 && tile_y_node < tile_h)
+                {
+                    /* Device i reported 0 and rows remain: give it the next block */
+                    /* and flag it so that it is not counted in min_count below. */
+                    reqJob[i] = tile_y_node;
+                    sample_finished[i] = 1;
+
+                    tile_y_node += tile_step;
+                    tile_id += tile_step;
+                }
+                else
+                {
+                    reqJob[i] = -1;
+                }
+
+                if (min_count > sample_finished[i])
+                    min_count = sample_finished[i];
+            }
+
+            task.update_progress(&tile);
+
+            if (reqFinished != 0)
+            {
+                for (int i = 0; i < dev_count; i++)
+                {
+                    reqJob[i] = -2;
+                }
+            }
+
+            /* Send every device its job, then tell all ranks whether this was the last round. */
+            MPI_Scatterv(&reqJob[0], &sendcountsJob[0], &displsJob[0], MPI_BYTE, NULL, 0, MPI_BYTE, 0, MPI_COMM_WORLD);
+            MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+            if (reqFinished != 0)
+            {
+                /* The -2 jobs and the flag were sent in this iteration: stop. */
+                break;
+            }
+
+            /* All rows are handed out and some device reported 0 without getting a job: finish next round. */
+            if (min_count == 0 && tile_y_node >= tile_h)
+            {
+                reqFinished = 1;
+            }
+
+            if (task_pool.canceled())
+            {
+                if (task.need_finish_queue == false)
+                    reqFinished = 1;
+            }
+        }
+
+        /* One more row/pixel gather after the loop, presumably for results sent after the finish signal. */
+        MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+        for (int i = 0; i < dev_count; i++)
+        {
+            displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float);
+            displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4);
+        }
+
+        if (rgba_pixels != NULL)
+            MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+        else
+            MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+
+    //    void receive_path_buffer_offline(DeviceTask& task, RenderTile &tile, int offset, int stride)
+    //    {
+    //        const int dev_count = info.multi_devices.size();
+    //
+    //        int tile_x = tile.buffers->params.full_x;
+    //        int tile_y = tile.buffers->params.full_y;
+    //        int tile_h = tile.buffers->params.height;
+    //        int tile_w = tile.buffers->params.width;
+    //
+    //        tile_x = tile.x;
+    //        tile_y = tile.y;
+    //        tile_h = tile.h;
+    //        tile_w = tile.w;
+    //
+    //        offset = tile.offset;
+    //        stride = tile.stride;
+    //
+    //        num_tiles = tile_h;
+    //        tile_id = 0;
+    //
+    //        int pass_stride = tile.buffers->params.get_passes_size();
+    //        int end_sample = tile.start_sample + tile.num_samples;
+    //
+    //        int tile_step = TILE_STEP;
+    //
+    //        const int dev_countAll = dev_count + 1;
+    //        std::vector<int> displsBuf(dev_countAll);
+    //        std::vector<int> recvcountsBuf(dev_countAll);
+    //        displsBuf[0] = 0;
+    //        recvcountsBuf[0] = 0;
+    //
+    //        std::vector<int> displsByte(dev_countAll);
+    //        std::vector<int> recvcountsByte(dev_countAll);
+    //        displsByte[0] = 0;
+    //        recvcountsByte[0] = 0;
+    //
+    //        std::vector<int> sample_finished(dev_count);
+    //        std::vector<int> displsSample(dev_countAll);
+    //        std::vector<int> recvcountsSample(dev_countAll);
+    //        displsSample[0] = 0;
+    //        recvcountsSample[0] = 0;
+    //
+    //        std::vector<int> row_finished(dev_count);
+    //        std::vector<int> displsRow(dev_countAll);
+    //        std::vector<int> recvcountsRow(dev_countAll);
+    //        displsRow[0] = 0;
+    //        recvcountsRow[0] = 0;
+    //
+    //        std::vector<int> reqJob(dev_count);
+    //        std::vector<int> displsJob(dev_countAll);
+    //        std::vector<int> sendcountsJob(dev_countAll);
+    //        displsJob[0] = 0;
+    //        sendcountsJob[0] = 0;
+    //
+    //        int tile_y_node = tile_y + dev_count*TILE_STEP;
+    //
+    //        for (int dev = 0; dev < dev_count; dev++)
+    //        {
+    //            int tile_y2 = tile_y + tile_step * dev;
+    //            int tile_h2 = tile_step;
+    //
+    //            displsBuf[dev + 1] = (offset + tile_x + tile_y2 * stride) * pass_stride * sizeof (float);
+    //            recvcountsBuf[dev + 1] = tile_w * tile_h2 * pass_stride * sizeof (float);
+    //
+    //            displsByte[dev + 1] = (offset + tile_x + tile_y2 * stride) * sizeof (uchar4);
+    //            recvcountsByte[dev + 1] = tile_w * tile_h2 * sizeof (uchar4);
+    //
+    //            displsSample[dev + 1] = dev * sizeof (int);
+    //            recvcountsSample[dev + 1] = sizeof (int);
+    //
+    //            displsRow[dev + 1] = dev * sizeof (int);
+    //            recvcountsRow[dev + 1] = sizeof (int);
+    //
+    //            displsJob[dev + 1] = dev * sizeof (int);
+    //            sendcountsJob[dev + 1] = sizeof (int);
+    //            reqJob[dev] = -1;
+    //        }
+    //
+    //        int reqFinished = 0;
+    //
+    //        while (true)
+    //        {
+    //            MPI_Gatherv(NULL, 0, MPI_BYTE, &sample_finished[0], &recvcountsSample[0], &displsSample[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    //
+    //            MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    //
+    //            for (int i = 0; i < dev_count; i++)
+    //            {
+    //                //printf("SERVER: sample_finished: %d\n", sample_finished[i]);
+    //                //printf("SERVER: row_finished: %d\n", row_finished[i]);
+    //
+    //                displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float);
+    //                displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4);
+    //            }
+    //
+    //            if (rgba_pixels != NULL)
+    //                MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    //            else
+    //                MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    //
+    //            int min_count = end_sample;
+    //            for (int i = 0; i < dev_count; i++)
+    //            {
+    //                if (min_count > sample_finished[i])
+    //                    min_count = sample_finished[i];
+    //
+    //                if (sample_finished[i] == end_sample && tile_y_node < tile_h)
+    //                {
+    //                    reqJob[i] = tile_y_node;
+    //
+    //                    //displsBuf[i + 1] = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+    //                    //displsByte[i + 1] = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4);
+    //
+    //                    tile_y_node+=TILE_STEP;
+    //
+    //                    tile_id+=TILE_STEP;
+    //                }
+    //                else
+    //                {
+    //                    reqJob[i] = -1;
+    //                }
+    //            }
+    //
+    //            task.update_progress(&tile);
+    //
+    //            if (reqFinished != 0)
+    //            {
+    //                for (int i = 0; i < dev_count; i++)
+    //                {
+    //                    reqJob[i] = -2;
+    //                }
+    //            }
+    //
+    //            MPI_Scatterv(&reqJob[0], &sendcountsJob[0], &displsJob[0], MPI_BYTE, NULL, 0, MPI_BYTE, 0, MPI_COMM_WORLD);
+    //            //MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    //
+    //            if (reqFinished != 0)
+    //            {
+    //                break;
+    //            }
+    //
+    //            if (min_count == end_sample && tile_y_node >= tile_h)
+    //            {
+    //                reqFinished = 1;
+    //            }
+    //            
+    //            if (task_pool.canceled())
+    //            {
+    //                if (task.need_finish_queue == false)
+    //                    reqFinished = 1;
+    //            }
+    //        }
+    //        
+    ////        {
+    ////            for (int d = 0; d < MAX_NODE_DEVICES; d++)
+    ////            {        
+    ////                MPI_Gatherv(NULL, 0, MPI_BYTE, &row_finished[0], &recvcountsRow[0], &displsRow[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    ////
+    ////                for (int i = 0; i < dev_count; i++)
+    ////                {
+    ////                    displsBuf[i + 1] = (offset + tile_x + row_finished[i] * stride) * pass_stride * sizeof (float);
+    ////                    displsByte[i + 1] = (offset + tile_x + row_finished[i] * stride) * sizeof (uchar4);
+    ////                }
+    ////
+    ////                if (rgba_pixels != NULL)
+    ////                    MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) rgba_pixels, &recvcountsByte[0], &displsByte[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    ////                else
+    ////                    MPI_Gatherv(NULL, 0, MPI_BYTE, (char*) tile.buffer, &recvcountsBuf[0], &displsBuf[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    ////            }
+    ////        }
+    //    }
+
+    /*
+     * PATH_TRACE task body on the root rank. For every tile acquired from the task:
+     *   - choose the geometry: the whole-buffer parameters for progressive tiles,
+     *     the tile's own x/y/w/h/offset/stride otherwise,
+     *   - hand the job to mpi_path_trace(),
+     *   - collect the pixels with the matching receive_path_buffer_*() method,
+     *   - release the tile.
+     * Nothing is done, and the loop is left early, when the task pool is canceled
+     * and the task does not set need_finish_queue.
+     */
+    void thread_path_trace(DeviceTask& task)
+    {
+        if (task_pool.canceled() && task.need_finish_queue == false)
+            return;
+
+        RenderTile tile;
+
+        while (task.acquire_tile(this, tile))
+        {
+            /* Progressive default: geometry of the whole buffer. */
+            int offset, stride;
+            tile.buffers->params.get_offset_stride(offset, stride);
+
+            int tile_x = tile.buffers->params.full_x;
+            int tile_y = tile.buffers->params.full_y;
+            int tile_h = tile.buffers->params.height;
+            int tile_w = tile.buffers->params.width;
+
+            /* The tile is marked as fully sampled before the work is sent out. */
+            tile.sample = tile.start_sample + tile.num_samples;
+
+            const bool progressive = tile.progressive;
+
+            if (!progressive)
+            {
+                /* Offline rendering works on the tile's own rectangle. */
+                tile_x = tile.x;
+                tile_y = tile.y;
+                tile_h = tile.h;
+                tile_w = tile.w;
+
+                offset = tile.offset;
+                stride = tile.stride;
+            }
+
+            mpi_path_trace(kernel_globals.__data_size,
+                    (char*) rgba_pixels,
+                    tile.half_float,
+                    (char*) tile.buffer,
+                    (char*) tile.rng_state,
+                    progressive,
+                    tile.start_sample,
+                    tile.num_samples,
+                    tile_x,
+                    tile_y,
+                    offset,
+                    stride,
+                    tile_h,
+                    tile_w);
+
+            /* NOTE(review): nothing verifies that tile_h is a multiple of the offline */
+            /* row step (IT4I_OMP_TILE_STEP); confirm that partial blocks are safe. */
+            if (progressive)
+                receive_path_buffer_progressive(task, tile, offset, stride);
+            else
+                receive_path_buffer_offline(task, tile, offset, stride);
+
+            /* tile_id ends at the tile height (equal to num_tiles after an offline tile). */
+            tile_id = tile_h;
+
+            task.release_tile(tile);
+
+            /* Same cancel rule as on entry. */
+            if (task_pool.canceled() && task.need_finish_queue == false)
+                break;
+        }
+    }
+
+    void thread_film_convert(DeviceTask& task) /* No-op: thread_run() dispatches FILM_CONVERT tasks here, but the body is empty. */
+    {
+    }
+
+    void thread_shader(DeviceTask& task) /* No-op: thread_run() dispatches SHADER tasks here, but the body is empty. */
+    {
+    }
+
+    bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer)
+    {
+        printf("get_pass_rect, sample: %d\n", sample);
+
+        int pass_offset = 0;
+
+        foreach(Pass& pass, params.passes)
+        {
+            if (pass.type != type)
+            {
+                pass_offset += pass.components;
+                continue;
+            }
+
+            float *in = (float*) buffer + pass_offset;
+            int pass_stride = params.get_passes_size();
+
+            float scale = (pass.filter) ? 1.0f / (float) sample : 1.0f;
+            float scale_exposure = (pass.exposure) ? scale * exposure : scale;
+
+            int size = params.width * params.height;
+
+            if (components == 1)
+            {
+                assert(pass.components == components);
+
+                /* scalar */
+                if (type == PASS_DEPTH)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //in += pass_stride, pixels++
+                        float f = in[i * pass_stride];
+                        pixels[i] = (f == 0.0f) ? 1e10f : f*scale_exposure;
+                    }
+                    in += size*pass_stride;
+                    pixels += size;
+                }
+                else if (type == PASS_MIST)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, pixels++
+                        float f = in[i * pass_stride];
+                        pixels[i] = saturate(f * scale_exposure);
+                    }
+                    in += size*pass_stride;
+                    pixels += size;
+                }
+#ifdef WITH_CYCLES_DEBUG
+                else if (type == PASS_BVH_TRAVERSAL_STEPS)
+                {
+                    for (int i = 0; i < size; i++, in += pass_stride, pixels++)
+                    {
+                        float f = *in;
+                        pixels[0] = f;
+                    }
+                }
+                else if (type == PASS_RAY_BOUNCES)
+                {
+                    for (int i = 0; i < size; i++, in += pass_stride, pixels++)
+                    {
+                        float f = *in;
+                        pixels[0] = f;
+                    }
+                }
+#endif
+                else
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //in += pass_stride, pixels++
+                        float f = in[i * pass_stride];
+                        pixels[i] = f*scale_exposure;
+                    }
+                    in += size*pass_stride;
+                    pixels += size;
+                }
+            }
+            else if (components == 3)
+            {
+                assert(pass.components == 4);
+
+                /* RGBA */
+                if (type == PASS_SHADOW)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, pixels += 3
+                        float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]);
+                        float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
+
+                        pixels[i * 3 + 0] = f.x*invw;
+                        pixels[i * 3 + 1] = f.y*invw;
+                        pixels[i * 3 + 2] = f.z*invw;
+                    }
+                    in += size*pass_stride;
+                    pixels += size * 3;
+
+                }
+                else if (pass.divide_type != PASS_NONE)
+                {
+                    /* RGB lighting passes that need to divide out color */
+                    pass_offset = 0;
+
+                    foreach(Pass& color_pass, params.passes)
+                    {
+                        if (color_pass.type == pass.divide_type)
+                            break;
+                        pass_offset += color_pass.components;
+                    }
+
+                    float *in_divide = (float*) buffer + pass_offset;
+
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, in_divide += pass_stride, pixels += 3
+                        float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]);
+                        float3 f_divide = make_float3(in_divide[i * pass_stride + 0], in_divide[i * pass_stride + 1], in_divide[i * pass_stride + 2]);
+
+                        f = safe_divide_even_color(f*exposure, f_divide);
+
+                        pixels[i * 3 + 0] = f.x;
+                        pixels[i * 3 + 1] = f.y;
+                        pixels[i * 3 + 2] = f.z;
+                    }
+
+                    in += size*pass_stride;
+                    in_divide += size*pass_stride;
+                    pixels += size * 3;
+                }
+                else
+                {
+                    /* RGB/vector */
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, pixels += 3
+                        float3 f = make_float3(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2]);
+
+                        pixels[i * 3 + 0] = f.x*scale_exposure;
+                        pixels[i * 3 + 1] = f.y*scale_exposure;
+                        pixels[i * 3 + 2] = f.z*scale_exposure;
+                    }
+
+                    in += size*pass_stride;
+                    pixels += size * 3;
+                }
+            }
+            else if (components == 4)
+            {
+                assert(pass.components == components);
+
+                /* RGBA */
+                if (type == PASS_SHADOW)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, pixels += 4
+                        float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]);
+                        float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
+
+                        pixels[i * 4 + 0] = f.x*invw;
+                        pixels[i * 4 + 1] = f.y*invw;
+                        pixels[i * 4 + 2] = f.z*invw;
+                        pixels[i * 4 + 3] = 1.0f;
+                    }
+
+                    in += size*pass_stride;
+                    pixels += size * 4;
+                }
+                else if (type == PASS_MOTION)
+                {
+                    /* need to normalize by number of samples accumulated for motion */
+                    pass_offset = 0;
+
+                    foreach(Pass& color_pass, params.passes)
+                    {
+                        if (color_pass.type == PASS_MOTION_WEIGHT)
+                            break;
+                        pass_offset += color_pass.components;
+                    }
+
+                    float *in_weight = (float*) buffer + pass_offset;
+
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, in_weight += pass_stride, pixels += 4
+                        float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]);
+                        float w = in_weight[i * pass_stride + 0];
+                        float invw = (w > 0.0f) ? 1.0f / w : 0.0f;
+
+                        pixels[i * 4 + 0] = f.x*invw;
+                        pixels[i * 4 + 1] = f.y*invw;
+                        pixels[i * 4 + 2] = f.z*invw;
+                        pixels[i * 4 + 3] = f.w*invw;
+                    }
+
+                    in += size*pass_stride;
+                    in_weight += size*pass_stride;
+                    pixels += size * 4;
+                }
+                else
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        //, in += pass_stride, pixels += 4
+                        float4 f = make_float4(in[i * pass_stride + 0], in[i * pass_stride + 1], in[i * pass_stride + 2], in[i * pass_stride + 3]);
+
+                        pixels[i * 4 + 0] = f.x*scale_exposure;
+                        pixels[i * 4 + 1] = f.y*scale_exposure;
+                        pixels[i * 4 + 2] = f.z*scale_exposure;
+
+                        /* clamp since alpha might be > 1.0 due to russian roulette */
+                        //pixels[i * 4 + 3] = saturate(f.w*scale);
+                        pixels[i * 4 + 3] = saturate(f.w);
+                    }
+
+                    in += size*pass_stride;
+                    pixels += size * 4;
+                }
+            }
+
+            return true;
+        }
+        return false;
+    }
+
+
+};
+
+Device *device_mpi_create(DeviceInfo& info, Stats &stats, bool background)
+{
+    return new MultiMPIDevice(info, stats, background); /* allocated with new; NOTE(review): the caller presumably deletes it - confirm */
+}
+
+bool device_mpi_init(void)
+{
+    return true; /* always reports success; no initialization work is done here */
+}
+
+/* Publishes the MPI backend in the list of available devices.
+ *
+ * Nothing is added when getCountOfDevices() reports fewer than one device.
+ * Otherwise a single DEVICE_MPI entry is inserted at the front of `devices`:
+ *   - id "MPI_0", num 0, description "MPI_<device count>";
+ *   - `multi_devices` holds one sub-entry per reported device, with id and
+ *     description "MPI_<index>" and num index + 1.
+ *
+ * Every entry sets advanced_shading to true and pack_images to false.
+ */
+void device_mpi_info(vector<DeviceInfo>& devices)
+{
+    if (getCountOfDevices() < 1)
+        return;
+
+    DeviceInfo mpi;
+
+    /* Identity of the top-level entry. */
+    mpi.num = 0;
+    mpi.type = DEVICE_MPI;
+    mpi.description = string_printf("MPI_%d", getCountOfDevices());
+    mpi.id = string_printf("MPI_0");
+
+    mpi.pack_images = false;
+    mpi.advanced_shading = true;
+
+    /* One sub-entry per reported device. */
+    int index = 0;
+    while (index < getCountOfDevices())
+    {
+        /* id and description of a sub-entry share the same text. */
+        const string label = string_printf("MPI_%d", index);
+
+        DeviceInfo node;
+
+        node.type = DEVICE_MPI;
+        node.num = index + 1;
+        node.id = label;
+        node.description = label;
+
+        node.advanced_shading = true;
+        node.pack_images = false;
+
+        mpi.multi_devices.push_back(node);
+        ++index;
+    }
+
+    /* Front insertion puts the MPI entry ahead of the devices already listed. */
+    devices.insert(devices.begin(), mpi);
+}
+
+string device_mpi_capabilities(void)
+{
+    return ""; /* no capability text is reported for the MPI device */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 069305e8a292cae9a0945744eb3894d8bcdb7189..8822c60115983f93f93d5895d89190d02498fc72 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -98,30 +98,30 @@ public:
 		return true;
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(mem, type);
+			sub.device->mem_alloc(name, mem, type);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}
 
 		mem.device_pointer = unique_ptr++;
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(const char *name, device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->mem_copy_to(mem);
+			sub.device->mem_copy_to(name, mem);
 		}
 
 		mem.device_pointer = tmp;
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem)
 	{
 		device_ptr tmp = mem.device_pointer;
 		int i = 0, sub_h = h/devices.size();
@@ -131,32 +131,32 @@ public:
 			int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
 
 			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->mem_copy_from(mem, sy, w, sh, elem);
+			sub.device->mem_copy_from(name, mem, sy, w, sh, elem);
 			i++;
 		}
 
 		mem.device_pointer = tmp;
 	}
 
-	void mem_zero(device_memory& mem)
+	void mem_zero(const char *name, device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->mem_zero(mem);
+			sub.device->mem_zero(name, mem);
 		}
 
 		mem.device_pointer = tmp;
 	}
 
-	void mem_free(device_memory& mem)
+	void mem_free(const char *name, device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->mem_free(mem);
+			sub.device->mem_free(name, mem);
 			sub.ptr_map.erase(sub.ptr_map.find(tmp));
 		}
 
@@ -186,13 +186,13 @@ public:
 		mem.device_pointer = unique_ptr++;
 	}
 
-	void tex_free(device_memory& mem)
+	void tex_free(const char *name, device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
-			sub.device->tex_free(mem);
+			sub.device->tex_free(name, mem);
 			sub.ptr_map.erase(sub.ptr_map.find(tmp));
 		}
 
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index cf4a05de8fc35f321b33fda81549432a202f3fe4..f0286d4a5559ff6d6837b3a0f274e3084482bf73 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -82,7 +82,7 @@ public:
 		snd.write();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		thread_scoped_lock lock(rpc_lock);
 
@@ -95,7 +95,7 @@ public:
 		snd.write();
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(const char *name, device_memory& mem)
 	{
 		thread_scoped_lock lock(rpc_lock);
 
@@ -106,7 +106,7 @@ public:
 		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem)
 	{
 		thread_scoped_lock lock(rpc_lock);
 
@@ -125,7 +125,7 @@ public:
 		rcv.read_buffer((void*)mem.data_pointer, data_size);
 	}
 
-	void mem_zero(device_memory& mem)
+	void mem_zero(const char *name, device_memory& mem)
 	{
 		thread_scoped_lock lock(rpc_lock);
 
@@ -135,7 +135,7 @@ public:
 		snd.write();
 	}
 
-	void mem_free(device_memory& mem)
+	void mem_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			thread_scoped_lock lock(rpc_lock);
@@ -186,7 +186,7 @@ public:
 		snd.write_buffer((void*)mem.data_pointer, mem.memory_size());
 	}
 
-	void tex_free(device_memory& mem)
+	void tex_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			thread_scoped_lock lock(rpc_lock);
@@ -654,7 +654,7 @@ protected:
 
 			task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
 			task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
-			task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this);
+			task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this, _1);
 			task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
 			task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
 
diff --git a/intern/cycles/device/device_omp.cpp b/intern/cycles/device/device_omp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..79dcf664651e499da9ea7649c97f76ede0205033
--- /dev/null
+++ b/intern/cycles/device/device_omp.cpp
@@ -0,0 +1,1354 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "device.h"
+#include "device_intern.h"
+
+#include "kernel.h"
+//#include "kernel_compat_omp.h"
+//
+//#include "kernel_types.h"
+//#include "kernel_globals.h"
+//
+//#include "buffers.h"
+//
+//#include "util_debug.h"
+#include "util_foreach.h"
+//#include "util_function.h"
+//#include "util_logging.h"
+//#include "util_opengl.h"
+//#include "util_progress.h"
+//#include "util_system.h"
+//#include "util_thread.h"
+//
+//#include "kernel_omp.h"
+
+#include <boost/algorithm/string.hpp>
+#include <omp.h>
+
+#include "kernel_omp.h"
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#include "kernel_mic.h"
+#endif
+
+#define SIZEOF_UCHAR4 (sizeof(unsigned char)*4)
+
+CCL_NAMESPACE_BEGIN
+
+class OMPDevice : public Device
+{
+public:
+    DedicatedTaskPool task_pool;
+    device_ptr kernel_globals_cpu;
+    std::vector<device_ptr> kernel_globals_mics;
+
+    device_ptr rgba_pixels;
+    int tile_id;
+    int num_tiles;
+
+    OMPDevice(DeviceInfo& info, Stats &stats, bool background)
+    : Device(info, stats, background)
+    {
+        printf("OMPDevice\n");
+
+        /* No "pixel" buffer registered yet; tile counters start at zero. */
+        rgba_pixels = 0;
+        tile_id = 0;
+        num_tiles = 0;
+
+        /* One kernel-globals handle for the host, one slot per sub-device. */
+        kernel_globals_mics.resize(info.multi_devices.size());
+        kernel_globals_cpu = omp_alloc_kg(info.num);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (size_t mic = 0; mic < kernel_globals_mics.size(); ++mic)
+            kernel_globals_mics[mic] = mic_alloc_kg(info.multi_devices[mic].num);
+#endif
+    }
+
+    ~OMPDevice()
+    {
+        printf("~OMPDevice\n");
+        /* Stop the pool first: its tasks run thread_run() on this device
+         * and must not outlive the kernel globals released below. */
+        task_pool.stop();
+        omp_free_kg(info.num, kernel_globals_cpu);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+        {
+            mic_free_kg(info.multi_devices[dev].num, kernel_globals_mics[dev]);
+        }
+#endif
+    }
+
+    void mem_alloc(const char *name, device_memory& mem, MemoryType type)
+    {
+        printf("mem_alloc: %s\n", name);
+
+        /* The device pointer aliases the host data pointer. */
+        mem.device_pointer = mem.data_pointer;
+        mem.device_size = mem.memory_size();
+        stats.mem_alloc(mem.device_size);
+
+        /* Remember the buffer named "pixel"; the path tracers pass it to the kernels. */
+        if (strcmp(name, "pixel") == 0)
+            rgba_pixels = mem.device_pointer;
+
+        char *base = (char*) mem.device_pointer;
+        omp_mem_alloc(info.num, base, mem.device_size);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        /* Announce the same range to every offload device. */
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_mem_alloc(info.multi_devices[mic].num, base, mem.device_size);
+        }
+#endif
+    }
+
+    void mem_copy_to(const char *name, device_memory& mem)
+    {
+        printf("mem_copy_to: %s\n", name);
+        char *src = (char*) mem.device_pointer;
+        /* Host backend first, then each offload device (when compiled in). */
+        omp_mem_copy_to(info.num, src, mem.device_size, NULL);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_mem_copy_to(info.multi_devices[mic].num, src, mem.device_size, NULL);
+        }
+#endif
+    }
+
+    void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem)
+    {   /* No-op: this device performs no copy in this call. */
+    }
+
+    void mem_zero(const char *name, device_memory& mem)
+    {
+        printf("mem_zero: %s\n", name);
+
+        /* Nothing was allocated, so there is nothing to clear. */
+        if (!mem.device_pointer)
+            return;
+
+        /* Clear the host bytes, then forward the request to each backend. */
+        memset((void*) mem.device_pointer, 0, mem.memory_size());
+        omp_mem_zero(info.num, (char*) mem.device_pointer, mem.device_size);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_mem_zero(info.multi_devices[mic].num, (char*) mem.device_pointer, mem.device_size);
+        }
+#endif
+    }
+
+    void mem_free(const char *name, device_memory& mem)
+    {
+        printf("mem_free: %s\n", name);
+
+        /* Already released, or never allocated. */
+        if (!mem.device_pointer)
+            return;
+
+        /* Forget the cached pointer to the buffer named "pixel". */
+        if (strcmp(name, "pixel") == 0)
+            rgba_pixels = NULL;
+
+        omp_mem_free(info.num, (char*) mem.device_pointer, mem.device_size);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_mem_free(info.multi_devices[mic].num, (char*) mem.device_pointer, mem.device_size);
+        }
+#endif
+
+        /* Report the released bytes before the size field is reset. */
+        stats.mem_free(mem.device_size);
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+    }
+
+    void const_copy_to(const char *name, void *host, size_t size)
+    {
+        printf("const_copy_to: %s\n", name);
+        char *bytes = (char*) host;
+        /* Forward the named data to the host backend, then to each offload device. */
+        omp_const_copy(info.num, kernel_globals_cpu, name, bytes, size);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_const_copy(info.multi_devices[mic].num, kernel_globals_mics[mic], name, bytes, size);
+        }
+#endif
+    }
+
+    void tex_alloc(const char *name,
+            device_memory& mem,
+            InterpolationType interpolation,
+            ExtensionType extension)
+    {
+        printf("tex_alloc: %s\n", name);
+
+        /* The device pointer aliases the host data pointer; the size is
+         * recorded in the memory statistics. */
+        mem.device_pointer = mem.data_pointer;
+        mem.device_size = mem.memory_size();
+        stats.mem_alloc(mem.device_size);
+
+        char *texels = (char*) mem.device_pointer;
+        const int extension_id = (int) extension;
+
+        /* Pass the texture's name, bytes, dimensions and sampling settings
+         * to the host backend. */
+        omp_tex_copy(info.num, kernel_globals_cpu,
+                name,
+                texels, mem.device_size,
+                mem.data_width, mem.data_height, mem.data_depth,
+                interpolation, extension_id);
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        /* Same arguments again for every offload device. */
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_tex_copy(info.multi_devices[mic].num, kernel_globals_mics[mic],
+                    name,
+                    texels, mem.device_size,
+                    mem.data_width, mem.data_height, mem.data_depth,
+                    interpolation, extension_id);
+        }
+#endif
+    }
+
+    void tex_free(const char *name, device_memory& mem)
+    {
+        printf("tex_free: %s\n", name);
+
+        /* Nothing to do for a texture that holds no device pointer. */
+        if (!mem.device_pointer)
+            return;
+
+        omp_tex_free(info.num, kernel_globals_cpu, name, (char*) mem.device_pointer, mem.device_size);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int mic = 0; mic < info.multi_devices.size(); ++mic)
+        {
+            mic_tex_free(info.multi_devices[mic].num, kernel_globals_mics[mic], name, (char*) mem.device_pointer, mem.device_size);
+        }
+#endif
+        /* Account for the released bytes, then mark the memory as freed. */
+        stats.mem_free(mem.device_size);
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+    }
+
+    void *osl_memory()
+    {
+        return NULL; /* always NULL: no OSL memory is provided by this device */
+    }
+
+    void thread_run(DeviceTask *task)
+    {
+        printf("thread_run: %d\n", task->type);
+
+        /* Dispatch on the task type and report the elapsed wall-clock time. */
+        const double started = omp_get_wtime();
+        switch (task->type)
+        {
+            case DeviceTask::PATH_TRACE: thread_path_trace(*task); break;
+            case DeviceTask::FILM_CONVERT: thread_film_convert(*task); break;
+            case DeviceTask::SHADER: thread_shader(*task); break;
+            default: break; /* any other task type is ignored */
+        }
+        const double finished = omp_get_wtime();
+        printf("DEVICE: OMP, %f\n", finished - started);
+    }
+
+    class OMPDeviceTask : public DeviceTask
+    {
+    public:
+        /* Copies `task` and binds `run` to device->thread_run(this). */
+        OMPDeviceTask(OMPDevice *device, DeviceTask& task)
+        : DeviceTask(task)
+        {
+            run = function_bind(&OMPDevice::thread_run, device, this);
+        }
+    };
+
+    //    void path_trace(size_t kg_data_size, char *buffer, char *rng_state, bool progressive, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile)
+    //    {
+    //        ///////////////////////////share nodes////////////////////////////////////
+    //
+    //        size_t offsetSample = 0;
+    //        size_t sizeSample = sizeof (int);
+    //
+    //        int reqFinished = 0;
+    //
+    //#ifdef WITH_IT4I_MIC_OFFLOAD
+    //        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+    //        {
+    //            mic_mem_alloc(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int));
+    //        }
+    //#endif        
+    //
+    //        int end_sample = start_sample + num_samples;
+    //
+    //        int pass_stride = omp_get_pass_stride(kernel_globals_cpu);
+    //
+    //        ////////////////////////////one node///////////////////////////////////    
+    //        omp_set_nested(1);
+    //        int nprocs_cpu = omp_get_max_threads() - 1;
+    //        //printf("nprocs_cpu: %d\n", nprocs_cpu);
+    //
+    //        int tile_y_node = tile_y;
+    //        int tile_h_node = tile_h;
+    //
+    //        int size_node = tile_h_node * tile_w;
+    //
+    //        size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+    //        size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+    //
+    //        size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4);
+    //        size_t sizeByte_node = size_node * sizeof (uchar4);
+    //
+    //#ifdef WITH_IT4I_MIC_OFFLOAD        
+    //        int devices_size_cpu_mics = info.multi_devices.size() + 2;
+    //
+    //        int tile_step_cpu_mics = tile_h_node / devices_size_cpu_mics;
+    //        int dev_cpu_mics = 0;
+    //
+    //        //////////////////////////mic0////////////////////////////////////
+    //        vector<int> sample_finished_mic0(info.multi_devices.size());
+    //        vector<int> tile_y_mic0(info.multi_devices.size());
+    //        vector<int> tile_h_mic0(info.multi_devices.size());
+    //        vector<int> size_mic0(info.multi_devices.size());
+    //        vector<size_t> offsetBuf_mic0(info.multi_devices.size());
+    //        vector<size_t> sizeBuf_mic0(info.multi_devices.size());
+    //        vector<size_t> offsetByte_mic0(info.multi_devices.size());
+    //        vector<size_t> sizeByte_mic0(info.multi_devices.size());
+    //
+    //        //sync
+    //        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+    //        {
+    //            sample_finished_mic0[dev] = 0;
+    //
+    //            mic_mem_alloc(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], sizeof (int));
+    //        }
+    //
+    //        //async
+    //        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+    //        {
+    //            dev_cpu_mics = dev;
+    //
+    //            //sample_finished_mic0[dev] = 0;
+    //            //mic_mem_alloc(info.multi_devices[dev].num, (char*)&sample_finished_mic0[dev], sizeof(int));
+    //
+    //            tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+    //            tile_h_mic0[dev] = tile_step_cpu_mics;
+    //
+    //            size_mic0[dev] = tile_h_mic0[dev] * tile_w;
+    //
+    //            offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float);
+    //            sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float);
+    //
+    //            offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * sizeof (uchar4);
+    //            sizeByte_mic0[dev] = size_mic0[dev] * sizeof (uchar4);
+    //
+    //            mic_path_trace(info.multi_devices[dev].num, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, 240, (char *) rng_state);
+    //        }
+    //        dev_cpu_mics = info.multi_devices.size();
+    //
+    //        //////////////////////////cpu/////////////////////////////////////        
+    //        int tile_y_cpu = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+    //        int tile_h_cpu = tile_h_node - (devices_size_cpu_mics - 2) * tile_step_cpu_mics;
+    //#else        
+    //        //////////////////////////cpu/////////////////////////////////////
+    //
+    //        int tile_y_cpu = tile_y_node;
+    //        int tile_h_cpu = tile_h_node;
+    //#endif        
+    //
+    //        int sample_finished_cpu = 0;
+    //
+    //        //        int size_cpu = tile_h_cpu * tile_w;
+    //        //
+    //        //        size_t offsetBuf_cpu = (offset + tile_x + tile_y_cpu * stride) * pass_stride * sizeof (float);
+    //        //        size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float);
+    //        //
+    //        //        size_t offsetByte_cpu = (offset + tile_x + tile_y_cpu * stride) * sizeof (uchar4);
+    //        //        size_t sizeByte_cpu = size_cpu * sizeof (uchar4);
+    //        //////////////////////////////////////////////////////////////////
+    //
+    //#pragma omp parallel num_threads(2)
+    //        {
+    //#pragma omp single nowait
+    //            {
+    //#pragma omp task
+    //                {
+    //                    omp_path_trace(info.num, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL);
+    //                }
+    //
+    //#pragma omp task
+    //                {
+    //                    while (true)
+    //                    {
+    //                        int sample_finished = sample_finished_cpu;
+    //
+    //#ifdef WITH_IT4I_MIC_OFFLOAD                        
+    //                        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+    //                        {
+    //                            mic_mem_copy_to(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int), (char*) &reqFinished);
+    //                            mic_mem_copy_from(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], 0, sizeof (int), (char*) &sample_finished_mic0[dev]);
+    //
+    //                            if (rgba_pixels != NULL)
+    //                            {
+    //                                mic_mem_copy_from(info.multi_devices[dev].num, (char*) rgba_pixels, offsetByte_mic0[dev], sizeByte_mic0[dev], (char*) rgba_pixels);
+    //                            }
+    //                            else
+    //                            {
+    //                                mic_mem_copy_from(info.multi_devices[dev].num, (char*) buffer, offsetBuf_mic0[dev], sizeBuf_mic0[dev], (char*) buffer);
+    //                            }
+    //
+    //                            mic_wait(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev]);
+    //
+    //                            if (sample_finished_mic0[dev] < sample_finished)
+    //                                sample_finished = sample_finished_mic0[dev];
+    //                        }
+    //
+    //#endif                         
+    //
+    //                        if (sample_finished > 0 && tile.sample != sample_finished)
+    //                        {
+    //                            tile.sample = sample_finished;
+    //                            task.update_progress(&tile);
+    //                        }
+    //
+    //                        if (reqFinished != 0)
+    //                        {
+    //                            break;
+    //                        }
+    //
+    //                        if (sample_finished == end_sample)
+    //                        {
+    //                            reqFinished = 1;
+    //                        }
+    //
+    //                        if (task_pool.canceled())
+    //                        {
+    //                            if (task.need_finish_queue == false)
+    //                                reqFinished = 1;
+    //                        }
+    //                    }
+    //                }
+    //            }
+    //
+    //#pragma omp taskwait  
+    //        }
+    //
+    //#ifdef WITH_IT4I_MIC_OFFLOAD        
+    //        for (int dev = 0; dev < info.multi_devices.size(); dev++)
+    //        {
+    //            //mic_wait(info.multi_devices[dev].num, (char*)&reqFinished);
+    //            mic_mem_free(info.multi_devices[dev].num, (char*) &sample_finished_mic0[dev], sizeof (int));
+    //            mic_mem_free(info.multi_devices[dev].num, (char*) &reqFinished, sizeof (int));
+    //        }
+    //#endif          
+    //    }
+
+    virtual int get_tile_id()
+    {
+        return tile_id; /* 0 after construction; path_trace_progressive() stores tile_h here */
+    };
+
+    virtual int get_num_tiles()
+    {
+        return num_tiles; /* initialised to 0 by the constructor */
+    };
+
+    /* Renders samples [start_sample, start_sample + num_samples) of one tile.
+     *
+     * The tile rows are cut into (offload devices + 2) equal bands. Each
+     * offload device is given one band, starting at the top of the tile, and
+     * the host renders all remaining rows. Offload work is issued first
+     * (marked "async" by the original author), the host band is traced on the
+     * calling thread, and the offloaded bands are then waited for and copied
+     * back. Finally tile_id is set to tile_h and task.update_progress(&tile)
+     * is called.
+     *
+     * buffer / rng_state are forwarded untouched to the kernels; kg_data_size
+     * is not used here.
+     *
+     * tile_x / tile_y / tile_w / tile_h, offset and stride are passed through to
+     * the kernels and also used to compute the copy-back byte ranges.
+     *
+     * Environment overrides (parsed with atoi):
+     *   IT4I_OMP_CPU_NUM_THREADS - host thread count
+     *   IT4I_OMP_MIC_NUM_THREADS - offload thread count (default 240)
+     */
+    void path_trace_progressive(size_t kg_data_size, char *buffer, char *rng_state, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile)
+    {
+        /* Flag whose address goes to every kernel call; never written here. */
+        int reqFinished = 0;
+
+        /* Rows may only be reserved for devices that will actually render
+         * them, so without offload support the host takes the whole tile. */
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        const int num_mics = (int) kernel_globals_mics.size();
+#else
+        const int num_mics = 0;
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < num_mics; dev++)
+        {
+            mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+        }
+#endif
+
+        int end_sample = start_sample + num_samples;
+        int pass_stride = omp_get_pass_stride(kernel_globals_cpu);
+
+        /* Leave one thread free, but never ask for fewer than one worker
+         * (omp_get_max_threads() may legitimately return 1). */
+        int nprocs_mic = 240;
+        int nprocs_cpu = omp_get_max_threads() - 1;
+        if (nprocs_cpu < 1)
+            nprocs_cpu = 1;
+
+        const char *cpu_threads_env = getenv("IT4I_OMP_CPU_NUM_THREADS");
+        if (cpu_threads_env)
+        {
+            nprocs_cpu = atoi(cpu_threads_env);
+            printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu);
+        }
+
+        const char *mic_threads_env = getenv("IT4I_OMP_MIC_NUM_THREADS");
+        if (mic_threads_env)
+        {
+            nprocs_mic = atoi(mic_threads_env);
+            printf("IT4I_OMP_MIC_NUM_THREADS: %d\n", nprocs_mic);
+        }
+
+        /* This device renders the whole tile it was handed. */
+        int tile_y_node = tile_y;
+        int tile_h_node = tile_h;
+
+        /* Band height: two shares stay with the host, one per offload device. */
+        int devices_size_cpu_mics = num_mics + 2;
+        int tile_step_cpu_mics = tile_h_node / devices_size_cpu_mics;
+
+        //////////////////////////mic0////////////////////////////////////
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        /* NOTE(review): only two signal variables exist, so devices with
+         * index >= 2 get a band but no mic_path_trace() call - confirm that
+         * more than two offload devices are never configured. */
+        int signal1 = 0, signal2 = 0;
+
+        std::vector<int> sample_finished_mic0(num_mics);
+        std::vector<int> tile_y_mic0(num_mics);
+        std::vector<int> tile_h_mic0(num_mics);
+        std::vector<int> size_mic0(num_mics);
+        std::vector<size_t> offsetBuf_mic0(num_mics);
+        std::vector<size_t> sizeBuf_mic0(num_mics);
+        std::vector<size_t> offsetByte_mic0(num_mics);
+        std::vector<size_t> sizeByte_mic0(num_mics);
+
+        //sync
+        for (int dev = 0; dev < num_mics; dev++)
+        {
+            sample_finished_mic0[dev] = 0;
+
+            mic_mem_alloc(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+        }
+
+        //async
+        for (int dev = 0; dev < num_mics; dev++)
+        {
+            /* Band `dev`, counted from the top of the tile. */
+            tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev;
+            tile_h_mic0[dev] = tile_step_cpu_mics;
+
+            size_mic0[dev] = tile_h_mic0[dev] * tile_w;
+
+            /* Byte ranges of this band in the float pass buffer and in the
+             * 4-bytes-per-pixel buffer, used for the copy-back below. */
+            offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float);
+            sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float);
+
+            offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * SIZEOF_UCHAR4;
+            sizeByte_mic0[dev] = size_mic0[dev] * SIZEOF_UCHAR4;
+
+            if (dev == 0)
+                mic_path_trace(dev, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, signal1);
+
+            if (dev == 1)
+                mic_path_trace(dev, kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, signal2);
+        }
+#endif
+        //////////////////////////cpu/////////////////////////////////////
+
+        /* Counter whose address goes to the host kernel; not read back here. */
+        int sample_finished_cpu = 0;
+
+        /* The host band starts below the offloaded bands and runs to the end
+         * of the tile, so it also absorbs the integer-division remainder. */
+        int tile_y_cpu = tile_y_node + tile_step_cpu_mics * num_mics;
+        int tile_h_cpu = tile_h_node - num_mics * tile_step_cpu_mics;
+
+        omp_path_trace(-1, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL);
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < num_mics; dev++)
+        {
+            if (dev == 0)
+                mic_wait(dev, signal1);
+
+            if (dev == 1)
+                mic_wait(dev, signal2);
+
+            /* Fetch the band from the device: the buffer registered under the
+             * name "pixel" when there is one, the pass buffer otherwise. */
+            if (rgba_pixels != NULL)
+            {
+                mic_mem_copy_from(dev, (char*) rgba_pixels, offsetByte_mic0[dev], sizeByte_mic0[dev], NULL);
+            }
+            else
+            {
+                mic_mem_copy_from(dev, (char*) buffer, offsetBuf_mic0[dev], sizeBuf_mic0[dev], NULL);
+            }
+        }
+
+        /* Release the per-call registrations made at the top of the function. */
+        for (int dev = 0; dev < num_mics; dev++)
+        {
+            mic_mem_free(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+            mic_mem_free(dev, (char*) &reqFinished, sizeof (int));
+        }
+#endif
+
+        tile_id = tile_h;
+        task.update_progress(&tile);
+    }
+
+    /*
+     * Render one tile in non-progressive ("offline") mode and report progress.
+     *
+     * Without MIC devices, every pixel of the tile is traced on the host inside a
+     * single OpenMP loop. With MIC devices, the tile is cut into bands of
+     * IT4I_OMP_TILE_STEP rows (default 1); a scheduler task hands the next band to
+     * any device whose sample counter equals end_sample (device 0 is the host CPU,
+     * device n > 0 is MIC n - 1).
+     *
+     * kg_data_size        not used by this function.
+     * buffer, rng_state   forwarded unchanged to the path tracing kernels.
+     * start_sample        first sample index to trace.
+     * num_samples         number of samples; tracing covers [start_sample, start_sample + num_samples).
+     * tile_x, tile_y      origin of the tile.
+     * offset, stride      buffer addressing values forwarded to the kernels.
+     * tile_h, tile_w      tile height and width.
+     * task                used for update_progress() and the need_finish_queue flag.
+     * tile                passed to task.update_progress(); tile.half_float selects the
+     *                     display conversion when rgba_pixels is set.
+     *
+     * Environment overrides: IT4I_OMP_TILE_STEP, IT4I_OMP_CPU_NUM_THREADS and
+     * IT4I_OMP_MIC_NUM_THREADS. Each resulting value is clamped to at least 1,
+     * because a zero step would stop the band scheduler from ever advancing and a
+     * non-positive thread count is not a valid num_threads() argument.
+     */
+    void path_trace_offline(size_t kg_data_size, char *buffer, char *rng_state, int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, DeviceTask& task, RenderTile &tile)
+    {
+        /* Completion flag: set by the scheduler task, polled by the CPU worker task
+         * and handed to omp_path_trace()/mic_path_trace(). */
+        int reqFinished = 0;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < kernel_globals_mics.size(); dev++)
+        {
+            mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+        }
+#endif
+
+        int end_sample = start_sample + num_samples;
+        int pass_stride = omp_get_pass_stride(kernel_globals_cpu);
+
+        ////////////////////////////one node///////////////////////////////////
+        omp_set_nested(1);
+
+        /* Number of tile rows handed to a device per scheduling step. */
+        int tile_step_node = 1;
+
+        const char *env_tile_step = getenv("IT4I_OMP_TILE_STEP");
+        if (env_tile_step)
+        {
+            tile_step_node = atoi(env_tile_step);
+            printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node);
+        }
+
+        /* A step below 1 would keep tile_y_node from ever reaching tile_h. */
+        if (tile_step_node < 1)
+            tile_step_node = 1;
+
+        int tile_h_node = tile_step_node;
+        int omp_path_trace_req = 0;
+
+        int size_node = tile_h_node * tile_w;
+
+        /* Byte sizes of one band in the render buffer and in the display buffer;
+         * widened before multiplying so the product is computed in size_t. */
+        size_t sizeBuf_node = (size_t) size_node * pass_stride * sizeof (float);
+        size_t sizeByte_node = (size_t) size_node * SIZEOF_UCHAR4;
+
+        ////////////////////////////MICS//////////////////////////////////////
+        int signal1 = 0, signal2 = 0;
+
+        const int num_devices_cpu_mics = kernel_globals_mics.size() + 1;
+
+        /* Per-device sample counter; index 0 is the host CPU. */
+        std::vector<int> sample_finished_devices(num_devices_cpu_mics);
+
+        const char *env_cpu_threads = getenv("IT4I_OMP_CPU_NUM_THREADS");
+
+        if (kernel_globals_mics.size() == 0)
+        {
+            /* Host only: trace every pixel of the tile in one dynamic OpenMP loop. */
+            int nprocs_cpu = omp_get_max_threads();
+
+            if (env_cpu_threads)
+            {
+                nprocs_cpu = atoi(env_cpu_threads);
+                printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu);
+            }
+
+            if (nprocs_cpu < 1)
+                nprocs_cpu = 1;
+
+            int size = tile_h*tile_w;
+            num_tiles = size;
+            sample_finished_devices[0] = start_sample;
+
+#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1)
+            for (int i = 0; i < size; i++)
+            {
+                int y = i / tile_w;
+                int x = i - y * tile_w;
+
+                for (int sample = start_sample; sample < end_sample; sample++)
+                {
+                    omp_kernel_path_trace(kernel_globals_cpu, (float *) buffer, (unsigned int*) rng_state, sample, x + tile_x, y + tile_y, offset, stride);
+
+                    if (rgba_pixels != NULL)
+                    {
+                        float sample_scale = 1.0f / (sample + 1.0f);
+
+                        if (tile.half_float)
+                            omp_convert_to_half_float(kernel_globals_cpu, (char*) rgba_pixels, (float *) buffer, sample_scale, x + tile_x, y + tile_y, offset, stride);
+                        else
+                            omp_film_convert_byte(kernel_globals_cpu, (char*) rgba_pixels, (float *) buffer, sample_scale, x + tile_x, y + tile_y, offset, stride);
+                    }
+                }
+
+                /* Only thread 0 publishes progress. */
+                int tid = omp_get_thread_num();
+                if (tid == 0)
+                {
+                    tile_id = i;
+                    task.update_progress(&tile);
+                }
+            }
+
+            sample_finished_devices[0] = end_sample;
+
+            tile_id = size;
+        }
+        else
+        {
+            int nprocs_mic = 240;
+            /* One host thread less than the maximum is requested for tracing. */
+            int nprocs_cpu = omp_get_max_threads() - 1;
+
+            if (env_cpu_threads)
+            {
+                nprocs_cpu = atoi(env_cpu_threads) - 1;
+                printf("IT4I_OMP_CPU_NUM_THREADS: %d\n", nprocs_cpu);
+            }
+
+            const char *env_mic_threads = getenv("IT4I_OMP_MIC_NUM_THREADS");
+            if (env_mic_threads)
+            {
+                nprocs_mic = atoi(env_mic_threads);
+                printf("IT4I_OMP_MIC_NUM_THREADS: %d\n", nprocs_mic);
+            }
+
+            if (nprocs_cpu < 1)
+                nprocs_cpu = 1;
+
+            if (nprocs_mic < 1)
+                nprocs_mic = 1;
+
+            /* First row of the band currently assigned to each device; 0 doubles as
+             * "nothing assigned yet" for the MIC copy-back below. */
+            std::vector<int> tile_y_devices(num_devices_cpu_mics);
+            /* Next row that has not been handed out yet. */
+            int tile_y_node = tile_y;
+
+            for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+            {
+                /* end_sample marks a device as idle, so the scheduler gives it work. */
+                sample_finished_devices[dev] = end_sample;
+                tile_y_devices[dev] = 0;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+                if (dev > 0)
+                {
+                    //sync
+                    mic_mem_alloc(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int));
+                }
+#endif
+            }
+            //////////////////////////////////////////////////////////////////
+
+#pragma omp parallel num_threads(2)
+            {
+#pragma omp single nowait
+                {
+                    /* CPU worker: waits for a request from the scheduler and traces the
+                     * band stored in tile_y_devices[0]. */
+#pragma omp task
+                    {
+                        while (reqFinished == 0)
+                        {
+#pragma omp flush
+                            if (omp_path_trace_req != 0)
+                            {
+                                printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", 0, sample_finished_devices[0], end_sample, tile_y_devices[0], tile_h);
+                                omp_path_trace(-1, kernel_globals_cpu, (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_cpu, NULL);
+                                omp_path_trace_req = 0;
+                            }
+                            usleep(100);
+                        }
+                    }
+
+                    /* Scheduler: polls the per-device counters, copies MIC results back
+                     * and hands out the next band until the tile is done or cancelled. */
+#pragma omp task
+                    {
+                        while (true)
+                        {
+                            int min_count = end_sample;
+
+                            for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+                            {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+                                if (dev > 0)
+                                {
+                                    if (tile_y_devices[dev] != 0)
+                                    {
+                                        if (rgba_pixels != NULL)
+                                        {
+                                            size_t offsetByte_node = (offset + tile_x + tile_y_devices[dev] * stride) * SIZEOF_UCHAR4;
+                                            mic_mem_copy_from(dev - 1, (char*) rgba_pixels, offsetByte_node, sizeByte_node, NULL);
+                                        }
+                                        else
+                                        {
+                                            size_t offsetBuf_node = (offset + tile_x + tile_y_devices[dev] * stride) * pass_stride * sizeof (float);
+                                            mic_mem_copy_from(dev - 1, (char*) buffer, offsetBuf_node, sizeBuf_node, NULL);
+                                        }
+                                    }
+                                }
+#endif
+
+#pragma omp flush
+                                if (min_count > sample_finished_devices[dev])
+                                    min_count = sample_finished_devices[dev];
+
+                                /* NOTE(review): tile_y_node starts at tile_y but is compared
+                                 * against tile_h (a height); this looks correct only when
+                                 * tile_y == 0 - confirm against the callers. */
+                                if (sample_finished_devices[dev] == end_sample && tile_y_node < tile_h)
+                                {
+                                    sample_finished_devices[dev] = start_sample;
+                                    tile_y_devices[dev] = tile_y_node;
+
+                                    tile_y_node += tile_step_node;
+                                    tile_id += tile_step_node;
+
+                                    if (tile_y_node > tile_h)
+                                    {
+                                        tile_y_node = tile_h;
+                                        tile_id = tile_h;
+                                    }
+
+                                    if (dev == 0)
+                                    {
+                                        omp_path_trace_req = 1;
+                                    }
+#ifdef WITH_IT4I_MIC_OFFLOAD
+                                    else
+                                    {
+                                        printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h);
+
+                                        if (dev == 1)
+                                            mic_path_trace(dev - 1, kernel_globals_mics[dev - 1], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal1);
+
+                                        if (dev == 2)
+                                            mic_path_trace(dev - 1, kernel_globals_mics[dev - 1], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal2);
+                                    }
+#endif
+                                }
+                            }
+
+                            task.update_progress(&tile);
+
+                            /* All devices idle and no rows left: the tile is complete. */
+                            if (min_count == end_sample && tile_y_node >= tile_h)
+                            {
+                                reqFinished = 1;
+                            }
+
+                            if (task_pool.canceled())
+                            {
+                                if (task.need_finish_queue == false)
+                                    reqFinished = 1;
+                            }
+
+                            if (reqFinished != 0)
+                            {
+                                break;
+                            }
+
+                        }
+                    }
+                }
+
+#pragma omp taskwait
+            }
+
+            printf("tasks finished\n");
+#ifdef WITH_IT4I_MIC_OFFLOAD
+            for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+            {
+                if (dev > 0)
+                {
+                    if (dev == 1)
+                        mic_wait(dev - 1, signal1);
+
+                    if (dev == 2)
+                        mic_wait(dev - 1, signal2);
+
+                    mic_mem_free(dev - 1, (char*) &reqFinished, sizeof (int));
+                    mic_mem_free(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int));
+                }
+            }
+#endif
+
+            tile_id = tile_h;
+        }
+
+        task.update_progress(&tile);
+    }
+
+    /*
+     * Worker entry point for path tracing: acquires tiles from the task one by
+     * one, renders each with the progressive or the offline routine, and releases
+     * it again. Stops early when the task pool is cancelled and the task does not
+     * ask for its queue to be finished.
+     */
+    void thread_path_trace(DeviceTask& task)
+    {
+        printf("thread_path_trace start\n");
+
+        if (task_pool.canceled() && !task.need_finish_queue)
+            return;
+
+        RenderTile tile;
+
+        while (task.acquire_tile(this, tile))
+        {
+            printf("task.acquire_tile\n");
+
+            int offset, stride;
+            tile.buffers->params.get_offset_stride(offset, stride);
+
+            /* Progressive tiles take their geometry from the buffer parameters,
+             * all other tiles from the tile itself. */
+            const bool progressive = tile.progressive;
+
+            int tile_x = progressive ? tile.buffers->params.full_x : tile.x;
+            int tile_y = progressive ? tile.buffers->params.full_y : tile.y;
+            int tile_h = progressive ? tile.buffers->params.height : tile.h;
+            int tile_w = progressive ? tile.buffers->params.width : tile.w;
+
+            if (!progressive)
+            {
+                offset = tile.offset;
+                stride = tile.stride;
+            }
+
+            tile.sample = tile.start_sample + tile.num_samples;
+
+            /* Reset the progress counters read by the progress reporting. */
+            num_tiles = tile_h;
+            tile_id = 0;
+
+            if (progressive)
+            {
+                path_trace_progressive(omp_get_data_size(kernel_globals_cpu), (char*) tile.buffer, (char*) tile.rng_state, tile.start_sample, tile.num_samples, tile_x, tile_y, offset, stride, tile_h, tile_w, task, tile);
+            }
+            else
+            {
+                path_trace_offline(omp_get_data_size(kernel_globals_cpu), (char*) tile.buffer, (char*) tile.rng_state, tile.start_sample, tile.num_samples, tile_x, tile_y, offset, stride, tile_h, tile_w, task, tile);
+            }
+
+            task.release_tile(tile);
+
+            if (task_pool.canceled() && !task.need_finish_queue)
+                break;
+        }
+
+        printf("thread_path_trace finish\n");
+    }
+
+    /* Film conversion task handler; this device does nothing here. */
+    void thread_film_convert(DeviceTask& task)
+    {
+        /* intentionally empty */
+    }
+
+    /* Shader task handler; this device does nothing here. */
+    void thread_shader(DeviceTask& task)
+    {
+        /* intentionally empty */
+    }
+
+    /* Number of sub-tasks a task is split into; always 1 for this device. */
+    int get_split_task_count(DeviceTask& task)
+    {
+        const int split_count = 1;
+        return split_count;
+    }
+
+    /* Wrap the task in an OMPDeviceTask bound to this device and push it onto the pool. */
+    void task_add(DeviceTask& task)
+    {
+        OMPDeviceTask *queued_task = new OMPDeviceTask(this, task);
+        task_pool.push(queued_task);
+    }
+
+    /* Forward to the task pool's wait(). */
+    void task_wait()
+    {
+        this->task_pool.wait();
+    }
+
+    /* Forward to the task pool's cancel(). */
+    void task_cancel()
+    {
+        this->task_pool.cancel();
+    }
+
+    /*
+     * Extract one render pass from the interleaved pass buffer into a packed
+     * float array.
+     *
+     * type        pass to extract; the first pass in params.passes with this type is used.
+     * exposure    multiplier applied to passes whose `exposure` flag is set, and to
+     *             lighting passes before their color pass is divided out.
+     * sample      sample count; passes whose `filter` flag is set are scaled by 1 / sample.
+     * components  number of floats written per pixel (1, 3 or 4).
+     * pixels      destination, params.width * params.height * components floats.
+     * params      buffer parameters providing the pass list and dimensions.
+     * buffer      source buffer with params.get_passes_size() floats per pixel.
+     *
+     * Returns true when a pass of the requested type exists, false otherwise.
+     *
+     * All per-pixel offsets are computed in size_t: the loop index stays `int` for
+     * the OpenMP loops, but `i * pass_stride` in plain int can overflow on large
+     * buffers.
+     */
+    bool get_pass_rect(PassType &type, float exposure, int sample, int components, float *pixels, BufferParams &params, float* buffer)
+    {
+        int pass_offset = 0;
+
+        foreach(Pass& pass, params.passes)
+        {
+            if (pass.type != type)
+            {
+                pass_offset += pass.components;
+                continue;
+            }
+
+            float *in = (float*) buffer + pass_offset;
+            const size_t pass_stride = params.get_passes_size();
+
+            float scale = (pass.filter) ? 1.0f / (float) sample : 1.0f;
+            float scale_exposure = (pass.exposure) ? scale * exposure : scale;
+
+            int size = params.width * params.height;
+
+            if (components == 1)
+            {
+                assert(pass.components == components);
+
+                /* scalar */
+                if (type == PASS_DEPTH)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        /* a stored depth of 0 is written out as a very large distance */
+                        float f = in[(size_t) i * pass_stride];
+                        pixels[i] = (f == 0.0f) ? 1e10f : f*scale_exposure;
+                    }
+                }
+                else if (type == PASS_MIST)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        float f = in[(size_t) i * pass_stride];
+                        pixels[i] = saturate(f * scale_exposure);
+                    }
+                }
+#ifdef WITH_CYCLES_DEBUG
+                else if (type == PASS_BVH_TRAVERSAL_STEPS)
+                {
+                    for (int i = 0; i < size; i++, in += pass_stride, pixels++)
+                    {
+                        float f = *in;
+                        pixels[0] = f;
+                    }
+                }
+                else if (type == PASS_RAY_BOUNCES)
+                {
+                    for (int i = 0; i < size; i++, in += pass_stride, pixels++)
+                    {
+                        float f = *in;
+                        pixels[0] = f;
+                    }
+                }
+#endif
+                else
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        float f = in[(size_t) i * pass_stride];
+                        pixels[i] = f*scale_exposure;
+                    }
+                }
+            }
+            else if (components == 3)
+            {
+                assert(pass.components == 4);
+
+                /* RGBA */
+                if (type == PASS_SHADOW)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 3;
+
+                        float4 f = make_float4(src[0], src[1], src[2], src[3]);
+                        /* normalize by the weight stored in the fourth component */
+                        float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
+
+                        dst[0] = f.x*invw;
+                        dst[1] = f.y*invw;
+                        dst[2] = f.z*invw;
+                    }
+                }
+                else if (pass.divide_type != PASS_NONE)
+                {
+                    /* RGB lighting passes that need to divide out color */
+                    pass_offset = 0;
+
+                    foreach(Pass& color_pass, params.passes)
+                    {
+                        if (color_pass.type == pass.divide_type)
+                            break;
+                        pass_offset += color_pass.components;
+                    }
+
+                    float *in_divide = (float*) buffer + pass_offset;
+
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        const float *src_divide = in_divide + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 3;
+
+                        float3 f = make_float3(src[0], src[1], src[2]);
+                        float3 f_divide = make_float3(src_divide[0], src_divide[1], src_divide[2]);
+
+                        f = safe_divide_even_color(f*exposure, f_divide);
+
+                        dst[0] = f.x;
+                        dst[1] = f.y;
+                        dst[2] = f.z;
+                    }
+                }
+                else
+                {
+                    /* RGB/vector */
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 3;
+
+                        float3 f = make_float3(src[0], src[1], src[2]);
+
+                        dst[0] = f.x*scale_exposure;
+                        dst[1] = f.y*scale_exposure;
+                        dst[2] = f.z*scale_exposure;
+                    }
+                }
+            }
+            else if (components == 4)
+            {
+                assert(pass.components == components);
+
+                /* RGBA */
+                if (type == PASS_SHADOW)
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 4;
+
+                        float4 f = make_float4(src[0], src[1], src[2], src[3]);
+                        float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
+
+                        dst[0] = f.x*invw;
+                        dst[1] = f.y*invw;
+                        dst[2] = f.z*invw;
+                        dst[3] = 1.0f;
+                    }
+                }
+                else if (type == PASS_MOTION)
+                {
+                    /* need to normalize by number of samples accumulated for motion */
+                    pass_offset = 0;
+
+                    foreach(Pass& color_pass, params.passes)
+                    {
+                        if (color_pass.type == PASS_MOTION_WEIGHT)
+                            break;
+                        pass_offset += color_pass.components;
+                    }
+
+                    float *in_weight = (float*) buffer + pass_offset;
+
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 4;
+
+                        float4 f = make_float4(src[0], src[1], src[2], src[3]);
+                        float w = in_weight[(size_t) i * pass_stride];
+                        float invw = (w > 0.0f) ? 1.0f / w : 0.0f;
+
+                        dst[0] = f.x*invw;
+                        dst[1] = f.y*invw;
+                        dst[2] = f.z*invw;
+                        dst[3] = f.w*invw;
+                    }
+                }
+                else
+                {
+#pragma omp parallel for
+                    for (int i = 0; i < size; i++)
+                    {
+                        const float *src = in + (size_t) i * pass_stride;
+                        float *dst = pixels + (size_t) i * 4;
+
+                        float4 f = make_float4(src[0], src[1], src[2], src[3]);
+
+                        dst[0] = f.x*scale_exposure;
+                        dst[1] = f.y*scale_exposure;
+                        dst[2] = f.z*scale_exposure;
+
+                        /* clamp since alpha might be > 1.0 due to russian roulette */
+                        /* NOTE(review): alpha is clamped without applying `scale`, unlike
+                         * the color channels - confirm this is intended. */
+                        dst[3] = saturate(f.w);
+                    }
+                }
+            }
+
+            return true;
+        }
+        return false;
+    }
+
+};
+
+/* Factory: allocate a new OMPDevice for the given device info, stats and background flag. */
+Device *device_omp_create(DeviceInfo& info, Stats &stats, bool background)
+{
+    Device *omp_device = new OMPDevice(info, stats, background);
+    return omp_device;
+}
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+
+/*
+ * Query the MIC devices by running `micinfo` and keeping its "Device Name"
+ * lines. Returns the captured output (at most 4095 bytes). When nothing was
+ * captured, returns "fakeMIC" on Windows and an empty string elsewhere. The
+ * command is not run on Windows or macOS.
+ */
+string micFindDevices()
+{
+#if !defined(_WIN32) && !defined(__APPLE__)
+    if (FILE *pipe = popen("micinfo -group Versions | grep 'Device Name'", "r"))
+    {
+        char output[4096] = {0};
+        /* leave room for the terminating NUL */
+        const int bytes_read = fread(output, 1, sizeof (output) - 1, pipe);
+        output[bytes_read] = '\0';
+        pclose(pipe);
+
+        if (output[0] != '\0')
+            return string(output);
+    }
+#endif
+
+#if defined(_WIN32)
+    return "fakeMIC";
+#else
+    return "";
+#endif
+}
+
+#endif
+
+/* Initialization hook for the OMP device; there is nothing to set up, so it always reports success. */
+bool device_omp_init(void)
+{
+    const bool initialized = true;
+    return initialized;
+}
+
+/* Build a DEVICE_OMP DeviceInfo with the flags shared by every OMP entry. */
+static DeviceInfo omp_make_device_info(const string& description, const string& id, int num)
+{
+    DeviceInfo entry;
+
+    entry.type = DEVICE_OMP;
+    entry.description = description;
+    entry.id = id;
+    entry.num = num;
+    entry.advanced_shading = true;
+    entry.pack_images = false;
+
+    return entry;
+}
+
+/*
+ * Prepend the OMP device entries to `devices`.
+ *
+ * A plain "OMP_CPU" entry is always added. When MIC offload is compiled in and
+ * micFindDevices() reports devices, two combined entries are inserted in front
+ * of it: "OMP_CPU_MIC0" (CPU plus the first MIC) and "OMP_CPU_MICS" (CPU plus
+ * every MIC), ending up in the order OMP_CPU_MICS, OMP_CPU_MIC0, OMP_CPU.
+ */
+void device_omp_info(vector<DeviceInfo>& devices)
+{
+    devices.insert(devices.begin(), omp_make_device_info("CPU", "OMP_CPU", -1));
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    const string mic_listing = micFindDevices();
+
+    if (!mic_listing.empty())
+    {
+        DeviceInfo cpu_all_mics = omp_make_device_info("CPU", "OMP_CPU_MICS", -1);
+        DeviceInfo cpu_first_mic = omp_make_device_info("CPU", "OMP_CPU_MIC0", -1);
+
+        /* one reported device per output line */
+        std::vector<std::string> listing_lines;
+        boost::split(listing_lines, mic_listing, boost::is_any_of("\n"));
+
+        int mic_index = 0;
+        int accepted_devices = 0;
+
+        for (size_t line_idx = 0; line_idx < listing_lines.size(); line_idx++)
+        {
+            const string line = listing_lines[line_idx];
+
+            std::vector<std::string> fields;
+            boost::split(fields, line, boost::is_any_of(":"));
+
+            if (fields.empty())
+                continue;
+
+            /* the device name is whatever follows the last ':' */
+            string name = fields.back();
+            boost::trim(name);
+
+            if (name.empty())
+                continue;
+
+            DeviceInfo mic = omp_make_device_info(line, string_printf("OMP_MIC_%d", mic_index), mic_index);
+            mic_index++;
+
+            cpu_all_mics.description = string_printf("%s+%s", cpu_all_mics.description.c_str(), mic.description.c_str());
+            cpu_all_mics.multi_devices.push_back(mic);
+
+            /* only the first accepted device joins the CPU+MIC0 entry */
+            if (accepted_devices == 0)
+            {
+                cpu_first_mic.description = string_printf("%s+%s", cpu_first_mic.description.c_str(), mic.description.c_str());
+                cpu_first_mic.multi_devices.push_back(mic);
+            }
+
+            accepted_devices++;
+        }
+
+        devices.insert(devices.begin(), cpu_first_mic);
+        devices.insert(devices.begin(), cpu_all_mics);
+    }
+#endif
+}
+
+/* Capability report for the OMP device; always an empty string. */
+string device_omp_capabilities(void)
+{
+    return string();
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 1b4e5421b5ae7d0f9fc808c9628a2ff8f4fec5d8..4aa0c5de4ec6b5a15d810dcd18f17713b447edf1 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -39,6 +39,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+//#define VLOG(a) std::cout
+
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
 /* Macro declarations used with split kernel */
@@ -118,8 +120,8 @@ bool opencl_kernel_use_advanced_shading(const string& platform)
 		return true;
 	else if(platform == "AMD Accelerated Parallel Processing")
 		return true;
-	else if(platform == "Intel(R) OpenCL")
-		return true;
+	//else if(platform == "Intel(R) OpenCL")
+	//	return true;
 	/* Make sure officially unsupported OpenCL platforms
 	 * does not set up to use advanced shading.
 	 */
@@ -165,6 +167,10 @@ bool opencl_device_supported(const string& platform_name,
 	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
 		return true;
 	}
+        if(platform_name == "Intel(R) OpenCL") {
+                return true;
+        }
+            
 	return false;
 }
 
@@ -1056,7 +1062,7 @@ public:
 
 		ConstMemMap::iterator mt;
 		for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
-			mem_free(*(mt->second));
+			mem_free("second", *(mt->second));
 			delete mt->second;
 		}
 
@@ -1076,7 +1082,7 @@ public:
 			clReleaseContext(cxContext);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		size_t size = mem.memory_size();
 
@@ -1111,7 +1117,7 @@ public:
 		mem.device_size = size;
 	}
 
-	void mem_copy_to(device_memory& mem)
+	void mem_copy_to(const char *name, device_memory& mem)
 	{
 		/* this is blocking */
 		size_t size = mem.memory_size();
@@ -1127,7 +1133,7 @@ public:
 		}
 	}
 
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+	void mem_copy_from(const char *name, device_memory& mem, int y, int w, int h, int elem)
 	{
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
@@ -1142,15 +1148,15 @@ public:
 		                                  NULL, NULL));
 	}
 
-	void mem_zero(device_memory& mem)
+	void mem_zero(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			memset((void*)mem.data_pointer, 0, mem.memory_size());
-			mem_copy_to(mem);
+			mem_copy_to(name, mem);
 		}
 	}
 
-	void mem_free(device_memory& mem)
+	void mem_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			if(mem.device_pointer != null_mem) {
@@ -1171,7 +1177,7 @@ public:
 			device_vector<uchar> *data = new device_vector<uchar>();
 			data->copy((uchar*)host, size);
 
-			mem_alloc(*data, MEM_READ_ONLY);
+			mem_alloc(name, *data, MEM_READ_ONLY);
 			i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
 		}
 		else {
@@ -1179,7 +1185,7 @@ public:
 			data->copy((uchar*)host, size);
 		}
 
-		mem_copy_to(*i->second);
+		mem_copy_to(name, *i->second);
 	}
 
 	void tex_alloc(const char *name,
@@ -1188,13 +1194,13 @@ public:
 	               ExtensionType /*extension*/)
 	{
 		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
-		mem_alloc(mem, MEM_READ_ONLY);
-		mem_copy_to(mem);
+		mem_alloc(name, mem, MEM_READ_ONLY);
+		mem_copy_to(name, mem);
 		assert(mem_map.find(name) == mem_map.end());
 		mem_map.insert(MemMap::value_type(name, mem.device_pointer));
 	}
 
-	void tex_free(device_memory& mem)
+	void tex_free(const char *name, device_memory& mem)
 	{
 		if(mem.device_pointer) {
 			foreach(const MemMap::value_type& value, mem_map) {
@@ -1204,7 +1210,7 @@ public:
 				}
 			}
 
-			mem_free(mem);
+			mem_free(name, mem);
 		}
 	}
 
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 1f1128a28f858fdd23326de3ba459c6350328a6e..82d12175c9e686271b85abd43fda9caf44a004ed 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -21,6 +21,7 @@
 
 #include "util_algorithm.h"
 #include "util_time.h"
+#include "buffers.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -106,7 +107,7 @@ void DeviceTask::update_progress(RenderTile *rtile)
 		return;
 
 	if(update_progress_sample)
-		update_progress_sample();
+		update_progress_sample(rtile->sample);
 
 	if(update_tile_sample) {
 		double current_time = time_dt();
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index d7912f386f5c165d0badfe9d4fcbb8b7a1d67307..242635450a975b631e0f95146eecfa825ec50484 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -59,7 +59,7 @@ public:
 	void update_progress(RenderTile *rtile);
 
 	function<bool(Device *device, RenderTile&)> acquire_tile;
-	function<void(void)> update_progress_sample;
+	function<void(int)> update_progress_sample;
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
 	function<bool(void)> get_cancel;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3c17429fd092b59fb72ded4cbead600ab284df73..d505e3da582fdcc010a350e6aff67a57c155001b 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -271,6 +271,21 @@ if(WITH_CYCLES_OSL)
 	add_subdirectory(shaders)
 endif()
 
+# IT4I: MPI kernel module
+if(WITH_IT4I_MPI)
+	add_subdirectory(kernels/mpi)
+endif()
+
+# IT4I: OpenMP kernel module
+if(WITH_OPENMP)
+	add_subdirectory(kernels/omp)
+endif()
+
+# IT4I: MIC offload kernel module
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_subdirectory(kernels/mic)
+endif()
+
 # CPU module
 
 include_directories(${INC})
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 49f6122f3f4e70980a9d56b5d16e38ff87581fa4..7e390d3852251a8e83c706b8c175620161e37e39 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -43,6 +43,7 @@ typedef struct KernelGlobals {
 #include "kernel_textures.h"
 
 	KernelData __data;
+	size_t __data_size;
 
 #ifdef __OSL__
 	/* On the CPU, we also have the OSL globals here. Most data structures are shared
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index bdd17c66c0f1fef54a5a9fde8fab48f7980bef32..88fad12cc241ff4115bcbd376a8aecdade1a3571 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -130,9 +130,16 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
 #  define __CL_USE_NATIVE__
 #  define __KERNEL_SHADING__
-#  define __KERNEL_ADV_SHADING__
+#  define __MULTI_CLOSURE__
+#  define __PASSES__
+#  define __BACKGROUND_MIS__
+#  define __LAMP_MIS__
+#  define __AO__
+#  define __CAMERA_MOTION__
+#  define __OBJECT_MOTION__
+#  define __HAIR__
 #  ifdef __KERNEL_EXPERIMENTAL__
-#    define __CMJ__
+#    define __TRANSPARENT_SHADOWS__
 #  endif
 #endif
 
diff --git a/intern/cycles/kernel/kernels/mic/CMakeLists.txt b/intern/cycles/kernel/kernels/mic/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5076784175293e79da50abaf3e91ce3c788e3ef3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mic/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Cycles kernel library for Intel MIC offload (IT4I).
+set(INC
+	.
+	../../../kernel
+	../../../util
+	../../../kernel/osl
+	../../../../../it4i/client/api
+)
+
+set(SRC
+	kernel_mic.cpp
+)
+
+set(SRC_HEADERS
+	kernel_compat_mic.h
+	kernel_mic.h
+)
+
+if(WITH_IT4I_MIC_NATIVE)
+	add_definitions(-DWITH_IT4I_MIC_NATIVE)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+
+	# Per-file flags for kernel_mic.cpp: OpenMP plus the MIC offload target attribute.
+	# Alternatives noted by the author: "-ip -fp-model fast=2", "-openmp -offload=none".
+	set_source_files_properties(kernel_mic.cpp PROPERTIES COMPILE_FLAGS "-qopenmp -qoffload-attribute-target=mic")
+endif()
+
+include_directories(${INC})
+add_library(cycles_kernel_mic ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h b/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b1e200cdf744c766ba132dc62b0c49621116c99
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mic/kernel_compat_mic.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_COMPAT_MIC_H__
+#define __KERNEL_COMPAT_MIC_H__
+
+#define __KERNEL_CPU__
+#define __KERNEL_MIC__
+
+/* Release kernel has too much false-positive maybe-uninitialized warnings,
+ * which makes it possible to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+//#include "util_debug.h"
+#include "util_math.h"
+//#include "util_simd.h"
+#include "util_half.h"
+#include "util_types.h"
+
+#define ccl_addr_space
+
+/* Workaround for glibc older than 2.16 on x86_64, where expf is much slower
+ * than the double-precision exp: route expf through exp on those systems.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+    defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+    (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Kernel assertions are compiled out in the MIC kernel: kernel_assert()
+ * expands to nothing, so the checked condition is never evaluated. */
+
+#define kernel_assert(cond) //assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays and after inlining fetch hopefully revert to being a simple
+ * pointer lookup. */
+
+template<typename T> struct texture  {
+	ccl_always_inline T fetch(int index)  /* element `index` of data */
+	{
+		kernel_assert(index >= 0 && index < width);
+		return data[index];
+	}
+
+#ifdef __KERNEL_SSE2__
+	ccl_always_inline ssef fetch_ssef(int index)  /* data viewed as an ssef array */
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssef*)data)[index];
+	}
+
+	ccl_always_inline ssei fetch_ssei(int index)  /* data viewed as an ssei array */
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssei*)data)[index];
+	}
+#endif
+
+	T *data;    /* nothing in this struct allocates or frees it */
+	int width;  /* upper bound used by the kernel_assert bounds checks */
+};
+
+/* Image texture with software sampling: interp() samples in 2D, interp_3d*() in
+ * 3D, honouring the `interpolation` mode and the `extension` (edge) mode. */
+template<typename T> struct texture_image  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+	ccl_always_inline float4 read(float4 r)  /* float texels are returned unchanged */
+	{
+		return r;
+	}
+
+	ccl_always_inline float4 read(uchar4 r)  /* byte texels are scaled by 1/255 per channel */
+	{
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+
+	ccl_always_inline int wrap_periodic(int x, int width)  /* x % width, shifted up by width when negative */
+	{
+		x %= width;
+		if(x < 0)
+			x += width;
+		return x;
+	}
+
+	ccl_always_inline int wrap_clamp(int x, int width)  /* clamp x to [0, width-1] */
+	{
+		return clamp(x, 0, width-1);
+	}
+
+	ccl_always_inline float frac(float x, int *ix)  /* *ix = float_to_int(x), minus 1 if x < 0; returns x - *ix */
+	{
+		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
+		*ix = i;
+		return x - (float)i;
+	}
+
+	/* Sample the image at normalized coordinates (x, y). Returns an all-zero
+	 * float4 when data is NULL, when CLIP rejects the coordinate, or when
+	 * `extension` is not a known mode (kernel_assert is compiled out here). */
+	ccl_always_inline float4 interp(float x, float y)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, nix, niy;
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+			return read(data[ix + iy*width]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
+			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
+			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
+			r += ty*tx*read(data[nix + niy*width]);
+			return r;
+		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Helper macros keep the code compact; the compiler is expected to
+			 * inline the resulting multiply-adds. */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+			(v[col] * (u[0] * DATA(0, col) + \
+			           u[1] * DATA(1, col) + \
+			           u[2] * DATA(2, col) + \
+			           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline float4 interp_3d(float x, float y, float z)  /* uses the stored interpolation mode */
+	{
+		return interp_3d_ex(x, y, z, interpolation);
+	}
+
+	/* 3D variant of interp() with an explicit interpolation mode, which shadows
+	 * the member of the same name. Returns an all-zero float4 for NULL data, a
+	 * CLIP-rejected coordinate, or an unknown extension mode. */
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, iz, nix, niy, niz;
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			frac(z*(float)depth, &iz);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+
+			return read(data[ix + iy*width + iz*width*height]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			float tz = frac(z*(float)depth - 0.5f, &iz);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+			float4 r;
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+			return r;
+		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+					piz = wrap_periodic(iz-1, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					nniz = wrap_periodic(iz+2, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+					piz = wrap_clamp(iz-1, depth);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+					nniz = wrap_clamp(iz+2, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Helper macros keep the code compact; the compiler is expected to
+			 * inline the resulting multiply-adds. */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)  /* stores the three dimensions */
+	{
+		width = width_;
+		height = height_;
+		depth = depth_;
+	}
+
+	T *data;  /* texel array; NULL makes interp() and interp_3d_ex() return zeros */
+	int interpolation;  /* INTERPOLATION_CLOSEST or INTERPOLATION_LINEAR; any other value selects the cubic path */
+	ExtensionType extension;  /* edge handling: EXTENSION_REPEAT, EXTENSION_CLIP or EXTENSION_EXTEND */
+	int width, height, depth;  /* set together by dimensions_set() */
+#undef SET_CUBIC_SPLINE_WEIGHTS
+};
+
+typedef texture<float4> texture_float4;
+typedef texture<float2> texture_float2;
+typedef texture<float> texture_float;
+typedef texture<uint> texture_uint;
+typedef texture<int> texture_int;
+typedef texture<uint4> texture_uint4;
+typedef texture<uchar4> texture_uchar4;
+typedef texture_image<float4> texture_image_float4;
+typedef texture_image<uchar4> texture_image_uchar4;
+
+/* Accessors used by the kernel code; all of them expect a KernelGlobals pointer named `kg` in scope.
+ * NOTE(review): kernel_tex_lookup calls lookup(), which the texture template in this file does not declare. */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
+#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
+#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
+#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
+#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
+
+#define kernel_data (kg->__data)
+
+#ifdef __KERNEL_SSE2__
+/* NOTE(review): util_simd.h is commented out in this file's includes; confirm sseb/ssef/ssei are declared here. */
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+/* Print helpers: each forwards the x, y and z members to the matching print_sse* call with the same label. */
+ccl_device_inline void print_sse3b(const char *label, sse3b& a)
+{
+	print_sseb(label, a.x);
+	print_sseb(label, a.y);
+	print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f& a)
+{
+	print_ssef(label, a.x);
+	print_ssef(label, a.y);
+	print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i& a)
+{
+	print_ssei(label, a.x);
+	print_ssei(label, a.y);
+	print_ssei(label, a.z);
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_COMPAT_MIC_H__ */
+
diff --git a/intern/cycles/kernel/kernels/mic/kernel_mic.cpp b/intern/cycles/kernel/kernels/mic/kernel_mic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7261df09c98a94f4ac8e1299c19009a1b31ebfd7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mic/kernel_mic.cpp
@@ -0,0 +1,516 @@
+#include "kernel_mic.h"
+
+#include "kernel_compat_mic.h"
+#include "kernel.h"
+
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+#include <omp.h>
+
+/* Not referenced anywhere in the visible part of this file. */
+#define SIZE_T long
+
+/* Modifier shorthands for the data clauses of the "#pragma offload" directives below. */
+#define ALLOC alloc_if(1) free_if(0)
+#define FREE alloc_if(0) free_if(1)
+#define REUSE alloc_if(0) free_if(0)
+#define ONE_USE /* expands to nothing; alloc_if(1) free_if(1) is left disabled */
+
+CCL_NAMESPACE_BEGIN
+/* Assertion-failure reporter: prints message, file and line to stdout and returns (it does not abort). */
+void cwassert(const char * _Message, const char *_File, unsigned _Line)
+{
+    printf("ASSERT: %s, %s, %u\n", _Message, _File, _Line); /* %u because _Line is unsigned */
+}
+
+/* Copy `size` bytes of constant kernel data from host_bin into kg->__data and record the size. */
+void mic_const_copy_internal(DEVICE_PTR kg_bin, char *host_bin, size_t size)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    memcpy(&kg->__data, host_bin, size); /* NOTE(review): size is not bounded by sizeof(kg->__data) */
+    kg->__data_size = size;
+}
+
+/* Upload the "__data" constant block into the KernelGlobals behind kg_bin.
+ * numDevice == -1 copies on the host; any other value offloads the copy to
+ * that MIC device (compiled only when WITH_IT4I_MIC_OFFLOAD is defined). */
+void mic_const_copy(int numDevice, DEVICE_PTR kg_bin, const char *name, char *host_bin, size_t size)
+{
+    if (strcmp(name, "__data") == 0)
+    {
+        if (numDevice != -1)
+        {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) in(host_bin:length(size) ONE_USE) in(kg_bin) in(size)
+            {
+                mic_const_copy_internal(kg_bin, host_bin, size);
+            }
+#endif
+        }
+        else
+        {
+            mic_const_copy_internal(kg_bin, host_bin, size);
+        }
+    } /* any other name is silently ignored */
+}
+
+/* Bind the memory `mem` to the texture slot called `name` inside the
+ * KernelGlobals behind kg_bin. Names that match no slot are ignored; `size`
+ * is not used here. Image slot indices are parsed from the name's suffix. */
+void mic_tex_copy_internal(DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    /* Empty head so each KERNEL_TEX expansion (presumably one per texture in kernel_textures.h) can start with "else if". */
+    if (0)
+    {
+    }
+#define KERNEL_TEX(type, ttype, tname) \
+                else if(strcmp(name, #tname) == 0) { \
+                kg->tname.data = (type*)mem; \
+                kg->tname.width = width; \
+                }
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+    /* NOTE(review): strstr matches the prefix anywhere in name, and atoi assumes "_<id>" follows it. */
+    else if (strstr(name, "__tex_image_float"))
+    {
+        texture_image_float4 *tex = NULL;
+        int id = atoi(name + strlen("__tex_image_float_"));
+        int array_index = id;
+
+        if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES)
+        {
+            tex = &kg->texture_float_images[array_index];
+        }
+        if (tex)
+        {
+            tex->data = (float4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+    else if (strstr(name, "__tex_image"))
+    {
+        texture_image_uchar4 *tex = NULL;
+        int id = atoi(name + strlen("__tex_image_"));
+        int array_index = id - MAX_FLOAT_IMAGES;
+
+        if (array_index >= 0 && array_index < MAX_BYTE_IMAGES)
+        {
+            tex = &kg->texture_byte_images[array_index];
+        }
+        if (tex)
+        {
+            tex->data = (uchar4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+}
+
+/* Register texture `name` with the KernelGlobals behind kg_bin. On the host
+ * path (numDevice == -1) `mem` is bound directly; on the offload path the data
+ * is first copied into a buffer allocated with new[] inside the offload region. */
+void mic_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name_bin,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    if (name_bin == NULL || mem == NULL)
+        return;
+
+    size_t nameSize = sizeof (char) * (strlen(name_bin) + 1);
+    char *name = (char *) name_bin;
+
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) \
+            in(mem:length(size) ONE_USE) \
+            in(name:length(nameSize) ONE_USE) \
+            in(kg_bin) in(size) in(width) in(height) in(depth) in(interpolation) in(extension)
+        {
+            /* NOTE(review): mem2 is not released in this function; presumably
+             * mic_tex_free() is responsible for it. Confirm. */
+            char* mem2 = new char[size];
+            memcpy(mem2, mem, size);
+            mic_tex_copy_internal(kg_bin, name, mem2, size, width, height, depth, interpolation, extension);
+        }
+#endif
+    }
+    else
+    {
+        mic_tex_copy_internal(kg_bin, name, mem, size, width, height, depth, interpolation, extension);
+    }
+}
+
+/* Wait for the offload that was started with signal(signal_value) on MIC device
+ * numDevice. Does nothing for numDevice == -1 or without WITH_IT4I_MIC_OFFLOAD. */
+void mic_wait(int numDevice, int signal_value)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload_wait target(mic:numDevice) wait(signal_value)
+#endif
+    }
+}
+
+/* Convert one pixel of the float render buffer to 8-bit RGBA: film_map() with
+ * sample_scale, then film_float_to_byte(), written to rgba_byte[index]. */
+void mic_film_convert_byte(KernelGlobals *kg,
+        uchar4 *rgba_byte, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    /* buffer offset */
+    int index = offset + x + y*stride;
+    rgba_byte += index;
+    buffer += index * kernel_data.film.pass_stride;
+
+    /* map colors */
+    float4 irradiance = *((ccl_global float4*) buffer);
+    float4 float_result = film_map(kg, irradiance, sample_scale);
+    uchar4 byte_result = film_float_to_byte(float_result);
+
+    *rgba_byte = byte_result;
+}
+
+/* Convert one pixel of the float render buffer to four half floats. `rgba` is
+ * reinterpreted as a half array; x, y, z are multiplied by the film exposure first. */
+void mic_convert_to_half_float(KernelGlobals *kg,
+        uchar4 *rgba, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    /* buffer offset */
+    int index = offset + x + y*stride;
+
+    float4 *in = (float4*) (buffer + index * kernel_data.film.pass_stride);
+    half *out = (half*) rgba + index * 4;
+
+    float exposure = kernel_data.film.exposure;
+    float4 rgba_in = *in;
+    if (exposure != 1.0f)
+    {
+        rgba_in.x *= exposure;
+        rgba_in.y *= exposure;
+        rgba_in.z *= exposure;
+    }
+
+    float4_store_half(out, rgba_in, sample_scale);
+}
+
+/* Render samples [start_sample, end_sample) for every pixel of the tile_w x tile_h
+ * tile at (tile_x, tile_y), parallelised over pixels with OpenMP. When rgba_byte_bin
+ * is non-NULL the display pixel is converted again after every sample. */
+void mic_path_trace_internal(DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu)
+{
+    int size = tile_h*tile_w;
+
+    int *sample_finished = (int*) sample_finished_mic;
+    int *reqFinished = (int*) reqFinished_mic; /* currently unused: the loop has no cancel check */
+
+    *sample_finished = start_sample;
+
+#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1)
+    for (int i = 0; i < size; i++)
+    {
+        /* i is a row-major pixel index inside the tile */
+        int y = i / tile_w;
+        int x = i - y * tile_w;
+
+        for (int sample = start_sample; sample < end_sample; sample++)
+        {
+            kernel_path_trace((KernelGlobals *) kg_bin, (float *) buffer_bin, (unsigned int*) rng_state_bin, sample, x + tile_x, y + tile_y, offset, stride);
+
+            if (rgba_byte_bin != NULL)
+            {
+                float sample_scale = 1.0f / (sample + 1.0f);
+
+                if (is_rgba_float)
+                    mic_convert_to_half_float((KernelGlobals *) kg_bin, (uchar4*) rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride);
+                else
+                    mic_film_convert_byte((KernelGlobals *) kg_bin, (uchar4*) rgba_byte_bin, (float *) buffer_bin, sample_scale, x + tile_x, y + tile_y, offset, stride);
+            }
+        }
+    }
+
+    /* progress is only published once the whole tile is done */
+    *sample_finished = end_sample;
+}
+
+/* Render a tile on MIC device numDevice, or on the host when numDevice == -1.
+ * The offloads are started with signal(signal_value); see mic_wait() for the
+ * matching wait. Buffers are passed with length(0) REUSE, so they are expected
+ * to exist on the device already (presumably created by mic_mem_alloc()). */
+void mic_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu, int signal_value)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        /* Two variants that differ only in whether rgba_byte_bin appears in a
+         * data clause (presumably because a NULL pointer cannot be listed). */
+        if (rgba_byte_bin == NULL)
+        {
+#pragma offload target(mic:numDevice) \
+            in(buffer_bin : length(0) REUSE) \
+            in(rng_state_bin : length(0) REUSE) \
+            inout(sample_finished_mic : length(sizeof(int)) REUSE) \
+            in(reqFinished_mic : length(0) REUSE) \
+            in(kg_bin) in(start_sample) in(end_sample) in(tile_x) in(tile_y) in(offset) in(stride) in(tile_h) in(tile_w) in(nprocs_cpu) \
+            signal(signal_value)
+            {
+                mic_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, NULL, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu);
+            }
+        }
+        else
+        {
+#pragma offload target(mic:numDevice) \
+            in(buffer_bin : length(0) REUSE) \
+            in(rng_state_bin : length(0) REUSE) \
+            inout(sample_finished_mic : length(sizeof(int)) REUSE) \
+            in(reqFinished_mic : length(0) REUSE) \
+            in(rgba_byte_bin : length(0) REUSE) \
+            in(kg_bin) in(start_sample) in(end_sample) in(tile_x) in(tile_y) in(offset) in(stride) in(tile_h) in(tile_w) in(nprocs_cpu) \
+            signal(signal_value)
+            {
+                mic_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu);
+            }
+        }
+#endif
+    }
+    else
+    {
+        /* host path: a plain function call; signal_value is not used */
+        mic_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_mic, reqFinished_mic, nprocs_cpu);
+    }
+}
+
+/* Allocate a KernelGlobals on MIC device numDevice (or on the host for -1) and
+ * return its address as an opaque handle; 0 if no allocation path was compiled in. */
+DEVICE_PTR mic_alloc_kg(int numDevice)
+{
+    DEVICE_PTR kg_bin = 0; /* was returned uninitialized when offload support is compiled out */
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) out(kg_bin)
+        {
+            KernelGlobals *kg = new KernelGlobals();
+            kg_bin = (DEVICE_PTR) kg;
+        }
+#endif
+    }
+    else
+    {
+        KernelGlobals *kg = new KernelGlobals();
+        kg_bin = (DEVICE_PTR) kg;
+    }
+    return (DEVICE_PTR) kg_bin;
+}
+
+void mic_free_kg(int numDevice, DEVICE_PTR kg_bin) /* deletes the KernelGlobals behind kg_bin on the device, or on the host for -1 */
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) in(kg_bin)
+        {
+            KernelGlobals *kg = (KernelGlobals *) kg_bin;
+            delete kg;
+        }
+#endif
+    }
+    else
+    {
+        KernelGlobals *kg = (KernelGlobals *) kg_bin;
+        delete kg;
+    }
+}
+
+/* Allocate memSize bytes for mem on the device and upload its contents (ALLOC keeps the buffer); no-op for -1. */
+void mic_mem_alloc(int numDevice, char *mem, size_t memSize)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) in(mem:length(memSize) ALLOC)
+        {
+        }
+#endif
+    }
+}
+
+void mic_mem_copy_to(int numDevice, char *mem, size_t memSize, char* signal_value)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        if (signal_value == NULL)
+        {
+#pragma offload target(mic:numDevice) in(mem:length(memSize) REUSE)
+            {
+
+            }
+        }
+        else
+        {
+#pragma offload_transfer target(mic:numDevice) in(mem:length(memSize) REUSE) signal(signal_value)                
+        }
+#endif
+    }
+}
+
+void mic_mem_copy_from(int numDevice, char *mem, size_t offset, size_t memSize, char* signal_value)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        if (signal_value == NULL)
+        {
+#pragma offload target(mic:numDevice) out(mem[offset:memSize]: REUSE)
+            {
+
+            }
+        }
+        else
+        {
+#pragma offload_transfer target(mic:numDevice) out(mem[offset:memSize]: REUSE) signal(signal_value)                
+        }
+#endif
+    }
+}
+
+void mic_mem_zero(int numDevice, char *mem, size_t memSize)
+{
+    memset(mem, 0, memSize);
+
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) in(mem:length(0) REUSE) in(memSize)
+        {
+            memset(mem, 0, memSize);
+        }
+#endif
+    }
+}
+
+void mic_mem_free(int numDevice, char *mem, size_t memSize)
+{
+    if (numDevice != -1)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        //#pragma offload_transfer target(mic:numDevice) in(mem:length(0) FREE)
+#pragma offload target(mic:numDevice) in(mem:length(0) FREE)
+        {
+
+        }
+#endif
+    }
+}
+
+void mic_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name_bin, char *mem, size_t memSize)
+{
+    if (name_bin == NULL)
+        return;
+
+    size_t nameSize = sizeof (char) * (strlen(name_bin) + 1);
+    char *name = (char *) name_bin;
+
+    if (numDevice != -1)
+    {
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#pragma offload target(mic:numDevice) \
+            in(name:length(nameSize) ONE_USE)
+        {
+            KernelGlobals *kg = (KernelGlobals *) kg_bin;
+
+            if (0)
+            {
+            }
+#define KERNEL_TEX(type, ttype, tname) \
+                else if(strcmp(name, #tname) == 0) { \
+                    delete [] kg->tname.data; \
+                    kg->tname.data = NULL; \
+                    kg->tname.width = 0; \
+                }
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+
+            else if (strstr(name, "__tex_image_float"))
+            {
+                texture_image_float4 *tex = NULL;
+                int id = atoi(name + strlen("__tex_image_float_"));
+                int array_index = id;
+
+                if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES)
+                {
+                    tex = &kg->texture_float_images[array_index];
+                }
+
+                if (tex)
+                {
+                    delete [] tex->data;
+                    tex->data = NULL;
+                    tex->dimensions_set(0, 0, 0);
+                }
+            }
+            else if (strstr(name, "__tex_image"))
+            {
+                texture_image_uchar4 *tex = NULL;
+                int id = atoi(name + strlen("__tex_image_"));
+                int array_index = id - MAX_FLOAT_IMAGES;
+
+                if (array_index >= 0 && array_index < MAX_BYTE_IMAGES)
+                {
+                    tex = &kg->texture_byte_images[array_index];
+                }
+
+                if (tex)
+                {
+                    delete [] tex->data;
+                    tex->data = NULL;
+                    tex->dimensions_set(0, 0, 0);
+                }
+            }
+        }
+#endif
+    }
+}
+
+int mic_get_pass_stride(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data.film.pass_stride;
+}
+
+size_t mic_get_data_size(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data_size;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/mic/kernel_mic.h b/intern/cycles/kernel/kernels/mic/kernel_mic.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4b5f96cbbec84417c3085e2447c8cc2dd0e249d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mic/kernel_mic.h
@@ -0,0 +1,40 @@
+#ifndef __KERNEL_MIC_H__
+#define __KERNEL_MIC_H__
+
+#include "client_api.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+void mic_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_mic, char *reqFinished_mic, int nprocs_cpu, int signal_value);
+
+/* Device memory */
+DEVICE_PTR mic_alloc_kg(int numDevice);
+void mic_free_kg(int numDevice, DEVICE_PTR kg);
+
+void mic_mem_alloc(int numDevice, char* mem, size_t memSize);
+void mic_mem_copy_to(int numDevice, char* mem, size_t memSize, char* signal_value);
+void mic_mem_copy_from(int numDevice, char* mem, size_t offset, size_t memSize, char* signal_value);
+void mic_mem_zero(int numDevice, char* mem, size_t memSize);
+void mic_mem_free(int numDevice, char* mem, size_t memSize);
+void mic_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name, char* mem, size_t memSize);
+
+void mic_const_copy(int numDevice, DEVICE_PTR kg, const char *name, char *host, size_t size);
+void mic_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension);
+
+void mic_wait(int numDevice, int signal_value);
+int mic_get_pass_stride(DEVICE_PTR kg);
+size_t mic_get_data_size(DEVICE_PTR kg);
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_MIC_H__ */
+
diff --git a/intern/cycles/kernel/kernels/mpi/CMakeLists.txt b/intern/cycles/kernel/kernels/mpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..037ab14da12ab17228268282d550b1fb47398899
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/CMakeLists.txt
@@ -0,0 +1,21 @@
+
+set(INC
+	.
+	../../../kernel
+	../../../util
+	../../../kernel/osl
+	../../../../../it4i/client/api
+	${MPI_INCLUDE_DIR}
+)
+
+set(SRC
+	kernel_mpi.cpp
+)
+
+set(SRC_HEADERS
+	kernel_mpi.h
+)
+
+include_directories(${INC})
+add_library(cycles_kernel_mpi ${SRC} ${SRC_HEADERS})
+target_link_libraries (cycles_kernel_mpi ${MPI_LIB_FILE})
diff --git a/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp b/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5fbd820b83097d4a0906dd803386fea2c68e709
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/kernel_mpi.cpp
@@ -0,0 +1,177 @@
+#include "kernel_mpi.h"
+
+#include <string.h>
+#include <mpi.h>
+
+CCL_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+int getCountOfDevices()
+{
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+    /* One fewer than the number of ranks; presumably rank 0 only coordinates - TODO confirm. */
+    return world_size - 1;
+}
+
+void getMpiKernelData(mpi_kernel_struct *data, int mpi_tag)
+{
+    memset(data, 0, sizeof (mpi_kernel_struct));
+    data->mpi_tag = mpi_tag;
+}
+
+void sendMpiKernelData(mpi_kernel_struct *data)
+{
+    MPI_Bcast(data, sizeof (mpi_kernel_struct), MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+void mpi_const_copy(const char *name, char *host_bin, size_t size)
+{
+    if (strcmp(name, "__data") == 0)
+    {
+        /* Only the "__data" constant block is forwarded; any other name is silently ignored. */
+        mpi_kernel_struct data;
+        getMpiKernelData(&data, MPI_TAG_mpi_const_copy);
+
+        strcpy(data.mpi_const_copy_data.name, name);
+        data.mpi_const_copy_data.host = (DEVICE_PTR) host_bin;
+        data.mpi_const_copy_data.size = size;
+
+        sendMpiKernelData(&data);
+        /* Payload follows the header. NOTE(review): MPI_Bcast takes an int count, so size is assumed to fit in int - confirm. */
+        MPI_Bcast(host_bin, size, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+}
+
+void mpi_tex_copy(
+        const char *name,
+        DEVICE_PTR mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    /* Broadcasts the texture description from rank 0, followed by the size bytes stored at mem. */
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_tex_copy);
+    /* Bounded copy: getMpiKernelData() zero-fills data, so leaving the last byte untouched keeps the name NUL-terminated. */
+    strncpy(data.mpi_tex_copy_data.name, name, sizeof (data.mpi_tex_copy_data.name) - 1);
+    data.mpi_tex_copy_data.mem = mem;
+    data.mpi_tex_copy_data.size = size;
+    data.mpi_tex_copy_data.width = width;
+    data.mpi_tex_copy_data.height = height;
+    data.mpi_tex_copy_data.depth = depth;
+    data.mpi_tex_copy_data.interpolation = interpolation;
+    data.mpi_tex_copy_data.extension = extension;
+
+    sendMpiKernelData(&data);
+
+    MPI_Bcast((char*) mem, size, MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+
+void mpi_alloc_kg(bool enable_mics)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_alloc_kg);
+    
+//    data.enable_mics = enable_mics;
+    
+    sendMpiKernelData(&data);
+}
+
+void mpi_free_kg()
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_free_kg);
+    sendMpiKernelData(&data);
+}
+
+void mpi_mem_alloc(const char* name, DEVICE_PTR mem, size_t memSize)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_alloc);
+    /* Bounded copy: data was zero-filled above, so the name stays NUL-terminated even when truncated. */
+    strncpy(data.mpi_mem_data.name, name, sizeof (data.mpi_mem_data.name) - 1);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+
+    sendMpiKernelData(&data); /* header only; no payload is broadcast for an allocation request */
+}
+
+void mpi_mem_copy_to(DEVICE_PTR mem, size_t memSize, size_t offset)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_copy_to);
+    /* Header describing the transfer; offset travels in the header only. */
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+    data.mpi_mem_data.offset = offset;
+    /* NOTE(review): the payload is sent from mem itself, not mem + offset, and memSize is narrowed to MPI_Bcast's int count - confirm both. */
+    sendMpiKernelData(&data);
+    MPI_Bcast((char*) mem, memSize, MPI_BYTE, 0, MPI_COMM_WORLD);
+}
+
+void mpi_mem_zero(DEVICE_PTR mem, size_t memSize, size_t offset)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_zero);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+    data.mpi_mem_data.offset = offset;
+
+    sendMpiKernelData(&data);
+}
+
+void mpi_mem_free(DEVICE_PTR mem, size_t memSize)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_mem_free);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+
+    sendMpiKernelData(&data);
+}
+
+void mpi_tex_free(const char* name, DEVICE_PTR mem, size_t memSize) /* broadcasts a texture-free request from rank 0 */
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_tex_free);
+    /* Bounded copy: data was zero-filled above, so the name stays NUL-terminated even when truncated. */
+    strncpy(data.mpi_mem_data.name, name, sizeof (data.mpi_mem_data.name) - 1);
+    data.mpi_mem_data.mem = (DEVICE_PTR) mem;
+    data.mpi_mem_data.memSize = memSize;
+    sendMpiKernelData(&data);
+}
+/////////////////////////////////////////////////////////////////////
+
+void mpi_path_trace(size_t kg_data_size, char* rgba_pixels, bool half_float, char *buffer, char *rng_state, bool progressive, 
+        int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w)
+{
+    mpi_kernel_struct data;
+    getMpiKernelData(&data, MPI_TAG_mpi_path_trace);
+
+    data.mpi_path_trace_data.buffer = (DEVICE_PTR) buffer;
+    data.mpi_path_trace_data.rng_state = (DEVICE_PTR) rng_state;
+    data.mpi_path_trace_data.start_sample = start_sample;
+    data.mpi_path_trace_data.num_samples = num_samples;
+    data.mpi_path_trace_data.progressive = progressive;
+    data.mpi_path_trace_data.tile_x = tile_x;
+    data.mpi_path_trace_data.tile_y = tile_y;
+    data.mpi_path_trace_data.offset = offset;
+    data.mpi_path_trace_data.stride = stride;
+    data.mpi_path_trace_data.tile_h = tile_h;
+    data.mpi_path_trace_data.tile_w = tile_w;
+    data.mpi_path_trace_data.kg_data_size = kg_data_size;
+    data.mpi_path_trace_data.rgba_pixels = (DEVICE_PTR) rgba_pixels;
+    data.mpi_path_trace_data.half_float = half_float;    
+
+    sendMpiKernelData(&data);
+}
+/////////////////////////////////////////////////////////////////////
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/mpi/kernel_mpi.h b/intern/cycles/kernel/kernels/mpi/kernel_mpi.h
new file mode 100644
index 0000000000000000000000000000000000000000..1252ea6e5b5cf1db1e326e915ccbffd9d2ae0e0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/mpi/kernel_mpi.h
@@ -0,0 +1,35 @@
+#ifndef __KERNEL_MPI_H__
+#define __KERNEL_MPI_H__
+
+#include "client_api.h"
+
+CCL_NAMESPACE_BEGIN
+
+void mpi_path_trace(size_t kg_data_size, char* rgba_pixels, bool half_float, char *buffer, char *rng_state, bool progressive, 
+        int start_sample, int num_samples, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w);
+
+void mpi_alloc_kg(bool enable_mics);
+void mpi_free_kg();
+
+void mpi_mem_alloc(const char *name, DEVICE_PTR mem, size_t memSize);
+void mpi_mem_copy_to(DEVICE_PTR mem, size_t memSize, size_t offset);
+void mpi_mem_zero(DEVICE_PTR mem, size_t memSize, size_t offset);
+void mpi_mem_free(DEVICE_PTR mem, size_t memSize);
+void mpi_tex_free(const char *name, DEVICE_PTR mem, size_t memSize);
+
+void mpi_const_copy(const char *name, char *host, size_t size);
+void mpi_tex_copy(const char *name,
+        DEVICE_PTR mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension);
+
+int getCountOfDevices();
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_MPI_H__ */
+
diff --git a/intern/cycles/kernel/kernels/omp/CMakeLists.txt b/intern/cycles/kernel/kernels/omp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c31a53fe7f64018018fd4f8cd34fa2999fc5ec0f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/CMakeLists.txt
@@ -0,0 +1,26 @@
+
+set(INC
+	.
+	../../../kernel
+	../../../util
+        ../../../kernel/osl
+        ../../../../../it4i/client/api
+)
+
+set(SRC
+	kernel_omp.cpp
+)
+
+set(SRC_HEADERS
+	kernel_compat_omp.h
+	kernel_omp.h
+)
+
+if(WITH_IT4I_MIC_OFFLOAD)
+  add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")  # -xCORE-AVX2 is an Intel compiler option; other compilers reject it
+  set_source_files_properties(kernel_omp.cpp PROPERTIES COMPILE_FLAGS "-xCORE-AVX2")
+endif()
+include_directories(${INC})
+add_library(cycles_kernel_omp ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h b/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1145bdcc1f0cdbc6194a125cd52a05876c5658b7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_compat_omp.h
@@ -0,0 +1,513 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_COMPAT_OMP_H__
+#define __KERNEL_COMPAT_OMP_H__
+
+#define __KERNEL_CPU__
+
+/* Release kernel has too much false-positive maybe-uninitialized warnings,
+ * which makes it possible to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+#include "util_debug.h"
+#include "util_math.h"
+#include "util_simd.h"
+#include "util_half.h"
+#include "util_types.h"
+
+#define ccl_addr_space
+
+/* On x86_64, versions of glibc < 2.16 have an issue where expf is
+ * much slower than the double version.  This was fixed in glibc 2.16.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+     defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
+     (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap it in
+ * a macro which is empty for other devices */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays and after inlining fetch hopefully revert to being a simple
+ * pointer lookup. */
+
+template<typename T> struct texture  {
+	ccl_always_inline T fetch(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return data[index];
+	}
+
+#ifdef __KERNEL_SSE2__
+	ccl_always_inline ssef fetch_ssef(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssef*)data)[index];
+	}
+
+	ccl_always_inline ssei fetch_ssei(int index)
+	{
+		kernel_assert(index >= 0 && index < width);
+		return ((ssei*)data)[index];
+	}
+#endif
+
+	T *data;
+	int width;
+};
+
+template<typename T> struct texture_image  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+	ccl_always_inline float4 read(float4 r)
+	{
+		return r;
+	}
+
+	ccl_always_inline float4 read(uchar4 r)
+	{
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+
+	ccl_always_inline int wrap_periodic(int x, int width)
+	{
+		x %= width;
+		if(x < 0)
+			x += width;
+		return x;
+	}
+
+	ccl_always_inline int wrap_clamp(int x, int width)
+	{
+		return clamp(x, 0, width-1);
+	}
+
+	ccl_always_inline float frac(float x, int *ix)
+	{
+		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
+		*ix = i;
+		return x - (float)i;
+	}
+
+	ccl_always_inline float4 interp(float x, float y)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, nix, niy;
+
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+			}
+			return read(data[ix + iy*width]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+			}
+
+			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
+			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
+			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
+			r += ty*tx*read(data[nix + niy*width]);
+
+			return r;
+		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Some helper macro to keep code reasonable size,
+			 * let compiler to inline all the matrix multiplications.
+			 */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+			(v[col] * (u[0] * DATA(0, col) + \
+			           u[1] * DATA(1, col) + \
+			           u[2] * DATA(2, col) + \
+			           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+
+#undef TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline float4 interp_3d(float x, float y, float z)
+	{
+		return interp_3d_ex(x, y, z, interpolation);
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		int ix, iy, iz, nix, niy, niz;
+
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			frac(z*(float)depth, &iz);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+			}
+
+			return read(data[ix + iy*width + iz*width*height]);
+		}
+		else if(interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			float tz = frac(z*(float)depth - 0.5f, &iz);
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+			}
+
+			float4 r;
+
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+
+			return r;
+		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+					piz = wrap_periodic(iz-1, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					nniz = wrap_periodic(iz+2, depth);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || z < 0.0f ||
+					   x > 1.0f || y > 1.0f || z > 1.0f)
+					{
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+					piz = wrap_clamp(iz-1, depth);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+					nniz = wrap_clamp(iz+2, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+				default:
+					kernel_assert(0);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Some helper macro to keep code reasonable size,
+			 * let compiler to inline all the matrix multiplications.
+			 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+		}
+	}
+
+	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
+	{
+		width = width_;
+		height = height_;
+		depth = depth_;
+	}
+
+	T *data;
+	int interpolation;
+	ExtensionType extension;
+	int width, height, depth;
+#undef SET_CUBIC_SPLINE_WEIGHTS
+};
+
+typedef texture<float4> texture_float4;
+typedef texture<float2> texture_float2;
+typedef texture<float> texture_float;
+typedef texture<uint> texture_uint;
+typedef texture<int> texture_int;
+typedef texture<uint4> texture_uint4;
+typedef texture<uchar4> texture_uchar4;
+typedef texture_image<float4> texture_image_float4;
+typedef texture_image<uchar4> texture_image_uchar4;
+
+/* Macros to handle different memory storage on different devices */
+
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
+#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
+#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
+#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
+#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
+
+#define kernel_data (kg->__data)
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b& a)
+{
+	print_sseb(label, a.x);
+	print_sseb(label, a.y);
+	print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f& a)
+{
+	print_ssef(label, a.x);
+	print_ssef(label, a.y);
+	print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i& a)
+{
+	print_ssei(label, a.x);
+	print_ssei(label, a.y);
+	print_ssei(label, a.z);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_COMPAT_OMP_H__ */
+
diff --git a/intern/cycles/kernel/kernels/omp/kernel_omp.cpp b/intern/cycles/kernel/kernels/omp/kernel_omp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6893281c8b0df87425db53dd87893b611cf95bcd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_omp.cpp
@@ -0,0 +1,302 @@
+#include "kernel_omp.h"
+
+#include "kernel_compat_omp.h"
+
+#include "kernel.h"
+
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+#include <omp.h>
+
+//#define NUM_THREADS 240
+#define SIZE_T long
+
+CCL_NAMESPACE_BEGIN
+/* Only built without WITH_IT4I_MIC_OFFLOAD: prints message, file and line to stdout and returns. */
+#ifndef WITH_IT4I_MIC_OFFLOAD
+void cwassert(const char *message, const char *file, unsigned line)
+{
+    printf("ASSERT: %s, %s, %u\n", message, file, line); /* %u: line is unsigned */
+}
+#endif
+/* Memory copy: store size bytes from host_bin in kg->__data and remember size in kg->__data_size. */
+void omp_const_copy_internal(DEVICE_PTR kg_bin, char *host_bin, size_t size)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    if (size > sizeof(kg->__data)) { assert(0); return; } /* refuse to write past the constant block */
+    memcpy(&kg->__data, host_bin, size);
+    kg->__data_size = size;
+}
+/* Upload constant memory by name; "__data" is the only name handled here (numDevice is unused). */
+void omp_const_copy(int numDevice, DEVICE_PTR kg_bin, const char *name, char *host_bin, size_t size)
+{
+    if (strcmp(name, "__data") != 0)
+    {
+        assert(0); /* any other constant name is rejected */
+        return;
+    }
+    omp_const_copy_internal(kg_bin, host_bin, size);
+}
+/*
+ * Bind host memory to a named texture slot of the kernel globals.
+ * Arrays declared through KERNEL_TEX are matched by exact name and get data/width set.
+ * Images are named "__tex_image_float_NNN" or "__tex_image_NNN"; NNN is parsed with atoi().
+ * Names that match nothing, or slot numbers out of range, are ignored. `size` is not used.
+ */
+void omp_tex_copy_internal(DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    /* Image names carry the slot number directly after one of these prefixes. */
+    const char float_prefix[] = "__tex_image_float_";
+    const char byte_prefix[] = "__tex_image_";
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+
+    /* Empty head so that every KERNEL_TEX expansion can start with "else if". */
+    if (0)
+    {
+    }
+#define KERNEL_TEX(type, ttype, tname) \
+    else if (strcmp(name, #tname) == 0) { \
+        kg->tname.data = (type*)mem; \
+        kg->tname.width = width; \
+    }
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+
+    /* Float images are tested first: their names also start with byte_prefix. */
+    else if (strncmp(name, float_prefix, sizeof(float_prefix) - 1) == 0)
+    {
+        int array_index = atoi(name + (sizeof(float_prefix) - 1));
+
+        if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES)
+        {
+            texture_image_float4 *tex = &kg->texture_float_images[array_index];
+
+            tex->data = (float4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+    /* Byte images: the number in the name is offset by MAX_FLOAT_IMAGES. */
+    else if (strncmp(name, byte_prefix, sizeof(byte_prefix) - 1) == 0)
+    {
+        int array_index = atoi(name + (sizeof(byte_prefix) - 1)) - MAX_FLOAT_IMAGES;
+
+        if (array_index >= 0 && array_index < MAX_BYTE_IMAGES)
+        {
+            texture_image_uchar4 *tex = &kg->texture_byte_images[array_index];
+
+            tex->data = (uchar4*) mem;
+            tex->dimensions_set(width, height, depth);
+            tex->interpolation = interpolation;
+            tex->extension = (ExtensionType) extension;
+        }
+    }
+}
+
+/*
+ * Public texture upload entry point of the OpenMP backend.
+ *
+ * numDevice is not used here; a call with a NULL name or a NULL memory pointer is
+ * ignored. Everything else is forwarded unchanged to omp_tex_copy_internal().
+ */
+void omp_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name_bin,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension)
+{
+    if (name_bin == NULL || mem == NULL)
+        return;
+
+    /* the name is only read downstream, so it is passed on as const without a cast */
+    omp_tex_copy_internal(kg_bin, name_bin, mem, size, width, height, depth, interpolation, extension);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/* No-op in this backend: the body is empty and both arguments are ignored. */
+void omp_wait(int numDevice, char *signal_value)
+{
+}
+/*
+ * Convert one pixel of the render buffer to byte RGBA.
+ * The float4 at the start of the pixel's pass data goes through film_map() and
+ * film_float_to_byte(); the result is stored at the same pixel index of _rgba_byte.
+ */
+void omp_film_convert_byte(DEVICE_PTR _kg,
+        char *_rgba_byte, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    KernelGlobals *kg = (KernelGlobals *)_kg;
+
+    /* flat pixel index, shared by source and destination */
+    const int pixel = offset + x + y*stride;
+
+    /* source advances pass_stride floats per pixel, destination one uchar4 per pixel */
+    float *src = buffer + pixel * kernel_data.film.pass_stride;
+    uchar4 *dst = (uchar4 *)_rgba_byte + pixel;
+
+    /* map colors */
+    float4 irradiance = *((ccl_global float4*) src);
+    *dst = film_float_to_byte(film_map(kg, irradiance, sample_scale));
+}
+/*
+ * Convert one pixel of the render buffer to half-float RGBA.
+ * x, y and z are multiplied by the film exposure (w is left alone) before
+ * float4_store_half() is called with the destination at half index pixel * 4 of _rgba.
+ */
+void omp_convert_to_half_float(DEVICE_PTR _kg,
+        char *_rgba, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride)
+{
+    KernelGlobals *kg = (KernelGlobals *)_kg;
+
+    /* buffer offset */
+    const int pixel = offset + x + y*stride;
+    /* the output advances four half values per pixel */
+    half *dst = (half*) _rgba + pixel * 4;
+    float4 color = *((float4*) (buffer + pixel * kernel_data.film.pass_stride));
+
+    const float exposure = kernel_data.film.exposure;
+    if (exposure != 1.0f)
+    {
+        color.x *= exposure;
+        color.y *= exposure;
+        color.z *= exposure;
+    }
+
+    float4_store_half(dst, color, sample_scale);
+}
+/*
+ * Render every sample in [start_sample, end_sample) for each pixel of a tile_w x tile_h tile.
+ * Pixels are spread over nprocs_cpu OpenMP threads (dynamic schedule, chunk size 1). After
+ * each sample, when rgba_byte_bin is non-NULL, the pixel is also converted into that buffer
+ * (half float or byte, selected by is_rgba_float) with a scale of 1 / (sample + 1).
+ * The int behind sample_finished_omp is set to start_sample on entry and to end_sample on
+ * exit; reqFinished_omp is not read.
+ */
+void omp_path_trace_internal(DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu)
+{
+    KernelGlobals *kg = (KernelGlobals *) kg_bin;
+    float *buffer = (float *) buffer_bin;
+    unsigned int *rng_state = (unsigned int*) rng_state_bin;
+    int *sample_finished = (int*) sample_finished_omp;
+    const int num_pixels = tile_h*tile_w;
+
+    *sample_finished = start_sample;
+
+#pragma omp parallel for num_threads(nprocs_cpu) schedule(dynamic, 1)
+    for (int i = 0; i < num_pixels; i++)
+    {
+        /* unflatten i (row-major inside the tile) into absolute pixel coordinates */
+        const int row = i / tile_w;
+        const int px = tile_x + (i - row * tile_w);
+        const int py = tile_y + row;
+
+        for (int sample = start_sample; sample < end_sample; sample++)
+        {
+            kernel_path_trace(kg, buffer, rng_state, sample, px, py, offset, stride);
+
+            if (rgba_byte_bin == NULL)
+                continue;
+
+            /* scale handed to the converters: 1 / (sample + 1) */
+            const float sample_scale = 1.0f / (sample + 1.0f);
+
+            if (is_rgba_float)
+                omp_convert_to_half_float(kg_bin, rgba_byte_bin, buffer, sample_scale, px, py, offset, stride);
+            else
+                omp_film_convert_byte(kg_bin, rgba_byte_bin, buffer, sample_scale, px, py, offset, stride);
+        }
+    }
+
+    /* written once, after the parallel loop has finished */
+    *sample_finished = end_sample;
+}
+/* Public entry point: forwards to omp_path_trace_internal(); numDevice and signal_value are not passed on. */
+void omp_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu, char* signal_value)
+{
+    omp_path_trace_internal(kg_bin, buffer_bin, rng_state_bin, rgba_byte_bin, is_rgba_float, start_sample, end_sample, tile_x, tile_y, offset, stride, tile_h, tile_w, sample_finished_omp, reqFinished_omp, nprocs_cpu);
+}
+/*
+ * Create the kernel globals for this backend.
+ * Returns a heap-allocated KernelGlobals (new KernelGlobals()) cast to DEVICE_PTR;
+ * omp_free_kg() deletes it again. numDevice is not used.
+ */
+DEVICE_PTR omp_alloc_kg(int numDevice)
+{
+    KernelGlobals *globals = new KernelGlobals();
+    return (DEVICE_PTR) globals;
+}
+/* Destroy kernel globals: kg_bin is cast back to KernelGlobals* and deleted.
+ * numDevice is not used. */
+void omp_free_kg(int numDevice, DEVICE_PTR kg_bin)
+{
+    delete (KernelGlobals *) kg_bin;
+}
+/* Empty in this backend: no allocation is performed. */
+void omp_mem_alloc(int numDevice, char *mem, size_t memSize)
+{
+}
+/* Empty in this backend: nothing is copied. */
+void omp_mem_copy_to(int numDevice, char *mem, size_t memSize, char* signal_value)
+{
+}
+/* Empty in this backend: nothing is copied back. */
+void omp_mem_copy_from(int numDevice, char *mem, size_t offset, size_t memSize, char* signal_value)
+{
+}
+/* Fills memSize bytes at mem with zeros (memset); numDevice is ignored. */
+void omp_mem_zero(int numDevice, char *mem, size_t memSize)
+{
+    memset(mem, 0, memSize);
+}
+/* Empty in this backend: nothing is released. */
+void omp_mem_free(int numDevice, char *mem, size_t memSize)
+{
+}
+/* Empty in this backend: the texture binding in the kernel globals is left untouched. */
+void omp_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name_bin, char *mem, size_t memSize)
+{
+}
+/* Returns __data.film.pass_stride of the KernelGlobals behind the handle kg. */
+int omp_get_pass_stride(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data.film.pass_stride;
+}
+/* Returns __data_size of the KernelGlobals behind kg (the value stored by omp_const_copy_internal). */
+size_t omp_get_data_size(DEVICE_PTR kg)
+{
+    return ((KernelGlobals*) kg)->__data_size;
+}
+/* Calls kernel_path_trace() once with the given sample and pixel; _kg is cast to KernelGlobals*. */
+void omp_kernel_path_trace(DEVICE_PTR _kg,
+	float *buffer, unsigned int *rng_state,
+	int sample, int x, int y, int offset, int stride)
+{
+    kernel_path_trace((KernelGlobals*) _kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/omp/kernel_omp.h b/intern/cycles/kernel/kernels/omp/kernel_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b00b30d6eb890a53ec5f9a057096315257135b23
--- /dev/null
+++ b/intern/cycles/kernel/kernels/omp/kernel_omp.h
@@ -0,0 +1,53 @@
+#ifndef __KERNEL_OMP_H__
+#define __KERNEL_OMP_H__
+
+#include "client_api.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path tracing: renders samples [start_sample, end_sample) for every pixel of one tile on nprocs_cpu OpenMP threads. */
+void omp_path_trace(int numDevice, DEVICE_PTR kg_bin, char * buffer_bin, char * rng_state_bin, char* rgba_byte_bin, bool is_rgba_float, int start_sample, int end_sample, int tile_x, int tile_y, int offset, int stride, int tile_h, int tile_w, char *sample_finished_omp, char *reqFinished_omp, int nprocs_cpu, char* signal_value);
+
+/* Device memory: the kernel globals handle is created with new and destroyed with delete. */
+DEVICE_PTR omp_alloc_kg(int numDevice);
+void omp_free_kg(int numDevice, DEVICE_PTR kg);
+/* Buffer management: omp_mem_zero memsets the buffer; the other five functions have empty bodies in kernel_omp.cpp. */
+void omp_mem_alloc(int numDevice, char* mem, size_t memSize);
+void omp_mem_copy_to(int numDevice, char* mem, size_t memSize, char* signal_value);
+void omp_mem_copy_from(int numDevice, char* mem, size_t offset, size_t memSize, char* signal_value);
+void omp_mem_zero(int numDevice, char* mem, size_t memSize);
+void omp_mem_free(int numDevice, char* mem, size_t memSize);
+void omp_tex_free(int numDevice, DEVICE_PTR kg_bin, const char *name, char* mem, size_t memSize);
+/* Constant data (only the name "__data" is handled) and texture binding by name. */
+void omp_const_copy(int numDevice, DEVICE_PTR kg, const char *name, char *host, size_t size);
+void omp_tex_copy(int numDevice, DEVICE_PTR kg_bin,
+        const char *name,
+        char* mem,
+        size_t size,
+        size_t width,
+        size_t height,
+        size_t depth,
+        int interpolation,
+        int extension);
+/* Empty body in kernel_omp.cpp. */
+void omp_wait(int numDevice, char *signal_value);
+/* Converts the pixel at (x, y) of the render buffer to byte RGBA in _rgba_byte. */
+void omp_film_convert_byte(DEVICE_PTR _kg,
+        char *_rgba_byte, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride);
+/* Converts the pixel at (x, y) of the render buffer to half-float RGBA in _rgba. */
+void omp_convert_to_half_float(DEVICE_PTR _kg,
+        char *_rgba, float *buffer,
+        float sample_scale, int x, int y, int offset, int stride);
+/* Accessors for values stored in the kernel globals. */
+int omp_get_pass_stride(DEVICE_PTR kg);
+size_t omp_get_data_size(DEVICE_PTR kg);
+/* Single kernel_path_trace() call for one sample of one pixel. */
+void omp_kernel_path_trace(DEVICE_PTR _kg,
+	float *buffer, unsigned int *rng_state,
+	int sample, int x, int y, int offset, int stride);
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_OMP_H__ */
+
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 17ca6ce0f4848622e1f3fdf06365fca66a3ec4bf..c6a223b289eb5cbb13ec38ea7c88f8d86a7fb0e9 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -67,6 +67,18 @@ set(SRC_HEADERS
 	tile.h
 )
 
+# Turn the IT4I / OpenMP build options into preprocessor defines of the same name
+# (add_definitions is directory-scoped).
+if(WITH_IT4I_MPI)
+	add_definitions(-DWITH_IT4I_MPI)
+endif()
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+endif()
+if(WITH_OPENMP)
+	add_definitions(-DWITH_OPENMP)
+endif()
+
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
 
 include_directories(${INC})
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 5bf5e5113ef8798c8e2f2dc13890f5409a6f4e08..6eb6d75923cad2ad4415b7ec5a11f0a7b1238b36 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -175,9 +175,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		/* needs to be up to data for attribute access */
 		device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-		device->mem_alloc(d_input, MEM_READ_ONLY);
-		device->mem_copy_to(d_input);
-		device->mem_alloc(d_output, MEM_WRITE_ONLY);
+		device->mem_alloc("d_input", d_input, MEM_READ_ONLY);
+		device->mem_copy_to("d_input", d_input);
+		device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY);
 
 		DeviceTask task(DeviceTask::SHADER);
 		task.shader_input = d_input.device_pointer;
@@ -195,15 +195,15 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		device->task_wait();
 
 		if(progress.get_cancel()) {
-			device->mem_free(d_input);
-			device->mem_free(d_output);
+			device->mem_free("d_input", d_input);
+			device->mem_free("d_output", d_output);
 			m_is_baking = false;
 			return false;
 		}
 
-		device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
-		device->mem_free(d_input);
-		device->mem_free(d_output);
+		device->mem_copy_from("d_output", d_output, 0, 1, d_output.size(), sizeof(float4));
+		device->mem_free("d_input", d_input);
+		device->mem_free("d_output", d_output);
 
 		/* read result */
 		int k = 0;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fab3f701757e3d320ec98d8433ae4f530ed2aeb5..a9918f440e1899ac6255305329b65b1554ac752a 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -85,6 +85,10 @@ RenderTile::RenderTile()
 	start_sample = 0;
 	num_samples = 0;
 	resolution = 0;
+        
+        num_samples_orig = 0;
+        progressive = false;        
+        half_float = false;
 
 	offset = 0;
 	stride = 0;
@@ -110,12 +114,12 @@ RenderBuffers::~RenderBuffers()
 void RenderBuffers::device_free()
 {
 	if(buffer.device_pointer) {
-		device->mem_free(buffer);
+		device->mem_free("buffer", buffer);
 		buffer.clear();
 	}
 
 	if(rng_state.device_pointer) {
-		device->mem_free(rng_state);
+		device->mem_free("rng_state", rng_state);
 		rng_state.clear();
 	}
 }
@@ -129,8 +133,8 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	
 	/* allocate buffer */
 	buffer.resize(params.width*params.height*params.get_passes_size());
-	device->mem_alloc(buffer, MEM_READ_WRITE);
-	device->mem_zero(buffer);
+	device->mem_alloc("buffer", buffer, MEM_READ_WRITE);
+	device->mem_zero("buffer", buffer);
 
 	/* allocate rng state */
 	rng_state.resize(params.width, params.height);
@@ -142,8 +146,8 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 		for(y = 0; y < height; y++)
 			init_state[x + y*width] = hash_int_2d(params.full_x+x, params.full_y+y);
 
-	device->mem_alloc(rng_state, MEM_READ_WRITE);
-	device->mem_copy_to(rng_state);
+	device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
+	device->mem_copy_to("rng_state", rng_state);
 }
 
 bool RenderBuffers::copy_from_device()
@@ -151,168 +155,14 @@ bool RenderBuffers::copy_from_device()
 	if(!buffer.device_pointer)
 		return false;
 
-	device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+	device->mem_copy_from("buffer", buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
 
 	return true;
 }
 
 bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels)
 {
-	int pass_offset = 0;
-
-	foreach(Pass& pass, params.passes) {
-		if(pass.type != type) {
-			pass_offset += pass.components;
-			continue;
-		}
-
-		float *in = (float*)buffer.data_pointer + pass_offset;
-		int pass_stride = params.get_passes_size();
-
-		float scale = (pass.filter)? 1.0f/(float)sample: 1.0f;
-		float scale_exposure = (pass.exposure)? scale*exposure: scale;
-
-		int size = params.width*params.height;
-
-		if(components == 1) {
-			assert(pass.components == components);
-
-			/* scalar */
-			if(type == PASS_DEPTH) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = (f == 0.0f)? 1e10f: f*scale_exposure;
-				}
-			}
-			else if(type == PASS_MIST) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = saturate(f*scale_exposure);
-				}
-			}
-#ifdef WITH_CYCLES_DEBUG
-			else if(type == PASS_BVH_TRAVERSAL_STEPS) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = f;
-				}
-			}
-			else if(type == PASS_RAY_BOUNCES) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = f;
-				}
-			}
-#endif
-			else {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = f*scale_exposure;
-				}
-			}
-		}
-		else if(components == 3) {
-			assert(pass.components == 4);
-
-			/* RGBA */
-			if(type == PASS_SHADOW) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
-					float4 f = make_float4(in[0], in[1], in[2], in[3]);
-					float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f;
-
-					pixels[0] = f.x*invw;
-					pixels[1] = f.y*invw;
-					pixels[2] = f.z*invw;
-				}
-			}
-			else if(pass.divide_type != PASS_NONE) {
-				/* RGB lighting passes that need to divide out color */
-				pass_offset = 0;
-				foreach(Pass& color_pass, params.passes) {
-					if(color_pass.type == pass.divide_type)
-						break;
-					pass_offset += color_pass.components;
-				}
-
-				float *in_divide = (float*)buffer.data_pointer + pass_offset;
-
-				for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
-					float3 f = make_float3(in[0], in[1], in[2]);
-					float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
-
-					f = safe_divide_even_color(f*exposure, f_divide);
-
-					pixels[0] = f.x;
-					pixels[1] = f.y;
-					pixels[2] = f.z;
-				}
-			}
-			else {
-				/* RGB/vector */
-				for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
-					float3 f = make_float3(in[0], in[1], in[2]);
-
-					pixels[0] = f.x*scale_exposure;
-					pixels[1] = f.y*scale_exposure;
-					pixels[2] = f.z*scale_exposure;
-				}
-			}
-		}
-		else if(components == 4) {
-			assert(pass.components == components);
-
-			/* RGBA */
-			if(type == PASS_SHADOW) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
-					float4 f = make_float4(in[0], in[1], in[2], in[3]);
-					float invw = (f.w > 0.0f)? 1.0f/f.w: 1.0f;
-
-					pixels[0] = f.x*invw;
-					pixels[1] = f.y*invw;
-					pixels[2] = f.z*invw;
-					pixels[3] = 1.0f;
-				}
-			}
-			else if(type == PASS_MOTION) {
-				/* need to normalize by number of samples accumulated for motion */
-				pass_offset = 0;
-				foreach(Pass& color_pass, params.passes) {
-					if(color_pass.type == PASS_MOTION_WEIGHT)
-						break;
-					pass_offset += color_pass.components;
-				}
-
-				float *in_weight = (float*)buffer.data_pointer + pass_offset;
-
-				for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
-					float4 f = make_float4(in[0], in[1], in[2], in[3]);
-					float w = in_weight[0];
-					float invw = (w > 0.0f)? 1.0f/w: 0.0f;
-
-					pixels[0] = f.x*invw;
-					pixels[1] = f.y*invw;
-					pixels[2] = f.z*invw;
-					pixels[3] = f.w*invw;
-				}
-			}
-			else {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
-					float4 f = make_float4(in[0], in[1], in[2], in[3]);
-
-					pixels[0] = f.x*scale_exposure;
-					pixels[1] = f.y*scale_exposure;
-					pixels[2] = f.z*scale_exposure;
-
-					/* clamp since alpha might be > 1.0 due to russian roulette */
-					pixels[3] = saturate(f.w*scale);
-				}
-			}
-		}
-
-		return true;
-	}
-
-	return false;
+    return device->get_pass_rect(type, exposure, sample, components, pixels, params, (float*) buffer.data_pointer);
 }
 
 /* Display Buffer */
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 4fa1c51d821515c4f2940523d816210bdde2471a..75d2cb90ac4b947c347cef36248bb99dd2e8638d 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -134,10 +134,16 @@ public:
 	int x, y, w, h;
 	int start_sample;
 	int num_samples;
-	int sample;
+        
+        int num_samples_orig;
+        bool progressive;
+	
+        int sample;
 	int resolution;
 	int offset;
 	int stride;
+        
+        bool half_float;
 
 	device_ptr buffer;
 	device_ptr rng_state;
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 0bebdaf8a6712ea5b39e4ca860f806a63f613f6e..001ff9141f219b033d40a137e657b2c958cb31ab 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -61,7 +61,7 @@ void ImageManager::set_osl_texture_system(void *texture_system)
 
 void ImageManager::set_extended_image_limits(const DeviceInfo& info)
 {
-	if(info.type == DEVICE_CPU) {
+	if (info.type == DEVICE_CPU || info.type == DEVICE_OMP || info.type == DEVICE_MPI) {
 		tex_num_images = TEX_EXTENDED_NUM_IMAGES_CPU;
 		tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES;
 		tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START;
@@ -700,9 +700,15 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		device_vector<float4>& tex_img = dscene->tex_float_image[slot];
 
+		string name;
+
+		if(slot >= 100) name = string_printf("__tex_image_float_%d", slot);
+		else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot);
+		else name = string_printf("__tex_image_float_00%d", slot);                
+                
 		if(tex_img.device_pointer) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
+			device->tex_free(name.c_str(), tex_img);
 		}
 
 		if(!file_load_float_image(img, tex_img)) {
@@ -715,12 +721,6 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		string name;
-
-		if(slot >= 100) name = string_printf("__tex_image_float_%d", slot);
-		else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot);
-		else name = string_printf("__tex_image_float_00%d", slot);
-
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
@@ -735,9 +735,15 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		device_vector<uchar4>& tex_img = dscene->tex_image[slot - tex_image_byte_start];
 
+		string name;
+
+		if(slot >= 100) name = string_printf("__tex_image_%d", slot);
+		else if(slot >= 10) name = string_printf("__tex_image_0%d", slot);
+		else name = string_printf("__tex_image_00%d", slot);                
+                
 		if(tex_img.device_pointer) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(tex_img);
+			device->tex_free(name.c_str(), tex_img);
 		}
 
 		if(!file_load_image(img, tex_img)) {
@@ -750,12 +756,6 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 			pixels[3] = (TEX_IMAGE_MISSING_A * 255);
 		}
 
-		string name;
-
-		if(slot >= 100) name = string_printf("__tex_image_%d", slot);
-		else if(slot >= 10) name = string_printf("__tex_image_0%d", slot);
-		else name = string_printf("__tex_image_00%d", slot);
-
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
@@ -794,7 +794,7 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, int sl
 
 			if(tex_img.device_pointer) {
 				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
+				device->tex_free(string_printf("__tex_image_float_%03d", slot).c_str(), tex_img); /* same zero-padded name device_load_image() uses */
 			}
 
 			tex_img.clear();
@@ -807,7 +807,7 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, int sl
 
 			if(tex_img.device_pointer) {
 				thread_scoped_lock device_lock(device_mutex);
-				device->tex_free(tex_img);
+				device->tex_free(string_printf("__tex_image_%03d", slot).c_str(), tex_img); /* same zero-padded name device_load_image() uses */
 			}
 
 			tex_img.clear();
@@ -928,14 +928,14 @@ void ImageManager::device_pack_images(Device *device,
 	if(dscene->tex_image_packed.size()) {
 		if(dscene->tex_image_packed.device_pointer) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_packed);
+			device->tex_free("__tex_image_packed", dscene->tex_image_packed);
 		}
 		device->tex_alloc("__tex_image_packed", dscene->tex_image_packed);
 	}
 	if(dscene->tex_image_packed_info.size()) {
 		if(dscene->tex_image_packed_info.device_pointer) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_packed_info);
+			device->tex_free("__tex_image_packed_info", dscene->tex_image_packed_info);
 		}
 		device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
 	}
@@ -959,8 +959,8 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene)
 	for(size_t slot = 0; slot < float_images.size(); slot++)
 		device_free_image(device, dscene, slot);
 
-	device->tex_free(dscene->tex_image_packed);
-	device->tex_free(dscene->tex_image_packed_info);
+	device->tex_free("__tex_image_packed", dscene->tex_image_packed);
+	device->tex_free("__tex_image_packed_info", dscene->tex_image_packed_info);
 
 	dscene->tex_image_packed.clear();
 	dscene->tex_image_packed_info.clear();
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 47489f6e007cd0e03de1f5a4132ca6ff2bc520a6..9923ac6e171727c9c4e47bf1658687a644988c0c 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -179,7 +179,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
 void Integrator::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->sobol_directions);
+	device->tex_free("__sobol_directions", dscene->sobol_directions);
 	dscene->sobol_directions.clear();
 }
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1637045ce84bf496099778937c34693bf1a7f756..bf7ba6a56d47401c8677e632034f85e58b0054c4 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
-	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("d_input", d_input, MEM_READ_ONLY);
+	device->mem_copy_to("d_input", d_input);
+	device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask main_task(DeviceTask::SHADER);
 	main_task.shader_input = d_input.device_pointer;
@@ -77,11 +77,11 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	foreach(DeviceTask& task, split_tasks) {
 		device->task_add(task);
 		device->task_wait();
-		device->mem_copy_from(d_output, task.shader_x, 1, task.shader_w, sizeof(float4));
+		device->mem_copy_from("d_output", d_output, task.shader_x, 1, task.shader_w, sizeof(float4));
 	}
 
-	device->mem_free(d_input);
-	device->mem_free(d_output);
+	device->mem_free("d_input", d_input);
+	device->mem_free("d_output", d_output);
 
 	d_input.clear();
 
@@ -801,10 +801,10 @@ void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *sce
 
 void LightManager::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->light_distribution);
-	device->tex_free(dscene->light_data);
-	device->tex_free(dscene->light_background_marginal_cdf);
-	device->tex_free(dscene->light_background_conditional_cdf);
+	device->tex_free("__light_distribution", dscene->light_distribution);
+	device->tex_free("__light_data", dscene->light_data);
+	device->tex_free("__light_background_marginal_cdf", dscene->light_background_marginal_cdf);
+	device->tex_free("__light_background_conditional_cdf", dscene->light_background_conditional_cdf);
 
 	dscene->light_distribution.clear();
 	dscene->light_data.clear();
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 705483112a17fbdcff0063c1ca723f94aaa572a4..df0956f127f1292e75546bc6f8effc8b05ab5a33 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1332,24 +1332,24 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 
 void MeshManager::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->bvh_nodes);
-	device->tex_free(dscene->bvh_leaf_nodes);
-	device->tex_free(dscene->object_node);
-	device->tex_free(dscene->tri_woop);
-	device->tex_free(dscene->prim_type);
-	device->tex_free(dscene->prim_visibility);
-	device->tex_free(dscene->prim_index);
-	device->tex_free(dscene->prim_object);
-	device->tex_free(dscene->tri_shader);
-	device->tex_free(dscene->tri_vnormal);
-	device->tex_free(dscene->tri_vindex);
-	device->tex_free(dscene->tri_verts);
-	device->tex_free(dscene->curves);
-	device->tex_free(dscene->curve_keys);
-	device->tex_free(dscene->attributes_map);
-	device->tex_free(dscene->attributes_float);
-	device->tex_free(dscene->attributes_float3);
-	device->tex_free(dscene->attributes_uchar4);
+	device->tex_free("__bvh_nodes", dscene->bvh_nodes);
+	device->tex_free("__bvh_leaf_nodes", dscene->bvh_leaf_nodes);
+	device->tex_free("__object_node", dscene->object_node);
+	device->tex_free("__tri_woop", dscene->tri_woop);
+	device->tex_free("__prim_type", dscene->prim_type);
+	device->tex_free("__prim_visibility", dscene->prim_visibility);
+	device->tex_free("__prim_index", dscene->prim_index);
+	device->tex_free("__prim_object", dscene->prim_object);
+	device->tex_free("__tri_shader", dscene->tri_shader);
+	device->tex_free("__tri_vnormal", dscene->tri_vnormal);
+	device->tex_free("__tri_vindex", dscene->tri_vindex);
+	device->tex_free("__tri_verts", dscene->tri_verts);
+	device->tex_free("__curves", dscene->curves);
+	device->tex_free("__curve_keys", dscene->curve_keys);
+	device->tex_free("__attributes_map", dscene->attributes_map);
+	device->tex_free("__attributes_float", dscene->attributes_float);
+	device->tex_free("__attributes_float3", dscene->attributes_float3);
+	device->tex_free("__attributes_uchar4", dscene->attributes_uchar4);
 
 	dscene->bvh_nodes.clear();
 	dscene->object_node.clear();
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index dccfd74f17a81a326340406875511654156b5a81..184b4b7e910fee9ef0e5e8a30159ab35bdd8897f 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -110,9 +110,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	/* needs to be up to data for attribute access */
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
-	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("d_input", d_input, MEM_READ_ONLY);
+	device->mem_copy_to("d_input", d_input);
+	device->mem_alloc("d_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask task(DeviceTask::SHADER);
 	task.shader_input = d_input.device_pointer;
@@ -127,14 +127,14 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	device->task_wait();
 
 	if(progress.get_cancel()) {
-		device->mem_free(d_input);
-		device->mem_free(d_output);
+		device->mem_free("d_input", d_input);
+		device->mem_free("d_output", d_output);
 		return false;
 	}
 
-	device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
-	device->mem_free(d_input);
-	device->mem_free(d_output);
+	device->mem_copy_from("d_output", d_output, 0, 1, d_output.size(), sizeof(float4));
+	device->mem_free("d_input", d_input);
+	device->mem_free("d_output", d_output);
 
 	/* read result */
 	done.clear();
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index ec85aa8f80bd5f9eca9b887754d600e040a6bfb7..230ba7675ec5a096f80db3453a9b330c458267dd 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -471,13 +471,13 @@ void ObjectManager::device_update_flags(Device *device,
 
 void ObjectManager::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->objects);
+	device->tex_free("__objects", dscene->objects);
 	dscene->objects.clear();
 
-	device->tex_free(dscene->objects_vector);
+	device->tex_free("__objects_vector", dscene->objects_vector);
 	dscene->objects_vector.clear();
 
-	device->tex_free(dscene->object_flag);
+	device->tex_free("__object_flag", dscene->object_flag);
 	dscene->object_flag.clear();
 }
 
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 8f9e8c6d6391cf24a21f8b5a02001704a2a0ed3f..9a3751ae21616bc7f18be7414be5ec1354cd820a 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -111,7 +111,7 @@ void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, S
 
 void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->particles);
+	device->tex_free("__particles", dscene->particles);
 	dscene->particles.clear();
 }
 
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 9842cd10a1e170fe7a4869546eefbde6079ef930..5efae1ec40d59ed0abf9a25a0a7f42a8d48b20f7 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -60,7 +60,7 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
 	bake_manager = new BakeManager();
 
 	/* OSL only works on the CPU */
-	if(device_info_.type == DEVICE_CPU)
+	if(device_info_.type == DEVICE_CPU ||device_info_.type == DEVICE_MPI || device_info_.type == DEVICE_OMP)
 		shader_manager = ShaderManager::create(this, params.shadingsystem);
 	else
 		shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM);
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 84a420ce9b68229883386de6c2cdc14fbb0073d7..1a36f3e4fd39601f0d3d9fd2db66fd0a875846cf 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -49,7 +49,7 @@ Session::Session(const SessionParams& params_)
        max(params.device.multi_devices.size(), 1)),
   stats()
 {
-	device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background);
+	device_use_gl = ((params.device.type != DEVICE_CPU && params.device.type != DEVICE_MPI && params.device.type != DEVICE_OMP) && !params.background);
 
 	TaskScheduler::init(params.threads);
 
@@ -376,6 +376,11 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 	rtile.h = tile.h;
 	rtile.start_sample = tile_manager.state.sample;
 	rtile.num_samples = tile_manager.state.num_samples;
+        
+        rtile.num_samples_orig = tile_manager.num_samples;
+        rtile.progressive = tile_manager.progressive;        
+        rtile.half_float = (display != NULL) ? display->half_float : false;
+        
 	rtile.resolution = tile_manager.state.resolution_divider;
 
 	tile_lock.unlock();
@@ -830,7 +835,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	/* update status */
 	string status, substatus;
 
-	if(!params.progressive) {
+	if (!params.progressive) {
 		const int progress_sample = progress.get_sample(), num_samples = tile_manager.num_samples;
 		const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
 		const bool is_multidevice = params.device.multi_devices.size() > 1;
@@ -839,8 +844,8 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
-		if((is_gpu && !is_multidevice && !device->info.use_split_kernel) ||
-		   (is_cpu && (num_tiles == 1 || is_last_tile)))
+		if ((is_gpu && !is_multidevice && !device->info.use_split_kernel) ||
+			(is_cpu && (num_tiles == 1 || is_last_tile)))
 		{
 			/* When using split-kernel (OpenCL) each thread in a tile will be working on a different
 			 * sample. Can't display sample number when device uses split-kernel
@@ -854,12 +859,12 @@ void Session::update_status_time(bool show_pause, bool show_done)
 			 */
 
 			int status_sample = progress_sample;
-			if(tile > 1) {
+			if (tile > 1) {
 				/* sample counter is global for all tiles, subtract samples
 				 * from already finished tiles to get sample counter for
 				 * current tile only
 				 */
-				if(is_cpu && is_last_tile && num_tiles > 1) {
+				if (is_cpu && is_last_tile && num_tiles > 1) {
 					status_sample = num_samples - (num_samples * num_tiles - progress_sample);
 				}
 				else {
@@ -869,6 +874,10 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 			substatus += string_printf(", Sample %d/%d", status_sample, num_samples);
 		}
+		else if (params.device.type == DEVICE_MPI || params.device.type == DEVICE_OMP)
+		{
+			substatus = string_printf("Path Tracing Tile %d/%d", device->get_tile_id(), device->get_num_tiles());
+		}
 	}
 	else if(tile_manager.num_samples == INT_MAX)
 		substatus = string_printf("Path Tracing Sample %d", sample+1);
@@ -900,9 +909,10 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	progress.set_tile(tile, tile_time);
 }
 
-void Session::update_progress_sample()
+void Session::update_progress_sample(int s)
 {
-	progress.increment_sample();
+	/* Store the sample count passed in by the caller as an absolute value instead of incrementing. */
+	progress.set_sample(s);
 }
 
 void Session::path_trace()
@@ -914,7 +924,7 @@ void Session::path_trace()
 	task.release_tile = function_bind(&Session::release_tile, this, _1);
 	task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
 	task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
-	task.update_progress_sample = function_bind(&Session::update_progress_sample, this);
+	task.update_progress_sample = function_bind(&Session::update_progress_sample, this, _1);
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 	task.requested_tile_size = params.tile_size;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c669bccd34bf028f5ce675fd4a7d9c51d6a0ed85..0ac6fffc49df000ecb4d94116f980570811f42b8 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -174,7 +174,7 @@ protected:
 	void update_tile_sample(RenderTile& tile);
 	void release_tile(RenderTile& tile);
 
-	void update_progress_sample();
+	void update_progress_sample(int s);
 
 	bool device_use_gl;
 
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 09a6061abea8561b4f784835deef3fb3f54fff5b..6b8b82cee36d434dcaee4bb7ea173150f01d34a5 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -331,7 +331,7 @@ void ShaderManager::device_update_common(Device *device,
                                          Scene *scene,
                                          Progress& /*progress*/)
 {
-	device->tex_free(dscene->shader_flag);
+	device->tex_free("__shader_flag", dscene->shader_flag);
 	dscene->shader_flag.clear();
 
 	if(scene->shaders.size() == 0)
@@ -423,7 +423,7 @@ void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scen
 		beckmann_table_offset = TABLE_OFFSET_INVALID;
 	}
 
-	device->tex_free(dscene->shader_flag);
+	device->tex_free("__shader_flag", dscene->shader_flag);
 	dscene->shader_flag.clear();
 }
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index f3d39c1bd72ddd01923213c90106ae1c50a6d290..6228d0b60bf5f5846586830438de625d32667859 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -103,7 +103,7 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 {
 	device_free_common(device, dscene, scene);
 
-	device->tex_free(dscene->svm_nodes);
+	device->tex_free("__svm_nodes", dscene->svm_nodes);
 	dscene->svm_nodes.clear();
 }
 
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index ad3f486607229e81349754232ef5ad531c3af16d..f0c915d3851c0c45a4f085adde5888550d471c99 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -42,7 +42,7 @@ void LookupTables::device_update(Device *device, DeviceScene *dscene)
 	if(!need_update)
 		return;
 
-	device->tex_free(dscene->lookup_table);
+	device->tex_free("__lookup_table", dscene->lookup_table);
 
 	if(lookup_tables.size() > 0)
 		device->tex_alloc("__lookup_table", dscene->lookup_table);
@@ -52,7 +52,7 @@ void LookupTables::device_update(Device *device, DeviceScene *dscene)
 
 void LookupTables::device_free(Device *device, DeviceScene *dscene)
 {
-	device->tex_free(dscene->lookup_table);
+	device->tex_free("__lookup_table", dscene->lookup_table);
 	dscene->lookup_table.clear();
 }
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 3fb60735b65a8fde0cd0cf45281dc84d0f142039..a6a764d6e1461001809a299676deb6919e602e13 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -110,6 +110,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 	params = params_;
 
 	int divider = 1;
+        
+#if !defined(WITH_IT4I_MIC_OFFLOAD) && !defined(WITH_IT4I_MPI) && !defined(WITH_OPENMP)
 	int w = params.width, h = params.height;
 
 	if(start_resolution != INT_MAX) {
@@ -120,7 +122,7 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 			divider *= 2;
 		}
 	}
-
+#endif
 	num_samples = num_samples_;
 
 	state.buffer = BufferParams();
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 700e00c9e0ad3ef0daa4bd0ee0444ac46f4347fe..48ce0cd891c8d95ee340dd2f3849ac75988c60b3 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -70,6 +70,7 @@ public:
 	} state;
 
 	int num_samples;
+        bool progressive;
 
 	TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution,
 	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
@@ -85,8 +86,7 @@ public:
 protected:
 
 	void set_tiles();
-
-	bool progressive;
+	
 	int2 tile_size;
 	TileOrder tile_order;
 	int start_resolution;
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 1fef0bd044e97cf131c971b34e08f4418bd235ed..2bff9a1322952c884614600901ad077b40bb08c3 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -18,6 +18,7 @@
 #define __UTIL_OPTIMIZATION_H__
 
 #ifndef __KERNEL_GPU__
+#ifndef __KERNEL_MIC__
 
 /* quiet unused define warnings */
 #if defined(__KERNEL_SSE2__)  || \
@@ -117,5 +118,7 @@
 
 #endif
 
+#endif
+
 #endif /* __UTIL_OPTIMIZATION_H__ */
 
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 0b35142ddb36d9a29250b32a71f538387b9d0c4b..d21594f7833cb69addbbffd81ec4944f43bd2127 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -192,6 +192,13 @@ public:
 
 		sample++;
 	}
+        
+	/* Overwrite the sample counter with the absolute value s while holding progress_mutex. */
+	void set_sample(int s)
+	{
+		thread_scoped_lock lock(progress_mutex);
+		sample = s;
+	}
 
 	void increment_sample_update()
 	{
diff --git a/it4i/client/CMakeLists.txt b/it4i/client/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b009608692c434ce76350d8dc4db020a50bcf84
--- /dev/null
+++ b/it4i/client/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 2.8)
+
+project(blender_client)
+
+option(WITH_IT4I_MIC_OFFLOAD "Enable MIC (has to be supported by the compiler)" OFF)
+option(WITH_IT4I_MIC_NATIVE "Enable MIC_NATIVE (has to be supported by the compiler)" OFF)
+
+# Reject the unsupported option combination before any flag or cache entry is touched.
+if(WITH_IT4I_MIC_NATIVE AND WITH_IT4I_MIC_OFFLOAD)
+    message(FATAL_ERROR "The flags WITH_IT4I_MIC_NATIVE and WITH_IT4I_MIC_OFFLOAD are not compatible.")
+endif()
+
+# Per-configuration flags; a native MIC build adds -mmic and sets the "-mic" target-name suffix.
+if(WITH_IT4I_MIC_NATIVE)
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -mmic" CACHE STRING "CMAKE_CXX_FLAGS_RELEASE" FORCE)
+    set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -mmic" CACHE STRING "CMAKE_CXX_FLAGS_DEBUG" FORCE)
+    set(MIC_FLAG "-mic")
+else()
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "CMAKE_CXX_FLAGS_RELEASE" FORCE)
+    set(CMAKE_CXX_FLAGS_DEBUG "-g -O0" CACHE STRING "CMAKE_CXX_FLAGS_DEBUG" FORCE)
+endif()
+
+# Append -qopenmp instead of overwriting, so flags supplied by the user (CMAKE_CXX_FLAGS, CXXFLAGS) are kept.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopenmp")
+
+# Define the CCL namespace macros as empty for every client source file.
+add_definitions(
+    -DCCL_NAMESPACE_BEGIN=
+    -DCCL_NAMESPACE_END=
+)
+
+# Subdirectories
+add_subdirectory(main)
+add_subdirectory(cycles_mpi)
+
+# CPU-side kernel library: cycles_mic for a native MIC build, cycles_omp otherwise.
+if(WITH_IT4I_MIC_NATIVE)
+    add_subdirectory(cycles_mic)
+else()
+    add_subdirectory(cycles_omp)
+endif()
+
+if(WITH_IT4I_MIC_OFFLOAD)
+    add_subdirectory(cycles_mic)
+endif()
+
diff --git a/it4i/client/api/client_api.h b/it4i/client/api/client_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..180eca0949613d5fee2a47c74aeccf7d5af72b7e
--- /dev/null
+++ b/it4i/client/api/client_api.h
@@ -0,0 +1,148 @@
+#ifndef __client_api_H__
+#define __client_api_H__
+
+#include <cstdio>
+
+/////////////////////////cycles//////////////////////////////////////
+#define MPI_TAG_mpi_cycles_start				1000
+
+#define MPI_TAG_mpi_const_copy					1001
+#define MPI_TAG_mpi_tex_copy					1002
+#define MPI_TAG_mpi_path_trace					1003
+#define MPI_TAG_mpi_branched_path_trace			1004
+#define MPI_TAG_mpi_film_convert_half			1005
+#define MPI_TAG_mpi_film_convert_byte			1006
+#define MPI_TAG_mpi_bake						1007
+#define MPI_TAG_mpi_shader						1008
+
+#define MPI_TAG_mpi_alloc_kg					1010
+#define MPI_TAG_mpi_free_kg						1011
+#define MPI_TAG_mpi_mem_alloc					1012
+#define MPI_TAG_mpi_mem_copy_to					1013
+#define MPI_TAG_mpi_mem_copy_from				1014
+#define MPI_TAG_mpi_mem_zero					1015
+#define MPI_TAG_mpi_mem_free					1016
+
+#define MPI_TAG_mpi_path_trace_buffer			1017
+#define MPI_TAG_mpi_path_trace_rng_state		1018
+//#define MPI_TAG_mpi_path_trace_buffer_sample	1019
+#define MPI_TAG_mpi_path_trace_rgba             1020
+#define MPI_TAG_mpi_path_trace_rng              1021
+#define MPI_TAG_mpi_tex_copy_data				1022
+#define MPI_TAG_mpi_mem_copy_to_data			1023
+#define MPI_TAG_mpi_mem_copy_from_data			1024
+#define MPI_TAG_mpi_tex_free					1025
+
+#define MPI_TAG_mpi_cycles_end					1999
+
+//////////////////////////////OTHER//////////////////////////////////////
+#define MPI_NAME_MAX_LENGTH    256
+#define DEVICE_PTR unsigned long long
+//#define MAX_NODE_DEVICES 3
+//#define TILE_STEP 4
+/////////////////////////////////CYCLES///////////////////////////////////////////////
+/* Path Tracing: parameters of one path-trace request.
+ * buffer, rng_state and rgba_pixels are handles that the receiver resolves to
+ * local buffers through ptr_map (see mpi_path_trace_offline in cycles_mpi.cpp). */
+struct mpi_path_trace_struct
+{
+	DEVICE_PTR buffer;
+	DEVICE_PTR rng_state;
+	int start_sample;
+	int num_samples;
+	bool progressive;
+	int tile_x;
+	int tile_y;
+	int offset;
+	int stride;
+	int tile_h;
+	int tile_w;
+
+	DEVICE_PTR rgba_pixels;
+	bool half_float;
+	size_t kg_data_size;
+};
+
+/* Film: parameters of a film-convert request. NOTE(review): presumably shared by the film_convert_half and film_convert_byte tags - confirm. */
+struct mpi_film_convert_struct
+{
+	DEVICE_PTR rgba;
+	DEVICE_PTR buffer;
+	float sample_scale;
+	int offset;
+	int stride;
+	int task_x;
+	int task_y;
+	int task_h;
+	int task_w;
+};
+
+/* Shader Evaluation: parameters of a bake request. NOTE(review): presumably sent with MPI_TAG_mpi_bake - confirm. */
+struct mpi_bake_struct
+{
+	DEVICE_PTR input;
+	DEVICE_PTR output;
+	int type;
+	int task_shader_x;
+	int task_shader_w;
+	int offset;
+	int sample;
+};
+
+struct mpi_shader_struct /* same fields as mpi_bake_struct, without offset */
+{
+	DEVICE_PTR input;
+	DEVICE_PTR output;
+	int type;
+	int task_shader_x;
+	int task_shader_w;
+	int sample;
+};
+
+struct mpi_mem_struct
+{
+	DEVICE_PTR mem; /* sender's handle; key into the receiver's ptr_map (cycles_mpi.cpp) */
+	char name[MPI_NAME_MAX_LENGTH]; /* buffer name; mpi_tex_free() forwards it to mic_tex_free() */
+	size_t offset; /* not applied by mpi_mem_copy_to() / mpi_mem_zero() in cycles_mpi.cpp */
+	size_t memSize; /* buffer size in bytes */
+};
+
+struct mpi_const_copy_struct
+{
+	char name[MPI_NAME_MAX_LENGTH]; /* forwarded to omp_const_copy() / mic_const_copy() */
+	DEVICE_PTR host; /* NOTE(review): not read by mpi_const_copy(); presumably the sender's source address - confirm */
+	size_t size; /* number of bytes broadcast from rank 0 */
+};
+
+struct mpi_tex_copy_struct
+{
+	char name[MPI_NAME_MAX_LENGTH]; /* texture name, forwarded to omp_tex_copy() / mic_tex_copy() */
+	DEVICE_PTR mem; /* sender's handle; key into the receiver's ptr_map */
+	size_t size; /* number of bytes allocated and broadcast from rank 0 */
+	size_t width; /* width, height, depth, interpolation and extension are forwarded unchanged */
+	size_t height;
+	size_t depth;
+	int interpolation;
+	int extension;
+};
+
+/* Request record passed to every mpi_* handler in cycles_mpi.cpp: a tag plus one payload slot per request kind.
+ * NOTE(review): mpi_tag presumably holds one of the MPI_TAG_mpi_* values above - confirm against the dispatcher. */
+struct mpi_kernel_struct
+{
+	int mpi_tag;
+	int world_size; /* mpi_path_trace_offline() derives its node count (world_size - 1) from this */
+	int world_rank; /* mpi_path_trace_offline() derives its node index (world_rank - 1) from this */
+
+	/* Cycles request payloads */
+	mpi_path_trace_struct mpi_path_trace_data;
+	mpi_film_convert_struct mpi_film_convert_data;
+	mpi_bake_struct mpi_bake_data;
+	mpi_shader_struct mpi_shader_data;
+	mpi_mem_struct mpi_mem_data;
+	mpi_const_copy_struct mpi_const_copy_data;
+	mpi_tex_copy_struct mpi_tex_copy_data;
+};
+
+#endif /* __client_api_H__ */
+
diff --git a/it4i/client/cycles_mic/CMakeLists.txt b/it4i/client/cycles_mic/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..102e0bd8a871fe270aa4112d95ebecb8539c9857
--- /dev/null
+++ b/it4i/client/cycles_mic/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Builds the cycles_mic shared library from the Cycles MIC kernel source.
+
+set(INC
+	.
+	../../../intern/cycles/util
+	../../../intern/cycles/kernel
+	../../../intern/cycles/kernel/kernels/mic
+	../../../intern/cycles/kernel/kernels/mpi
+	../../../intern/cycles/kernel/kernels/omp
+	../api
+	${MPI_INCLUDE_DIR}
+)
+
+set(SRC ../../../intern/cycles/kernel/kernels/mic/kernel_mic.cpp)
+
+add_definitions(-DBLENDER_CLIENT)
+
+# Native build: define WITH_IT4I_MIC_NATIVE and compile the kernel with -qoffload=none.
+if(WITH_IT4I_MIC_NATIVE)
+	add_definitions(-DWITH_IT4I_MIC_NATIVE)
+	set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS "-qoffload=none")
+endif()
+
+# Offload build: define WITH_IT4I_MIC_OFFLOAD and compile the kernel with
+# -qoffload-attribute-target=mic.
+if(WITH_IT4I_MIC_OFFLOAD)
+	add_definitions(-DWITH_IT4I_MIC_OFFLOAD)
+	set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS "-qoffload-attribute-target=mic")
+endif()
+
+include_directories(${INC})
+add_library(cycles_mic${MIC_FLAG} SHARED ${SRC})
+
+install(TARGETS cycles_mic${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/cycles_mpi/CMakeLists.txt b/it4i/client/cycles_mpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a44ae8d66f80a73d07d54059a0009649a67717ec
--- /dev/null
+++ b/it4i/client/cycles_mpi/CMakeLists.txt
@@ -0,0 +1,47 @@
+# Builds the cycles_mpi shared library and links it against MPI and the kernel libraries.
+
+set(INC
+	.
+	../../../intern/cycles/util
+	../../../intern/cycles/kernel
+	../../../intern/cycles/kernel/kernels/mic
+	../../../intern/cycles/kernel/kernels/mpi
+	../../../intern/cycles/kernel/kernels/omp
+	../api
+	${MPI_INCLUDE_DIR}
+)
+
+set(SRC cycles_mpi.cpp)
+set(SRC_HEADERS cycles_mpi.h)
+
+add_definitions(-DBLENDER_CLIENT)
+
+# Pass each enabled MIC option on to the compiler as a macro of the same name.
+foreach(mic_option WITH_IT4I_MIC_NATIVE WITH_IT4I_MIC_OFFLOAD)
+	if(${mic_option})
+		add_definitions(-D${mic_option})
+	endif()
+endforeach()
+
+include_directories(${INC})
+add_library(cycles_mpi${MIC_FLAG} SHARED ${SRC} ${SRC_HEADERS})
+target_link_libraries(cycles_mpi${MIC_FLAG} ${MPI_LIB_FILE})
+
+# Kernel libraries to link, in order: cycles_mic for a native build, otherwise
+# cycles_omp; cycles_mic is appended (again) when offloading is enabled.
+set(kernel_libs "")
+if(WITH_IT4I_MIC_NATIVE)
+	list(APPEND kernel_libs cycles_mic${MIC_FLAG})
+else()
+	list(APPEND kernel_libs cycles_omp${MIC_FLAG})
+endif()
+if(WITH_IT4I_MIC_OFFLOAD)
+	list(APPEND kernel_libs cycles_mic${MIC_FLAG})
+endif()
+
+foreach(kernel_lib ${kernel_libs})
+	add_dependencies(cycles_mpi${MIC_FLAG} ${kernel_lib})
+	target_link_libraries(cycles_mpi${MIC_FLAG} ${kernel_lib})
+endforeach()
+
+install(TARGETS cycles_mpi${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/cycles_mpi/cycles_mpi.cpp b/it4i/client/cycles_mpi/cycles_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..073d09979f1ae2bc1e510b24e2ecaabaf723d385
--- /dev/null
+++ b/it4i/client/cycles_mpi/cycles_mpi.cpp
@@ -0,0 +1,1169 @@
+#include "cycles_mpi.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include <map>
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+#include "kernel_mic.h"
+#endif
+
+#ifdef WITH_IT4I_MIC_NATIVE
+#include "kernel_mic.h"
+#else
+#include "kernel_omp.h"
+#endif
+
+#include <omp.h>
+#include <mpi.h>
+
+#include <unistd.h>
+
+#define SIZEOF_UCHAR4 (sizeof(unsigned char)*4)
+
+CCL_NAMESPACE_BEGIN
+
+struct sMpiData
+{
+    DEVICE_PTR kernel_globals_cpu; /* handle returned by omp_alloc_kg(-1) or mic_alloc_kg(-1) */
+    std::vector<DEVICE_PTR> kernel_globals_mics; /* one handle per offload device, from mic_alloc_kg(dev) */
+    std::map<DEVICE_PTR, DEVICE_PTR> ptr_map; /* sender's memory handle -> local buffer created with new char[] */
+};
+
+sMpiData *mpiData = NULL; /* created (and re-created) by mpi_alloc_kg() */
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+
+/* Split s at each occurrence of delimiter and append the pieces to result.
+ *
+ * Only pieces that are followed by a delimiter are appended; whatever
+ * remains after the last delimiter is dropped. micFindDevices() uses this
+ * to count newline-terminated lines.
+ */
+void split(std::vector<std::string> &result, std::string s, std::string delimiter)
+{
+    const size_t step = delimiter.length();
+    size_t begin = 0;
+    size_t end = s.find(delimiter, begin);
+
+    while (end != std::string::npos)
+    {
+        result.push_back(s.substr(begin, end - begin));
+        begin = end + step;
+        end = s.find(delimiter, begin);
+    }
+}
+
+/* Count the MIC devices visible on this machine.
+ *
+ * Windows always reports 1 ("fakeMIC") and Apple platforms 0. Elsewhere the
+ * output of `micinfo -group Versions | grep 'Device Name'` is read and the
+ * number of newline-terminated lines is returned; 0 if the command cannot be
+ * started or prints nothing.
+ */
+int micFindDevices()
+{
+#if defined(_WIN32)
+    return 1; //"fakeMIC"
+#elif defined(__APPLE__)
+    return 0;
+#else
+    FILE *pipe = popen("micinfo -group Versions | grep 'Device Name'", "r");
+    if (pipe == NULL)
+        return 0;
+
+    char text[4096] = {0};
+    int count = fread(text, 1, sizeof (text) - 1, pipe);
+    text[count] = '\0';
+    pclose(pipe);
+    if (text[0] == '\0')
+        return 0;
+    std::vector<std::string> lines;
+    split(lines, std::string(text), "\n");
+    return lines.size();
+#endif
+}
+#endif
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/* Receive the constant block broadcast by rank 0 and hand it to the local kernel globals.
+ * data.mpi_const_copy_data.size is the byte count; .name is forwarded to *_const_copy().
+ * With MIC offload enabled the same bytes are also passed to every offload device. */
+void mpi_const_copy(mpi_kernel_struct &data)
+{
+    std::vector<char> kg_data(data.mpi_const_copy_data.size);
+    MPI_Bcast(&kg_data[0], data.mpi_const_copy_data.size, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_const_copy(-1, mpiData->kernel_globals_cpu, data.mpi_const_copy_data.name, &kg_data[0], data.mpi_const_copy_data.size);
+#else
+    omp_const_copy(-1, mpiData->kernel_globals_cpu, data.mpi_const_copy_data.name, &kg_data[0], data.mpi_const_copy_data.size);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_const_copy(dev, mpiData->kernel_globals_mics[dev], data.mpi_const_copy_data.name, &kg_data[0], data.mpi_const_copy_data.size);
+    }
+#endif
+}
+
+/* Allocate a local buffer for the texture, fill it from rank 0's broadcast and register it.
+ * The buffer is stored in ptr_map under data.mpi_tex_copy_data.mem and passed, with the
+ * texture's name and dimensions, to *_tex_copy() for the CPU globals and each offload device. */
+void mpi_tex_copy(mpi_kernel_struct &data)
+{
+    mpiData->ptr_map[data.mpi_tex_copy_data.mem] = (DEVICE_PTR)new char[data.mpi_tex_copy_data.size];
+    MPI_Bcast((char*) mpiData->ptr_map[data.mpi_tex_copy_data.mem], data.mpi_tex_copy_data.size, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_tex_copy(-1, mpiData->kernel_globals_cpu,
+            data.mpi_tex_copy_data.name,
+            (char*) mpiData->ptr_map[data.mpi_tex_copy_data.mem],
+            data.mpi_tex_copy_data.size,
+            data.mpi_tex_copy_data.width,
+            data.mpi_tex_copy_data.height,
+            data.mpi_tex_copy_data.depth,
+            data.mpi_tex_copy_data.interpolation,
+            data.mpi_tex_copy_data.extension);
+#else
+    omp_tex_copy(-1, mpiData->kernel_globals_cpu,
+            data.mpi_tex_copy_data.name,
+            (char*) mpiData->ptr_map[data.mpi_tex_copy_data.mem],
+            data.mpi_tex_copy_data.size,
+            data.mpi_tex_copy_data.width,
+            data.mpi_tex_copy_data.height,
+            data.mpi_tex_copy_data.depth,
+            data.mpi_tex_copy_data.interpolation,
+            data.mpi_tex_copy_data.extension);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_tex_copy(dev, mpiData->kernel_globals_mics[dev],
+                data.mpi_tex_copy_data.name,
+                (char*) mpiData->ptr_map[data.mpi_tex_copy_data.mem],
+                data.mpi_tex_copy_data.size,
+                data.mpi_tex_copy_data.width,
+                data.mpi_tex_copy_data.height,
+                data.mpi_tex_copy_data.depth,
+                data.mpi_tex_copy_data.interpolation,
+                data.mpi_tex_copy_data.extension);
+    }
+#endif
+}
+
+/* (Re)create the global mpiData and allocate kernel globals for the CPU side and, with
+ * MIC offload, for each device found by micFindDevices(). Any previous mpiData is deleted. */
+void mpi_alloc_kg(mpi_kernel_struct &data)
+{
+    if (mpiData != NULL)
+    {
+        delete mpiData;
+    }
+    mpiData = new sMpiData();
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mpiData->kernel_globals_cpu = mic_alloc_kg(-1);
+#else
+    mpiData->kernel_globals_cpu = omp_alloc_kg(-1);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    int mics = micFindDevices();
+    if (mics > 0/* && data.enable_mics*/)
+    {
+        mpiData->kernel_globals_mics.resize(mics);
+    }
+
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mpiData->kernel_globals_mics[dev] = mic_alloc_kg(dev);
+    }
+#endif
+}
+
+/* Free the CPU-side kernel globals and, with MIC offload, those of every device; mpiData itself is kept. */
+void mpi_free_kg(mpi_kernel_struct &data)
+{
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_free_kg(-1, mpiData->kernel_globals_cpu);
+#else
+    omp_free_kg(-1, mpiData->kernel_globals_cpu);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_free_kg(dev, mpiData->kernel_globals_mics[dev]);
+    }
+#endif
+    mpiData->kernel_globals_mics.clear();
+}
+
+/* Allocate a zero-filled local buffer of data.mpi_mem_data.memSize bytes and register it in
+ * ptr_map under data.mpi_mem_data.mem; with MIC offload, mic_mem_alloc() is called per device. */
+void mpi_mem_alloc(mpi_kernel_struct &data)
+{
+    mpiData->ptr_map[data.mpi_mem_data.mem] = (DEVICE_PTR) new char[data.mpi_mem_data.memSize];
+    memset((char*) mpiData->ptr_map[data.mpi_mem_data.mem], 0, data.mpi_mem_data.memSize);
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_alloc(dev, (char*) mpiData->ptr_map[data.mpi_mem_data.mem], data.mpi_mem_data.memSize);
+    }
+#endif
+}
+
+/* Receive memSize bytes from rank 0 into the buffer registered for mpi_mem_data.mem (offset is not applied). */
+void mpi_mem_copy_to(mpi_kernel_struct &data)
+{
+    MPI_Bcast((char *) mpiData->ptr_map[data.mpi_mem_data.mem]/* + data.mpi_mem_data.offset*/, data.mpi_mem_data.memSize, MPI_BYTE, 0, MPI_COMM_WORLD);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_copy_to(dev, (char*) mpiData->ptr_map[data.mpi_mem_data.mem], data.mpi_mem_data.memSize, NULL);
+    }
+#endif
+}
+
+/* Zero memSize bytes of the buffer registered for mpi_mem_data.mem (offset is not applied), then per offload device. */
+void mpi_mem_zero(mpi_kernel_struct &data)
+{
+    memset((char *) mpiData->ptr_map[data.mpi_mem_data.mem] /*+ data.mpi_mem_data.offset*/, 0, data.mpi_mem_data.memSize);
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_zero(dev, (char*) mpiData->ptr_map[data.mpi_mem_data.mem], data.mpi_mem_data.memSize);
+    }
+#endif
+}
+
+/* Free the local buffer registered for data.mpi_mem_data.mem and drop its ptr_map entry. */
+void mpi_mem_free(mpi_kernel_struct &data)
+{
+    /* find() rather than operator[]: looking up an unknown handle must not insert an entry. */
+    std::map<DEVICE_PTR, DEVICE_PTR>::iterator it = mpiData->ptr_map.find(data.mpi_mem_data.mem);
+    if (it != mpiData->ptr_map.end() && it->second)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+        {
+            mic_mem_free(dev, (char*) it->second, data.mpi_mem_data.memSize);
+        }
+#endif
+        /* Buffers in ptr_map are created with new char[], so the array form of delete is required. */
+        delete[] (char*) it->second;
+        mpiData->ptr_map.erase(it);
+    }
+}
+
+/* Free the local texture buffer registered for data.mpi_mem_data.mem and drop its ptr_map entry. */
+void mpi_tex_free(mpi_kernel_struct &data)
+{
+    std::map<DEVICE_PTR, DEVICE_PTR>::iterator it = mpiData->ptr_map.find(data.mpi_mem_data.mem); /* find() does not insert */
+    if (it != mpiData->ptr_map.end() && it->second)
+    {
+#ifdef WITH_IT4I_MIC_OFFLOAD
+        for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+        {
+            mic_tex_free(dev, mpiData->kernel_globals_mics[dev], data.mpi_mem_data.name, (char*) it->second, data.mpi_mem_data.memSize);
+        }
+#endif
+        delete[] (char*) it->second; /* buffers in ptr_map are created with new char[], so delete[] is required */
+        mpiData->ptr_map.erase(it);
+    }
+}
+
+//offline rendering - native, cpu, cpu+offload
+//#ifdef WITH_IT4I_MIC_OFFLOAD
+
+void mpi_path_trace_offline(mpi_kernel_struct &data)
+{
+    ///////////////////////////share nodes////////////////////////////////////
+
+    size_t offsetSample = 0;
+    size_t sizeSample = sizeof (int);
+
+    int reqFinished = 0;
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+    }
+#endif
+    int reqJob = -1;
+    size_t sizeJob = sizeof (int);
+
+    int start_sample = data.mpi_path_trace_data.start_sample;
+    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples;
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+#else
+    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+#endif
+
+    int offset = data.mpi_path_trace_data.offset;
+    int stride = data.mpi_path_trace_data.stride;
+
+    int tile_x = data.mpi_path_trace_data.tile_x;
+    int tile_w = data.mpi_path_trace_data.tile_w;
+
+    ////////////////////////////one node///////////////////////////////////
+    omp_set_nested(1);
+
+    int tile_step_node = 1; //TILE_STEP;
+    if (getenv("IT4I_OMP_TILE_STEP"))
+    {
+        tile_step_node = atoi(getenv("IT4I_OMP_TILE_STEP"));
+        printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node);
+    }
+
+    int nprocs_mic = 240;
+    int nprocs_cpu = omp_get_max_threads() - 1;
+
+    if (getenv("IT4I_OMP_CPU_NUM_THREADS"))
+    {
+        nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")) - 1;
+    }
+
+    if (getenv("IT4I_OMP_MIC_NUM_THREADS"))
+    {
+        nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS"));
+    }
+
+    int dev_node = data.world_rank - 1;
+    int devices_size_node = data.world_size - 1;
+
+    int tile_h_node = tile_step_node;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    int tile_h_cpu = tile_step_node / 2.0;
+    if (tile_h_cpu < 1)
+        tile_h_cpu = 1;
+    int tile_h_mic = tile_step_node * 1.0 / 4.0;
+#else
+    int tile_h_cpu = tile_step_node;
+    int tile_h_mic = 0;
+#endif    
+    int omp_path_trace_req = 0;
+
+    int size_node = tile_h_node * tile_w;
+    int size_cpu = tile_h_cpu * tile_w;
+    int size_mic = tile_h_mic * tile_w;
+
+    //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+    size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float);
+    size_t sizeBuf_mic = size_mic * pass_stride * sizeof (float);
+
+    //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+    size_t sizeByte_cpu = size_cpu * SIZEOF_UCHAR4;
+    size_t sizeByte_mic = size_mic * SIZEOF_UCHAR4;
+
+    //int sample_finished_node = 0;
+    ////////////////////////////MICS//////////////////////////////////////
+    int signal1, signal2, signal3, signal4;
+
+    const int num_devices_cpu_mics = mpiData->kernel_globals_mics.size() + 1;
+    //const int num_devices_mics = mpiData->kernel_globals_mics.size();
+
+    std::vector<int> sample_finished_devices(num_devices_cpu_mics);
+
+    reqJob = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+    std::vector<int> tile_y_devices(num_devices_cpu_mics);
+
+    for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+    {
+        sample_finished_devices[dev] = end_sample;
+        tile_y_devices[dev] = 0;
+
+#ifdef WITH_IT4I_MIC_OFFLOAD        
+        if (dev > 0)
+        {
+            mic_mem_alloc(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int));
+        }
+#endif       
+    }
+
+    //tile_y_devices[0] = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+
+    //int signal1, signal2, signal3, signal4;
+
+    //////////////////////////////////////////////////////////////////
+
+#pragma omp parallel num_threads(2)
+    {
+#pragma omp single nowait
+        {
+#pragma omp task
+            {
+                while (reqFinished == 0)
+                {
+#pragma omp flush
+                    if (omp_path_trace_req != 0)
+                    {
+                        //mic_path_trace(0, mpiData->kernel_globals_mics[0], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices + tile_h_cpu, offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[1], (char*) &reqFinished, nprocs_mic, signal1);
+                        //mic_path_trace(1, mpiData->kernel_globals_mics[1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices + tile_h_cpu + tile_h_mic, offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[2], (char*) &reqFinished, nprocs_mic, signal2);
+                        printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev_node, sample_finished_devices[0], end_sample, tile_y_devices[0], tile_h_cpu);
+                        fflush(0);
+#ifdef WITH_IT4I_MIC_NATIVE
+                        mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_mic, NULL);
+#else
+                        omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[0], offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_devices[0], (char*) &reqFinished, nprocs_cpu, NULL);
+#endif
+                        //mic_wait(0, signal1);
+                        //mic_wait(1, signal2);
+
+                        omp_path_trace_req = 0;
+                    }
+                    usleep(100);
+                }
+            }
+
+#pragma omp task
+            {
+                while (true)
+                {
+                    int min_count = end_sample;
+
+                    for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+                    {
+                        if (reqJob >= 0)
+                        {
+                            sample_finished_devices[dev] = start_sample;
+
+                            if (dev == 0)
+                            {
+                                tile_y_devices[dev] = reqJob;
+                                omp_path_trace_req = 1;
+                            }
+                            else
+                            {
+#ifdef WITH_IT4I_MIC_OFFLOAD                                
+                                if (dev == 1)
+                                {
+                                    tile_y_devices[dev] = reqJob + tile_h_cpu;
+
+                                    //printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h_mic);
+                                    //fflush(0);
+                                    mic_path_trace(0, mpiData->kernel_globals_mics[dev - 1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal1);
+                                }
+                                if (dev == 2)
+                                {
+                                    tile_y_devices[dev] = reqJob + tile_h_cpu + tile_h_mic;
+
+                                    //printf("dev %d, sample_finished_devices %d, end_sample %d, tile_y_devices %d, tile_h %d\n", dev, sample_finished_devices[dev], end_sample, tile_y_devices[dev], tile_h_mic);
+                                    //fflush(0);
+                                    mic_path_trace(1, mpiData->kernel_globals_mics[dev - 1], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices[dev], offset, stride, tile_h_mic, tile_w, (char*) &sample_finished_devices[dev], (char*) &reqFinished, nprocs_mic, signal2);
+                                }
+#endif                                
+                            }
+                        }
+#ifdef WITH_IT4I_MIC_OFFLOAD
+                        if (dev > 0)
+                        {
+                            if (tile_y_devices[dev] != 0)
+                            {
+                                if (data.mpi_path_trace_data.rgba_pixels != NULL)
+                                {
+                                    size_t offsetByte_mic = (offset + tile_x + (tile_y_devices[dev]) * stride) * SIZEOF_UCHAR4;
+                                    mic_mem_copy_from(dev - 1, (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels], offsetByte_mic, sizeByte_mic, NULL); //(char*) &data.mpi_path_trace_data.rgba_pixels);
+                                }
+                                else
+                                {
+                                    size_t offsetBuf_mic = (offset + tile_x + (tile_y_devices[dev]) * stride) * pass_stride * sizeof (float);
+                                    mic_mem_copy_from(dev - 1, (char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer], offsetBuf_mic, sizeBuf_mic, NULL); // (char*) &data.mpi_path_trace_data.buffer);
+                                }
+                            }
+                        }
+#endif                        
+#pragma omp flush
+                        if (min_count > sample_finished_devices[dev])
+                            min_count = sample_finished_devices[dev];
+                    }
+
+                    int req = (min_count == end_sample) ? 0 : 1;
+
+#pragma omp flush                    
+                    MPI_Gatherv(&req, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+                    MPI_Gatherv(&tile_y_devices[0], sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+                    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+                    {
+                        size_t offsetByte_node = (offset + tile_x + tile_y_devices[0] * stride) * SIZEOF_UCHAR4;
+                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+                    }
+                    else
+                    {
+                        size_t offsetBuf_node = (offset + tile_x + tile_y_devices[0] * stride) * pass_stride * sizeof (float);
+                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+                    }
+
+                    MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD);
+                    MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+                    if (reqFinished != 0)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+
+#pragma omp taskwait
+    }
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < num_devices_cpu_mics; dev++)
+    {
+        if (dev > 0)
+        {
+            //mic_wait(dev, (char*)&reqFinished);
+            //mic_wait(dev - 1, (char*) &data.mpi_path_trace_data.rng_state);
+            if (dev == 1)
+                mic_wait(dev - 1, signal1);
+
+            if (dev == 2)
+                mic_wait(dev - 1, signal2);
+
+            mic_mem_free(dev - 1, (char*) &sample_finished_devices[dev], sizeof (int));
+            mic_mem_free(dev - 1, (char*) &reqFinished, sizeof (int));
+        }
+    }
+#endif
+
+    MPI_Gatherv(&tile_y_devices[0], sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+    {
+        size_t offsetByte_node = (offset + tile_x + tile_y_devices[0] * stride) * SIZEOF_UCHAR4;
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else
+    {
+        size_t offsetBuf_node = (offset + tile_x + tile_y_devices[0] * stride) * pass_stride * sizeof (float);
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+}
+//#endif
+
+//void mpi_path_trace_offline(mpi_kernel_struct &data)
+//{
+//    //printf("CLIENT: mpi_path_trace_offline\n");
+//    ///////////////////////////share nodes////////////////////////////////////
+//
+//    size_t offsetSample = 0;
+//    size_t sizeSample = sizeof (int);
+//
+//    int reqFinished = 0;
+//    int reqJob = -1;
+//    size_t sizeJob = sizeof (int);
+//
+//    int start_sample = data.mpi_path_trace_data.start_sample;
+//    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples;
+//
+//#ifdef WITH_IT4I_MIC_NATIVE
+//    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+//#else
+//    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+//#endif
+//
+//    int offset = data.mpi_path_trace_data.offset;
+//    int stride = data.mpi_path_trace_data.stride;
+//
+//    int tile_x = data.mpi_path_trace_data.tile_x;
+//    int tile_w = data.mpi_path_trace_data.tile_w;
+//
+//    ////////////////////////////one node///////////////////////////////////
+//    omp_set_nested(1);
+//
+//    int nprocs_mic = 240;
+//    int nprocs_cpu = omp_get_max_threads() - 1;
+//
+//    if (getenv("IT4I_OMP_CPU_NUM_THREADS"))
+//    {
+//        nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS")) - 1;
+//    }
+//
+//    if (getenv("IT4I_OMP_MIC_NUM_THREADS"))
+//    {
+//        nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS"));
+//    }
+//
+//    int dev_node = data.world_rank - 1;
+//    int devices_size_node = data.world_size - 1;
+//
+//    int tile_step_node = 1; //TILE_STEP;
+//    if (getenv("IT4I_OMP_TILE_STEP"))
+//    {
+//        tile_step_node = atoi(getenv("IT4I_OMP_TILE_STEP"));
+//        printf("IT4I_OMP_TILE_STEP: %d\n", tile_step_node);
+//    }
+//
+//    int tile_h_node = tile_step_node;
+//    int omp_path_trace_req = 1;
+//
+//    int size_node = tile_h_node * tile_w;
+//
+//    //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+//    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+//
+//    //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+//    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+//
+//    //int sample_finished_node = 0;
+//    ////////////////////////////MICS//////////////////////////////////////
+//
+//    //const int num_devices_cpu_mics = 1;
+//    //const int num_devices_mics = mpiData->kernel_globals_mics.size();
+//
+//    int sample_finished_devices = 0;
+//    int tile_y_devices = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+//
+//    //////////////////////////////////////////////////////////////////
+//
+//#pragma omp parallel shared(omp_path_trace_req) num_threads(2)
+//    {
+//#pragma omp single nowait
+//        {
+//#pragma omp task shared(omp_path_trace_req)
+//            {
+//                while (reqFinished == 0)
+//                {
+//#pragma omp flush
+//                    if (omp_path_trace_req != 0)
+//                    {
+//                        //printf("CLIENT: omp_path_trace: %d, %d, %f\n", 0, tile_y_devices, omp_get_wtime());
+//
+//#ifdef WITH_IT4I_MIC_NATIVE
+//                        mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices, (char*) &reqFinished, nprocs_cpu, NULL);
+//#else
+//                        omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_devices, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_devices, (char*) &reqFinished, nprocs_cpu, NULL);
+//#endif
+//                        //usleep(100);
+//                        omp_path_trace_req = 0;
+//                    }
+//                    usleep(100);
+//                }
+//            }
+//
+//#pragma omp task shared(omp_path_trace_req)
+//            {
+//                while (true)
+//                {
+//#pragma omp flush
+//                    MPI_Gatherv(&omp_path_trace_req, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//                    //printf("CLIENT: tile_y_devices: %d\n", tile_y_devices);
+//                    MPI_Gatherv(&tile_y_devices, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//                    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+//                    {
+//                        size_t offsetByte_node = (offset + tile_x + tile_y_devices * stride) * SIZEOF_UCHAR4;
+//                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//                    }
+//                    else
+//                    {
+//                        size_t offsetBuf_node = (offset + tile_x + tile_y_devices * stride) * pass_stride * sizeof (float);
+//                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//                    }
+//
+//                    MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//                    if (reqJob >= 0)
+//                    {
+//                        sample_finished_devices = start_sample;
+//                        tile_y_devices = reqJob;
+//                        omp_path_trace_req = 1;
+//                    }
+//
+//
+//                    MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD);
+//                    if (reqFinished != 0)
+//                    {
+//                        //printf("CLIENT: finished %f\n", omp_get_wtime());
+//                        //fflush(0);
+//                        break;
+//                    }
+//
+//
+//
+//                }
+//            }
+//        }
+//
+//#pragma omp taskwait
+//    }
+//
+//    MPI_Gatherv(&tile_y_devices, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+//    {
+//        size_t offsetByte_node = (offset + tile_x + tile_y_devices * stride) * SIZEOF_UCHAR4;
+//        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//    }
+//    else
+//    {
+//        size_t offsetBuf_node = (offset + tile_x + tile_y_devices * stride) * pass_stride * sizeof (float);
+//        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//    }
+//}
+
+//#endif
+
+
+//void mpi_path_trace_progressive(mpi_kernel_struct &data)
+//{
+//    printf("CLIENT: mpi_path_trace_progressive\n");
+//    ///////////////////////////share nodes////////////////////////////////////
+//
+//    size_t offsetSample = 0;
+//    size_t sizeSample = sizeof (int);
+//
+//    int reqFinished = -1;
+//    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples;
+//
+//#ifdef WITH_IT4I_MIC_NATIVE
+//    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+//#else
+//    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+//#endif
+//
+//    int offset = data.mpi_path_trace_data.offset;
+//    int stride = data.mpi_path_trace_data.stride;
+//
+//    int tile_x = data.mpi_path_trace_data.tile_x;
+//    int tile_w = data.mpi_path_trace_data.tile_w;
+//
+//    ////////////////////////////one node///////////////////////////////////
+//    omp_set_nested(1);
+//    int nprocs_cpu = omp_get_max_threads() - 1;
+//    //printf("nprocs_cpu: %d\n", nprocs_cpu);
+//
+//    int dev_node = data.world_rank - 1;
+//    int devices_size_node = data.world_size - 1;
+//
+//    int tile_step_node = data.mpi_path_trace_data.tile_h / devices_size_node;
+//    int tile_last_node = data.mpi_path_trace_data.tile_h - (devices_size_node - 1) * tile_step_node;
+//
+//    int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+//    int tile_h_node = (devices_size_node - 1 == dev_node) ? tile_last_node : tile_step_node;
+//
+//    int size_node = tile_h_node * tile_w;
+//
+//    size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+//    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+//
+//    size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+//    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+//
+//    int sample_finished_node = 0;
+//    //////////////////////////////////////////////////////////////////
+//
+//#ifdef WITH_IT4I_MIC_NATIVE
+//    mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, omp_get_max_threads(), NULL);
+//#else
+//    omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, omp_get_max_threads(), NULL);
+//#endif
+//
+//    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+//    {
+//        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//    }
+//    else
+//    {
+//        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//    }
+//}
+//
+//void mpi_path_trace(mpi_kernel_struct &data)
+//{
+//    printf("CLIENT: mpi_path_trace\n");
+//    ///////////////////////////share nodes////////////////////////////////////
+//
+//    size_t offsetSample = 0;
+//    size_t sizeSample = sizeof (int);
+//
+//    int reqFinished = 0;
+//    int reqJob = -1;
+//    size_t sizeJob = sizeof (int);
+//
+//    int start_sample = data.mpi_path_trace_data.start_sample;
+//    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples;
+//
+//#ifdef WITH_IT4I_MIC_NATIVE
+//    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+//#else
+//    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+//#endif
+//
+//    int offset = data.mpi_path_trace_data.offset;
+//    int stride = data.mpi_path_trace_data.stride;
+//
+//    int tile_x = data.mpi_path_trace_data.tile_x;
+//    int tile_w = data.mpi_path_trace_data.tile_w;
+//
+//    ////////////////////////////one node///////////////////////////////////
+//    omp_set_nested(1);
+//    int nprocs_cpu = omp_get_max_threads() - 1;
+//
+//    int dev_node = data.world_rank - 1;
+//    int devices_size_node = data.world_size - 1;
+//
+//    int tile_step_node = 1;
+//
+//    int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+//    int tile_h_node = tile_step_node;
+//    int mic_path_trace_req = 1;
+//
+//    int size_node = tile_h_node * tile_w;
+//
+//    //size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+//    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+//
+//    //size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * sizeof (uchar4);
+//    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+//
+//    int sample_finished_node = 0;
+//    //////////////////////////////////////////////////////////////////
+//
+//#pragma omp parallel num_threads(2)
+//    {
+//#pragma omp single nowait
+//        {
+//#pragma omp task
+//            {
+//                while (reqFinished == 0)
+//                {
+//#pragma omp flush
+//                    if (mic_path_trace_req != 0)
+//                    {
+//                        //printf("mic_path_trace - tile_y_node: %d\n", tile_y_node);
+//#ifdef WITH_IT4I_MIC_NATIVE
+//                        mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, nprocs_cpu, NULL);
+//#else
+//                        omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_node, offset, stride, tile_h_node, tile_w, (char*) &sample_finished_node, (char*) &reqFinished, nprocs_cpu, NULL);
+//#endif
+//
+//                        mic_path_trace_req = 0;
+//                    }
+//                    //usleep(1000);
+//                    //#pragma omp wait
+//                }
+//            }
+//
+//#pragma omp task
+//            {
+//                while (true)
+//                {
+//                    //MPI_Bcast(&reqFinished, 1, MPI_INT, 0, MPI_COMM_WORLD);
+//                    //MPI_Bcast(&reqJob, 1, MPI_INT, 0, MPI_COMM_WORLD);
+//
+//                    //printf("mic_path_trace - reqJob: %d\n", reqJob);
+//
+//                    MPI_Scatterv(NULL, 0, NULL, MPI_BYTE, &reqJob, sizeJob, MPI_BYTE, 0, MPI_COMM_WORLD);
+//                    //MPI_Gatherv(&sample_finished_node, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//                    if (reqJob >= 0)
+//                    {
+//                        sample_finished_node = start_sample;
+//                    }
+//                    else if (reqJob == -2)
+//                    {
+//                        reqFinished = 1;
+//                    }
+//
+//#pragma omp flush
+//                    int sample_finished = sample_finished_node;
+//
+//                    MPI_Gatherv(&sample_finished, sizeSample, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//
+//                    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+//                    {
+//                        size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+//                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//                    }
+//                    else
+//                    {
+//                        size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+//                        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+//                    }
+//
+//                    if (reqJob >= 0)
+//                    {
+//                        tile_y_node = reqJob;
+//                        mic_path_trace_req = 1;
+//
+//                    }
+//
+//                    if (reqFinished != 0)
+//                    {
+//
+//                        break;
+//                    }
+//
+//                }
+//            }
+//        }
+//
+//#pragma omp taskwait
+//    }
+//}
+
+// Interactive (progressive) rendering: supports native MIC, CPU-only, and CPU + MIC offload builds.
+
+void mpi_path_trace_progressive(mpi_kernel_struct &data)
+{
+    /////////////////////////////share nodes////////////////////////////////////
+
+    size_t offsetSample = 0;
+    size_t sizeSample = sizeof (int);
+
+    int reqFinished = 0;
+
+    //#ifdef WITH_IT4I_MIC_OFFLOAD
+    //    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    //    {
+    //        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+    //    }
+    //#endif
+    
+    int num_samples_scale = 1;
+    if (getenv("IT4I_OMP_NUM_SAMPLES_SCALE"))
+    {
+        num_samples_scale = atoi(getenv("IT4I_OMP_NUM_SAMPLES_SCALE"));
+    }    
+
+    int end_sample = data.mpi_path_trace_data.start_sample + data.mpi_path_trace_data.num_samples * num_samples_scale;
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    int pass_stride = mic_get_pass_stride(mpiData->kernel_globals_cpu);
+#else
+    int pass_stride = omp_get_pass_stride(mpiData->kernel_globals_cpu);
+#endif
+
+    int offset = data.mpi_path_trace_data.offset;
+    int stride = data.mpi_path_trace_data.stride;
+
+    int tile_x = data.mpi_path_trace_data.tile_x;
+    int tile_w = data.mpi_path_trace_data.tile_w;
+
+    ////////////////////////////one node///////////////////////////////////
+    //omp_set_nested(1);
+    int nprocs_mic = 240;
+    int nprocs_cpu = omp_get_max_threads();
+
+    if (getenv("IT4I_OMP_CPU_NUM_THREADS"))
+    {
+        nprocs_cpu = atoi(getenv("IT4I_OMP_CPU_NUM_THREADS"));
+    }
+
+    if (getenv("IT4I_OMP_MIC_NUM_THREADS"))
+    {
+        nprocs_mic = atoi(getenv("IT4I_OMP_MIC_NUM_THREADS"));
+    }
+
+    int dev_node = data.world_rank - 1;
+    int devices_size_node = data.world_size - 1;
+
+    int tile_step_node = data.mpi_path_trace_data.tile_h / devices_size_node;
+    int tile_last_node = data.mpi_path_trace_data.tile_h - (devices_size_node - 1) * tile_step_node;
+
+    int tile_y_node = data.mpi_path_trace_data.tile_y + tile_step_node * dev_node;
+    int tile_h_node = (devices_size_node - 1 == dev_node) ? tile_last_node : tile_step_node;
+
+    int size_node = tile_h_node * tile_w;
+
+    size_t offsetBuf_node = (offset + tile_x + tile_y_node * stride) * pass_stride * sizeof (float);
+    size_t sizeBuf_node = size_node * pass_stride * sizeof (float);
+
+    size_t offsetByte_node = (offset + tile_x + tile_y_node * stride) * SIZEOF_UCHAR4;
+    size_t sizeByte_node = size_node * SIZEOF_UCHAR4;
+
+    int devices_size_cpu_mics = mpiData->kernel_globals_mics.size() + 2;
+
+    int tile_step_cpu_mics = tile_h_node / devices_size_cpu_mics;
+    //int tile_last_cpu_mics = tile_h_node - (devices_size_cpu_mics - 1) * tile_step_cpu_mics;
+
+    int dev_cpu_mics = 0;
+
+    //////////////////////////mic0////////////////////////////////////
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    int signal1, signal2, signal3, signal4;
+
+    std::vector<int> sample_finished_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> tile_y_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> tile_h_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<int> size_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> offsetBuf_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> sizeBuf_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> offsetByte_mic0(mpiData->kernel_globals_mics.size());
+    std::vector<size_t> sizeByte_mic0(mpiData->kernel_globals_mics.size());
+
+    //sync
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        sample_finished_mic0[dev] = 0;
+
+        mic_mem_alloc(dev, (char*) &reqFinished, sizeof (int));
+        mic_mem_alloc(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+    }
+
+    //async
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        dev_cpu_mics = dev;
+
+        //sample_finished_mic0[dev] = 0;
+        //mic_mem_alloc(dev, (char*)&sample_finished_mic0[dev], sizeof(int));
+
+        tile_y_mic0[dev] = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+        tile_h_mic0[dev] = tile_step_cpu_mics;
+
+        size_mic0[dev] = tile_h_mic0[dev] * tile_w;
+
+        offsetBuf_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * pass_stride * sizeof (float);
+        sizeBuf_mic0[dev] = size_mic0[dev] * pass_stride * sizeof (float);
+
+        offsetByte_mic0[dev] = (offset + tile_x + tile_y_mic0[dev] * stride) * SIZEOF_UCHAR4;
+        sizeByte_mic0[dev] = size_mic0[dev] * SIZEOF_UCHAR4;
+
+        //mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) buffer, (char *) rng_state, (char*) rgba_pixels, tile.half_float, start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, (char *) rng_state);
+        //mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/ (char*) &sample_finished_mic0[dev]);
+
+        if (dev == 0)
+            mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/signal1);
+        if (dev == 1)
+            mic_path_trace(dev, mpiData->kernel_globals_mics[dev], (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_mic0[dev], offset, stride, tile_h_mic0[dev], tile_w, (char*) &sample_finished_mic0[dev], (char*) &reqFinished, nprocs_mic, /*(char*) &data.mpi_path_trace_data.rng_state*/signal2);
+    }
+#endif
+    //////////////////////////cpu/////////////////////////////////////
+
+    dev_cpu_mics = mpiData->kernel_globals_mics.size();
+
+    int sample_finished_cpu = 0;
+
+    int tile_y_cpu = tile_y_node + tile_step_cpu_mics * dev_cpu_mics;
+    int tile_h_cpu = tile_h_node - (devices_size_cpu_mics - 2) * tile_step_cpu_mics;
+
+    int size_cpu = tile_h_cpu * tile_w;
+
+    size_t offsetBuf_cpu = (offset + tile_x + tile_y_cpu * stride) * pass_stride * sizeof (float);
+    size_t sizeBuf_cpu = size_cpu * pass_stride * sizeof (float);
+
+    size_t offsetByte_cpu = (offset + tile_x + tile_y_cpu * stride) * SIZEOF_UCHAR4;
+    size_t sizeByte_cpu = size_cpu * SIZEOF_UCHAR4;
+    //////////////////////////////////////////////////////////////////
+
+#ifdef WITH_IT4I_MIC_NATIVE
+    mic_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_mic, NULL);
+#else
+    omp_path_trace(-1, mpiData->kernel_globals_cpu, (char *) mpiData->ptr_map[data.mpi_path_trace_data.buffer], (char *) mpiData->ptr_map[data.mpi_path_trace_data.rng_state], (data.mpi_path_trace_data.rgba_pixels != NULL) ? (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] : NULL, data.mpi_path_trace_data.half_float, data.mpi_path_trace_data.start_sample, end_sample, tile_x, tile_y_cpu, offset, stride, tile_h_cpu, tile_w, (char*) &sample_finished_cpu, (char*) &reqFinished, nprocs_cpu, NULL);
+#endif
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        //mic_wait(dev, (char*) &data.mpi_path_trace_data.rng_state);
+        //mic_wait(dev, (char*) &sample_finished_mic0[dev]);
+        
+        if (dev == 0)
+            mic_wait(dev, signal1);
+
+        if (dev == 1)
+            mic_wait(dev, signal2);        
+
+        if (data.mpi_path_trace_data.rgba_pixels != NULL)
+        {
+            mic_mem_copy_from(dev, (char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels], offsetByte_mic0[dev], sizeByte_mic0[dev], NULL);
+        }
+        else
+        {
+            mic_mem_copy_from(dev, (char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer], offsetBuf_mic0[dev], sizeBuf_mic0[dev], NULL);
+        }
+    }
+#endif
+
+    if (data.mpi_path_trace_data.rgba_pixels != NULL)
+    {
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.rgba_pixels] + offsetByte_node, sizeByte_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else
+    {
+        MPI_Gatherv((char*) mpiData->ptr_map[data.mpi_path_trace_data.buffer] + offsetBuf_node, sizeBuf_node, MPI_BYTE, NULL, 0, NULL, MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+
+
+#ifdef WITH_IT4I_MIC_OFFLOAD
+    for (int dev = 0; dev < mpiData->kernel_globals_mics.size(); dev++)
+    {
+        mic_mem_free(dev, (char*) &sample_finished_mic0[dev], sizeof (int));
+        mic_mem_free(dev, (char*) &reqFinished, sizeof (int));
+    }
+#endif
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/* Executes one request: the handler is chosen by data.mpi_tag and receives
+ * `data` itself. A tag that matches none of the cases below is ignored.
+ */
+void mpi_render(mpi_kernel_struct &data)
+{
+    switch (data.mpi_tag)
+    {
+        case MPI_TAG_mpi_const_copy:
+            mpi_const_copy(data);
+            break;
+
+        case MPI_TAG_mpi_tex_copy:
+            mpi_tex_copy(data);
+            break;
+
+        case MPI_TAG_mpi_path_trace:
+            // Non-progressive requests always take mpi_path_trace_offline();
+            // mpi_path_trace_offline_offload() is currently not called here.
+            if (data.mpi_path_trace_data.progressive)
+                mpi_path_trace_progressive(data);
+            else
+                mpi_path_trace_offline(data);
+            break;
+
+        case MPI_TAG_mpi_alloc_kg:
+            mpi_alloc_kg(data);
+            break;
+
+        case MPI_TAG_mpi_free_kg:
+            mpi_free_kg(data);
+            break;
+
+        case MPI_TAG_mpi_mem_alloc:
+            mpi_mem_alloc(data);
+            break;
+
+        case MPI_TAG_mpi_mem_copy_to:
+            mpi_mem_copy_to(data);
+            break;
+
+        case MPI_TAG_mpi_mem_zero:
+            mpi_mem_zero(data);
+            break;
+
+        case MPI_TAG_mpi_mem_free:
+            mpi_mem_free(data);
+            break;
+
+        case MPI_TAG_mpi_tex_free:
+            mpi_tex_free(data);
+            break;
+
+        default:
+            break;
+    }
+}
+CCL_NAMESPACE_END
diff --git a/it4i/client/cycles_mpi/cycles_mpi.h b/it4i/client/cycles_mpi/cycles_mpi.h
new file mode 100644
index 0000000000000000000000000000000000000000..31707c15833578ce57851744236c4e66ad45c66f
--- /dev/null
+++ b/it4i/client/cycles_mpi/cycles_mpi.h
@@ -0,0 +1,48 @@
+#ifndef __CYCLES_MPI_H__
+#define __CYCLES_MPI_H__
+
+/* Entry points of the MPI render client. Every function takes the
+ * mpi_kernel_struct it operates on by reference. */
+
+/* NOTE(review): mpi_kernel_struct and the CCL_NAMESPACE_* macros are not
+ * defined in this header and presumably come from client_api.h -- confirm.
+ * Disabled alternatives were: empty CCL_NAMESPACE_* definitions and includes
+ * of kernel_compat_mic.h, kernel_mpi.h and util_types.h. */
+
+#include "client_api.h"
+
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing -- NOTE(review): mpi_render() calls mpi_path_trace_progressive() / mpi_path_trace_offline(); confirm mpi_path_trace() is still defined. */
+void mpi_path_trace(mpi_kernel_struct &data);
+/* Disabled declarations, kept for reference only:
+ *
+ *   void mpi_branched_path_trace(mpi_kernel_struct &data);
+ *   void mpi_film_convert_half();   (film)
+ *   void mpi_film_convert_byte();   (film)
+ *   void mpi_bake();                (shader evaluation)
+ *   void mpi_shader();              (shader evaluation)
+ *
+ */
+
+/* Device memory */
+void mpi_alloc_kg(mpi_kernel_struct &data);
+void mpi_free_kg(mpi_kernel_struct &data);
+
+void mpi_mem_alloc(mpi_kernel_struct &data);
+void mpi_mem_copy_to(mpi_kernel_struct &data);
+/* mpi_mem_copy_from(mpi_kernel_struct &data) is disabled (not declared). */
+void mpi_mem_zero(mpi_kernel_struct &data);
+void mpi_mem_free(mpi_kernel_struct &data);
+void mpi_tex_free(mpi_kernel_struct &data);
+
+void mpi_const_copy(mpi_kernel_struct &data);
+void mpi_tex_copy(mpi_kernel_struct &data);
+/* Dispatches `data` to one of the handlers according to data.mpi_tag. */
+void mpi_render(mpi_kernel_struct &data);
+CCL_NAMESPACE_END
+
+
+#endif /* __CYCLES_MPI_H__ */
+
diff --git a/it4i/client/cycles_omp/CMakeLists.txt b/it4i/client/cycles_omp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..604a6b00502abc0e710494bcf44535b0bd36f194
--- /dev/null
+++ b/it4i/client/cycles_omp/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Builds cycles_omp${MIC_FLAG}: a shared library containing the Cycles OpenMP kernel.
+set(SRC
+	../../../intern/cycles/kernel/kernels/omp/kernel_omp.cpp
+)
+
+# -xCORE-AVX2 is an Intel-compiler code-generation flag; it is applied to the kernel source only.
+set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS "-xCORE-AVX2")
+add_library(cycles_omp${MIC_FLAG} SHARED ${SRC})
+
+# Target-scoped settings, so they cannot leak into other targets of this directory.
+target_compile_definitions(cycles_omp${MIC_FLAG} PRIVATE WITH_OPENMP)
+target_include_directories(cycles_omp${MIC_FLAG} PRIVATE
+	.
+	../../../intern/cycles/util
+	../../../intern/cycles/kernel
+	../../../intern/cycles/kernel/kernels/mic
+	../../../intern/cycles/kernel/kernels/mpi
+	../../../intern/cycles/kernel/kernels/omp
+	../api
+	${MPI_INCLUDE_DIR}
+)
+
+install(TARGETS cycles_omp${MIC_FLAG} DESTINATION lib)
diff --git a/it4i/client/main/CMakeLists.txt b/it4i/client/main/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d56ef917f1bcb04d50422d3c5617fe0cd1afb9f6
--- /dev/null
+++ b/it4i/client/main/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Builds the blender_client${MIC_FLAG} executable (the MPI render client).
+set(SRC
+	main.cpp
+)
+
+set(SRC_HEADERS
+	main.h
+)
+
+add_executable(blender_client${MIC_FLAG} ${SRC} ${SRC_HEADERS})
+
+target_include_directories(blender_client${MIC_FLAG} PRIVATE
+	.
+	../../../intern/cycles/util
+	../../../intern/cycles/kernel
+	../../../intern/cycles/kernel/kernels/mpi
+	../api
+	../cycles_mpi
+	${MPI_INCLUDE_DIR}
+)
+
+# Linking against a library target already orders the build, so no separate
+# add_dependencies() call is needed for any of the libraries below.
+target_link_libraries(blender_client${MIC_FLAG} PRIVATE
+	cycles_mpi${MIC_FLAG}
+	${MPI_LIB_FILE}
+)
+
+# Kernel library: cycles_mic for native MIC builds, cycles_omp otherwise.
+if(WITH_IT4I_MIC_NATIVE)
+	target_link_libraries(blender_client${MIC_FLAG} PRIVATE cycles_mic${MIC_FLAG})
+else()
+	target_link_libraries(blender_client${MIC_FLAG} PRIVATE cycles_omp${MIC_FLAG})
+endif()
+
+# Offload builds additionally link the cycles_mic library.
+if(WITH_IT4I_MIC_OFFLOAD)
+	target_link_libraries(blender_client${MIC_FLAG} PRIVATE cycles_mic${MIC_FLAG})
+endif()
+
+install(TARGETS blender_client${MIC_FLAG} DESTINATION bin)
diff --git a/it4i/client/main/main.cpp b/it4i/client/main/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..161407cda9b1f79b34a9fef963ea5b5f876849ad
--- /dev/null
+++ b/it4i/client/main/main.cpp
@@ -0,0 +1,54 @@
+#include "main.h"
+#include "cycles_mpi.h"
+
+#include <mpi.h>
+
+int main(int argc, char** argv)
+{
+    // Request MPI_THREAD_FUNNELED (threads allowed, MPI calls from the main thread only).
+    // The library may grant a lower level, so the result is reported instead of ignored.
+    int provided = MPI_THREAD_SINGLE;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+    if (provided < MPI_THREAD_FUNNELED)
+    {
+        fprintf(stderr, "Warning: MPI thread support level %d is below MPI_THREAD_FUNNELED (%d)\n",
+                provided, MPI_THREAD_FUNNELED);
+    }
+
+    // Number of processes in MPI_COMM_WORLD and the rank of this process
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+    int world_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+    // Processor name as reported by MPI; only used in the log messages
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+
+    printf("Start from processor %s, rank %d"
+            " out of %d processors\n",
+            processor_name, world_rank, world_size);
+
+    // Request loop: each iteration receives one mpi_kernel_struct broadcast from rank 0.
+    while (true)
+    {
+        mpi_kernel_struct data;
+        MPI_Bcast(&data, sizeof (mpi_kernel_struct), MPI_BYTE, 0, MPI_COMM_WORLD);
+        data.world_rank = world_rank;
+        data.world_size = world_size;
+
+        // Tags outside [MPI_TAG_mpi_cycles_start, MPI_TAG_mpi_cycles_end] are ignored.
+        if (MPI_TAG_mpi_cycles_start <= data.mpi_tag && data.mpi_tag <= MPI_TAG_mpi_cycles_end)
+            mpi_render(data);
+    }
+
+    // NOTE(review): unreachable -- the loop above has no exit path, so this
+    // process never reaches MPI_Finalize() and only ends when it is terminated.
+    printf("End from processor %s, rank %d"
+            " out of %d processors\n",
+            processor_name, world_rank, world_size);
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/it4i/client/main/main.h b/it4i/client/main/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..8511f08b5caec0c6852025cc50b5787b2e3a9546
--- /dev/null
+++ b/it4i/client/main/main.h
@@ -0,0 +1,5 @@
+#ifndef MAIN_H
+#define MAIN_H
+
+#endif /* MAIN_H */
+
diff --git a/it4i/scripts/build_blender.sh b/it4i/scripts/build_blender.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ee034a816f485055f3dd83e5e03776e8b2b491df
--- /dev/null
+++ b/it4i/scripts/build_blender.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Configures, builds and installs Blender with the IT4I MPI and MIC-offload options.
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+lib_dir=${ROOT_DIR}/install
+output=${ROOT_DIR}/install/blender
+src=${ROOT_DIR}/src
+
+# Prebuilt dependencies referenced by the cmake options below, all under ${lib_dir}:
+#   boost_1_60_0
+#   openexr-2.2.0
+#   tiff-4.0.6
+#   oiio
+#   Python-3.5.2
+#   zlib-1.2.8
+#   libjpeg-turbo-1.4.2, freetype-2.6.3
+
+export CC=mpiicc
+export CXX=mpiicpc
+
+#-----------blender--------------
+mkdir -p "${ROOT_DIR}/build/blender"
+cd "${ROOT_DIR}/build/blender" || exit 1
+
+make_d="${src}/blender"
+make_d="${make_d} -DPYTHON_LIBRARY=${lib_dir}/Python-3.5.2/lib/libpython3.5m.a" 
+make_d="${make_d} -DPYTHON_LIBPATH=${lib_dir}/Python-3.5.2/lib" 
+make_d="${make_d} -DPYTHON_INCLUDE_DIR=${lib_dir}/Python-3.5.2/include/python3.5m" 
+make_d="${make_d} -DPYTHON_INCLUDE_CONFIG_DIR=${lib_dir}/Python-3.5.2/include/python3.5m" 
+make_d="${make_d} -DWITH_OPENIMAGEIO=ON" 
+make_d="${make_d} -DWITH_CYCLES=ON" 
+make_d="${make_d} -DOPENEXR_INCLUDE_DIR=${lib_dir}/openexr-2.2.0/include" 
+make_d="${make_d} -DOPENEXR_ILMIMF_LIBRARIES=${lib_dir}/openexr-2.2.0/lib/libIlmImf.so"
+make_d="${make_d} -DOPENEXR_ILMTHREAD_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIlmThread.so"
+make_d="${make_d} -DOPENEXR_IMATH_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libImath.so"
+make_d="${make_d} -DOPENEXR_ILMIMF_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIlmImf.so"
+make_d="${make_d} -DOPENEXR_HALF_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libHalf.so"
+make_d="${make_d} -DOPENEXR_IEX_LIBRARY=${lib_dir}/openexr-2.2.0/lib/libIex.so"
+make_d="${make_d} -DBOOST_ROOT=${lib_dir}/boost_1_60_0" 
+make_d="${make_d} -DTIFF_LIBRARY=${lib_dir}/tiff-4.0.6/lib/libtiff.so" 
+make_d="${make_d} -DTIFF_INCLUDE_DIR=${lib_dir}/tiff-4.0.6/include" 
+make_d="${make_d} -DOPENIMAGEIO_LIBRARY=${lib_dir}/oiio/lib/libOpenImageIO.so" 
+make_d="${make_d} -DOPENIMAGEIO_INCLUDE_DIR=${lib_dir}/oiio/include" 
+make_d="${make_d} -DWITH_SYSTEM_GLEW=OFF"  
+make_d="${make_d} -DZLIB_INCLUDE_DIR=${lib_dir}/zlib-1.2.8/include"
+make_d="${make_d} -DZLIB_LIBRARY=${lib_dir}/zlib-1.2.8/lib/libz.so"
+make_d="${make_d} -DCMAKE_CXX_FLAGS='-wd47,177,858,1875,2621,1011,780,1292'"  
+make_d="${make_d} -DCMAKE_C_FLAGS='-wd47,177,858,1875,2621,1011,780,1292'"
+make_d="${make_d} -DJPEG_INCLUDE_DIR:PATH=${lib_dir}/libjpeg-turbo-1.4.2/include"
+make_d="${make_d} -DJPEG_LIBRARY:FILEPATH=${lib_dir}/libjpeg-turbo-1.4.2/lib/libjpeg.a"
+make_d="${make_d} -DFREETYPE_INCLUDE_DIR_freetype2=${lib_dir}/freetype-2.6.3/include"
+make_d="${make_d} -DFREETYPE_INCLUDE_DIR_ft2build=${lib_dir}/freetype-2.6.3/include/freetype2"
+make_d="${make_d} -DFREETYPE_LIBRARY=${lib_dir}/freetype-2.6.3/lib/libfreetype.so"
+make_d="${make_d} -DWITH_OPENMP=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=ON"
+make_d="${make_d} -DWITH_IT4I_MPI:BOOL=ON"
+make_d="${make_d} -DWITH_CYCLES_DEVICE_OPENCL=OFF"
+make_d="${make_d} -DWITH_GAMEENGINE=OFF"
+make_d="${make_d} -DWITH_AUDASPACE=OFF"
+make_d="${make_d} -DWITH_OPENAL=OFF"
+make_d="${make_d} -DX11_Xi_LIB=/usr/lib64/libXi.so.6"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Debug"
+
+cmake ${make_d}
+
+make -j24 install
diff --git a/it4i/scripts/build_client.sh b/it4i/scripts/build_client.sh
new file mode 100644
index 0000000000000000000000000000000000000000..03e6b2c5cea6bc1e0ad8707e5d5e3d9fb26c8c3e
--- /dev/null
+++ b/it4i/scripts/build_client.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+lib_dir=${ROOT_DIR}/install
+output=${ROOT_DIR}/install/blender_client_offload
+src=${ROOT_DIR}/src
+
+#boost_1_60_0  
+#ilmbase-2.2.0  
+#openexr-2.2.0  
+#tiff-4.0.6
+#gdcm-2.6.2    
+#oiio           
+#Python-3.4.4   
+#zlib-1.2.8
+
+export CC=mpiicc
+export CXX=mpiicpc
+
+#-----------blender_client--------------
+mkdir ${ROOT_DIR}/build/blender_client_offload
+cd ${ROOT_DIR}/build/blender_client_offload
+
+make_d="${src}/blender/it4i/client"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=OFF"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Release"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+
+cmake ${make_d}
+make -j24 install
diff --git a/it4i/scripts/build_client_mic.sh b/it4i/scripts/build_client_mic.sh
new file mode 100644
index 0000000000000000000000000000000000000000..062860c2bcef9ff18c66fa377fdc3a05828a9da5
--- /dev/null
+++ b/it4i/scripts/build_client_mic.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+lib_dir=${ROOT_DIR}/install
+output=${ROOT_DIR}/install/blender_client_symmetric
+src=${ROOT_DIR}/src
+
+#boost_1_60_0  
+#ilmbase-2.2.0  
+#openexr-2.2.0  
+#tiff-4.0.6
+#gdcm-2.6.2    
+#oiio           
+#Python-3.4.4   
+#zlib-1.2.8
+
+export CC=mpiicc
+export CXX=mpiicpc
+
+#-----------blender_client cpu--------------
+mkdir ${ROOT_DIR}/build/blender_client_symmetric
+cd ${ROOT_DIR}/build/blender_client_symmetric
+
+make_d="${src}/blender/it4i/client"
+make_d="${make_d} -DWITH_IT4I_MPI=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=OFF"
+make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=OFF"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Release"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+
+cmake ${make_d}
+make -j24 install
+
+#-----------blender_client mic--------------
+make_d="${src}/cyclesphi/it4i/client"
+make_d="${make_d} -DWITH_IT4I_MPI=ON"
+make_d="${make_d} -DWITH_IT4I_MIC_OFFLOAD=OFF"
+make_d="${make_d} -DWITH_IT4I_MIC_NATIVE=ON"
+make_d="${make_d} -DCMAKE_BUILD_TYPE=Release"
+make_d="${make_d} -DCMAKE_INSTALL_PREFIX=${output}"
+
+cmake ${make_d}
+make -j24 install
diff --git a/it4i/scripts/build_lib.sh b/it4i/scripts/build_lib.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8cb038ca52f6810dc74b5321d8a567b2445bace4
--- /dev/null
+++ b/it4i/scripts/build_lib.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+path_main=${ROOT_DIR}
+path_lib=${path_main}/lib
+path_src=${path_main}/src
+path_build=${path_main}/build
+path_install=${path_main}/install
+num_cores_flag=-j24
+
+cd ${path_main}
+
+mkdir ${path_src}
+mkdir ${path_build}
+mkdir ${path_install}
+mkdir ${path_lib}
+
+#-----boost
+
+tar -xvf boost_1_60_0.tar.gz  -C ${path_src}
+
+path_boost=${path_install}/boost_1_60_0
+cd ${path_src}/boost_1_60_0
+
+./bootstrap.sh
+./bjam install ${num_cores_flag} -a --prefix=${path_boost}
+
+cd ${path_main}
+
+#--------------ilmbase
+
+tar -xvf ilmbase-2.2.0.tar.gz  -C ${path_src}
+
+path_ilmbase=${path_install}/ilmbase-2.2.0
+cd ${path_src}/ilmbase-2.2.0
+
+./configure --prefix=${path_ilmbase}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+
+
+#-----------openexr-----------------
+tar -xvf openexr-2.2.0.tar.gz   -C ${path_src}
+
+path_openexr=${path_install}/openexr-2.2.0
+cd ${path_src}/openexr-2.2.0
+
+./configure --disable-ilmbasetest --with-ilmbase-prefix=${path_ilmbase} --prefix=${path_openexr}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+
+#-----------tiff-----------------
+tar -xvf tiff-4.0.6.tar.gz   -C ${path_src}
+
+path_tiff=${path_install}/tiff-4.0.6
+cd ${path_src}/tiff-4.0.6
+
+./configure --prefix=${path_tiff}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------openimageio-----------------
+tar -xvf oiio.tar.gz      -C ${path_src}
+
+path_oiio=${path_install}/oiio
+mkdir ${path_build}/oiio 
+cd ${path_build}/oiio
+
+cmake ${path_src}/oiio -DILMBASE_HOME=${path_ilmbase} -DOPENEXR_HOME=${path_openexr} -DBOOST_ROOT=${path_boost} -DTIFF_LIBRARY=${path_tiff}/lib/libtiff.so -DTIFF_INCLUDE_DIR=${path_tiff}/include -DCMAKE_INSTALL_PREFIX=${path_oiio}
+
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------zlib-----------------
+tar -xvf zlib-1.2.8.tar.gz   -C ${path_src}
+
+path_zlib=${path_install}/zlib-1.2.8
+cd ${path_src}/zlib-1.2.8
+
+./configure --prefix=${path_zlib}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------png-----------------
+tar -xvf libpng-1.6.21.tar.gz   -C ${path_src}
+
+path_png=${path_install}/libpng-1.6.21
+cd ${path_src}/libpng-1.6.21
+
+./configure --prefix=${path_png}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------jpeg-----------------
+tar -xvf libjpeg-turbo-1.4.2.tar.gz   -C ${path_src}
+
+path_jpeg=${path_install}/libjpeg-turbo-1.4.2
+cd ${path_src}/libjpeg-turbo-1.4.2
+
+./configure --prefix=${path_jpeg} --without-simd
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------freetype-----------------
+tar -xvf freetype-2.6.3.tar.gz   -C ${path_src}
+
+path_freetype=${path_install}/freetype-2.6.3
+cd ${path_src}/freetype-2.6.3
+
+./configure --prefix=${path_freetype}
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------python----------------- 
+tar -xvf Python-3.5.2.tgz  -C ${path_src}
+
+path_python=${path_install}/Python-3.5.2
+cd ${path_src}/Python-3.5.2
+
+./configure --prefix=${path_python}
+
+make ${num_cores_flag}
+make install
+
+cd ${path_main}
+#-----------blender--------------
+cp -r ${path_ilmbase}/include/* ${path_oiio}/include/.
+cp -r ${path_ilmbase}/include/* ${path_openexr}/include/.
+cp -r ${path_ilmbase}/lib/* ${path_openexr}/lib/.
+
+
+cp -r ${path_boost}/lib/*.so* ${path_lib}/.
+cp -r ${path_ilmbase}/lib/*.so* ${path_lib}/.
+cp -r ${path_openexr}/lib/*.so* ${path_lib}/.
+cp -r ${path_tiff}/lib/*.so* ${path_lib}/.
+cp -r ${path_oiio}/lib/*.so* ${path_lib}/.
+cp -r ${path_zlib}/lib/*.so* ${path_lib}/.
+cp -r ${path_png}/lib/*.so* ${path_lib}/.
+cp -r ${path_jpeg}/lib/*.so* ${path_lib}/.
+cp -r ${path_freetype}/lib/*.so* ${path_lib}/.
+cp -r ${path_gdcm}/lib/*.so* ${path_lib}/.
+cp -r ${path_python}/lib/*.so* ${path_lib}/.
+
+
+
diff --git a/it4i/scripts/run_blender.sh b/it4i/scripts/run_blender.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1d0f490f12a82458d91fcadd07282d22fe530949
--- /dev/null
+++ b/it4i/scripts/run_blender.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Starts the Blender binary installed under ./install/blender with the MIC/OpenMP runtime settings below.
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${ROOT_DIR}/lib:/opt/intel/opencl/lib64
+export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/opt/intel/opencl/lib64
+
+#export LD_LIBRARY_PATH=/apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/compilers_and_libraries_2016.3.210/linux/tbb/lib/mic:$LD_LIBRARY_PATH
+#export LD_PRELOAD=libtbbmalloc_proxy.so.2:libtbbmalloc.so.2:$LD_PRELOAD
+
+#export MIC_LD_LIBRARY_PATH=/apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/compilers_and_libraries_2016.3.210/linux/tbb/lib/mic:$MIC_LD_LIBRARY_PATH
+#export MIC_LD_PRELOAD=libtbbmalloc_proxy.so.2:libtbbmalloc.so.2:$MIC_LD_PRELOAD
+
+export MIC_USE_2MB_BUFFERS=100k
+
+export IT4I_OMP_TILE_STEP=2
+export IT4I_OMP_CPU_NUM_THREADS=24
+export IT4I_OMP_MIC_NUM_THREADS=240
+
+cd "${ROOT_DIR}/install/blender" || exit 1
+./blender
diff --git a/it4i/scripts/run_ddt.sh b/it4i/scripts/run_ddt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2d9b14f4a3928f26b3f81050c233955b419b9822
--- /dev/null
+++ b/it4i/scripts/run_ddt.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+module load Forge/6.0.6
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+#module load DDT/5.0.1
+#module load OpenCL-runtime/15.1
+
+ROOT_DIR=${PWD}
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib
+#/home/milanjaros/intel/opencl-1.2-4.5.0.8/lib64
+export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib
+#export ALLINEA_DISABLE_THREAD_SPARKLINES=1
+
+cd ${ROOT_DIR}/install/blender
+ddt
+
diff --git a/it4i/scripts/run_mpi_offload.sh b/it4i/scripts/run_mpi_offload.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5c6c3107261a1ecdaafb1a3db93d0e937c59e06
--- /dev/null
+++ b/it4i/scripts/run_mpi_offload.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client_offload/lib
+
+JOBID=$(ls "/lscratch/")
+NODEFILECN="/lscratch/$JOBID/nodefile-cn-sn"
+TEMP=$(wc -l < "$NODEFILECN")
+NUMOFCN=128 #$((TEMP-1))
+
+export IT4I_OMP_NUM_SAMPLES_SCALE=8
+export IT4I_OMP_TILE_STEP=4
+export IT4I_OMP_CPU_NUM_THREADS=24
+export IT4I_OMP_MIC_NUM_THREADS=240
+export I_MPI_DEBUG=2
+
+export MIC_USE_2MB_BUFFERS=100k
+
+#mpirun -n 1 ${ROOT_DIR}/install/blender/blender : -n 1 ${ROOT_DIR}/install/blender_client_offload/bin/blender_client
+
+mpirun -n 1 -machinefile $NODEFILECN ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN ${ROOT_DIR}/install/blender_client_offload/bin/blender_client
+
diff --git a/it4i/scripts/run_mpi_symmetric.sh b/it4i/scripts/run_mpi_symmetric.sh
new file mode 100644
index 0000000000000000000000000000000000000000..065718aec3d2356088d31dd97c80e8d4c17dde78
--- /dev/null
+++ b/it4i/scripts/run_mpi_symmetric.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+export MIC_ENV_PREFIX=MIC
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client_symmetric/lib
+export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client_symmetric/lib
+
+export I_MPI_MIC=1
+export I_MPI_FABRICS=shm:dapl
+export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1u,ofa-v2-scif0,ofa-v2-mcm-1
+export I_MPI_MIC_POSTFIX=-mic
+export I_MPI_DEBUG=1
+
+export IT4I_OMP_NUM_SAMPLES_SCALE=1
+export IT4I_OMP_TILE_STEP=4
+export IT4I_OMP_CPU_NUM_THREADS=24
+export IT4I_OMP_MIC_NUM_THREADS=240
+
+export MIC_USE_2MB_BUFFERS=100k
+
+# The node file of the running job lives in /lscratch/<job id>/.
+# NOTE(review): this assumes /lscratch holds exactly one entry -- confirm.
+JOBID=$(ls "/lscratch/")
+NODEFILECN="/lscratch/$JOBID/nodefile-cn-sn"
+#NODEFILEMIC="/lscratch/$JOBID/nodefile-mic-sn"
+# Machine file generated below. The default is the original hard-coded location;
+# set NODEFILECN2 in the environment to write it somewhere else.
+NODEFILECN2="${NODEFILECN2:-/home/milanjaros/nodes.txt}"
+NUMOFCN=64 #number of clients
+
+# Keep the first NUMOFCN+1 lines of the job's node file: NUMOFCN client hosts
+# plus one more line for the host of the Blender process started by mpirun.
+# The "<host>-mic0" / "<host>-mic1" coprocessor entries are currently not added.
+: > "${NODEFILECN2}"
+
+I=0
+while read -r line
+do
+    echo "${line}" >> "${NODEFILECN2}"
+
+    # stop once the line with index NUMOFCN has been written
+    if ((I == NUMOFCN))
+    then
+        break
+    fi
+
+    I=$((I+1))
+done < "$NODEFILECN"
+
+#let I=0
+#let zero=0
+#while read -r line
+#do
+#    if ((I == 0))
+#    then
+#        #hosts="${line}"
+#        echo "${line}" #> ${NODEFILECN2}
+#    else
+#        #hosts="${hosts};${line};${line}-mic0;${line}-mic1"
+#        #echo "${line}" >> ${NODEFILECN2}
+#        echo "${line}-mic0" >> ${NODEFILECN2}
+#        echo "${line}-mic1" >> ${NODEFILECN2}
+#    fi
+#
+#    if ((I == NUMOFCN))
+#    then
+#        break
+#    fi
+#    
+#    I=$((I+1))
+#done < "$NODEFILECN"
+
+TEMP=$(wc -l < "$NODEFILECN2")
+NUMOFCN2=$((TEMP-1))
+# The MIC client binary is copied next to Blender under the name "blender" + I_MPI_MIC_POSTFIX.
+cp "${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client-mic" "${ROOT_DIR}/install/blender/blender-mic"
+
+#mpirun -n 1 ${ROOT_DIR}/install/blender/blender : -n 1 ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client
+
+#mpirun -n 1 -machinefile $NODEFILECN ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client
+# One Blender rank, then one client rank for every remaining line of the machine file.
+mpirun -genv LD_LIBRARY_PATH "$MIC_LD_LIBRARY_PATH" -machine "$NODEFILECN2" -n 1 "${ROOT_DIR}/install/blender/blender" : -n "$NUMOFCN2" "${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client"
+
+#mpirun -genv LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH -hosts $hosts -n 1 ${ROOT_DIR}/install/blender/blender : -n $NUMOFCN ${ROOT_DIR}/install/blender_client_symmetric/bin/blender_client
diff --git a/it4i/scripts/run_netbeans.sh b/it4i/scripts/run_netbeans.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c56384d4e9e6ae5b7014557cce62f60ceffe62b5
--- /dev/null
+++ b/it4i/scripts/run_netbeans.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib
+export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib
+
+source /apps/all/icc/2016.1.150-GCC-4.9.3-2.25/debugger_2016/bin/debuggervars.sh
+alias gdb="gdb-ia"
+
+/home/milanjaros/netbeans-8.1/bin/netbeans
diff --git a/it4i/scripts/run_vtune.sh b/it4i/scripts/run_vtune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..27d0d8ea983f6304a9af0c69a9422c0083ebdc01
--- /dev/null
+++ b/it4i/scripts/run_vtune.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+module load intel/2016.03-GCC-5.3
+module load CMake/3.3.1-GCC-5.3.0-2.25
+
+ROOT_DIR=${PWD}
+
+source /apps/all/imkl/11.3.3.210-iimpi-2016.03-GCC-5.3.0-2.26/vtune_amplifier_xe_2016.3.0.463186/amplxe-vars.sh
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ROOT_DIR}/lib:${ROOT_DIR}/install/blender_client/lib
+export MIC_LD_LIBRARY_PATH=/apps/compiler/icc/2016.3.210-GCC-5.3.0-2.26/lib/mic:/apps/all/impi/5.1.3.181-iccifort-2016.3.210-GCC-5.3.0-2.26/mic/lib:${ROOT_DIR}/install/blender_client/lib
+
+cd ${ROOT_DIR}/install/blender
+amplxe-gui
+
diff --git a/source/blender/blenlib/BLI_utildefines.h b/source/blender/blenlib/BLI_utildefines.h
index d504e503c686441e77e6bf4aafcae6f642da8541..d6c743760db3778c81ff6fc098b0a180aa4e4cdf 100644
--- a/source/blender/blenlib/BLI_utildefines.h
+++ b/source/blender/blenlib/BLI_utildefines.h
@@ -435,7 +435,7 @@ extern "C" {
 	} (void)0
 
 /* assuming a static array */
-#if defined(__GNUC__) && !defined(__cplusplus) && !defined(__clang__)
+#if defined(__GNUC__) && !defined(__cplusplus) && !defined(__clang__) && !defined(__INTEL_COMPILER)
 #  define ARRAY_SIZE(arr) \
 	((sizeof(struct {int isnt_array : ((const void *)&(arr) == &(arr)[0]);}) * 0) + \
 	 (sizeof(arr) / sizeof(*(arr))))
diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c
index fa14ca96fe2cca6b9a599298c6622edded1fa23d..8ab1de95b4137b41fba3dd5608a17e969a3059e5 100644
--- a/source/blender/editors/space_view3d/space_view3d.c
+++ b/source/blender/editors/space_view3d/space_view3d.c
@@ -329,7 +329,7 @@ static SpaceLink *view3d_new(const bContext *C)
 	v3d->grid = 1.0f;
 	v3d->gridlines = 16;
 	v3d->gridsubdiv = 10;
-	v3d->drawtype = OB_SOLID;
+	v3d->drawtype = OB_BOUNDBOX;
 
 	v3d->gridflag = V3D_SHOW_X | V3D_SHOW_Y | V3D_SHOW_FLOOR;
 	
diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h
index af1dfc62894fff9d4871a03c684699872202bb59..8444a03cb86759fc242a42ba508cad94b8ff676b 100644
--- a/source/blender/makesdna/DNA_userdef_types.h
+++ b/source/blender/makesdna/DNA_userdef_types.h
@@ -869,6 +869,8 @@ typedef enum eCompute_Device_Type {
 	USER_COMPUTE_DEVICE_NONE	= 0,
 	USER_COMPUTE_DEVICE_OPENCL	= 1,
 	USER_COMPUTE_DEVICE_CUDA	= 2,
+	USER_COMPUTE_DEVICE_OMP		= 4,  /* 3 is not assigned in this enum; rna_userdef.c passes 4 to CCL_compute_device_list() for this type */
+	USER_COMPUTE_DEVICE_MPI		= 5,  /* rna_userdef.c passes 5 to CCL_compute_device_list() for this type */
 } eCompute_Device_Type;
 
 	
diff --git a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c
index f4c6fdf42f5c5da470ecafee7451f872ebf594fb..00dce381f9678777719b37da25e5cf3397e55f0a 100644
--- a/source/blender/makesrna/intern/rna_userdef.c
+++ b/source/blender/makesrna/intern/rna_userdef.c
@@ -57,6 +57,8 @@ static EnumPropertyItem compute_device_type_items[] = {
 	{USER_COMPUTE_DEVICE_NONE, "NONE", 0, "None", "Don't use compute device"},
 	{USER_COMPUTE_DEVICE_CUDA, "CUDA", 0, "CUDA", "Use CUDA for GPU acceleration"},
 	{USER_COMPUTE_DEVICE_OPENCL, "OPENCL", 0, "OpenCL", "Use OpenCL for GPU acceleration"},
+	{USER_COMPUTE_DEVICE_OMP, "OMP", 0, "OMP", "Use OMP/MIC for acceleration"},
+	{USER_COMPUTE_DEVICE_MPI, "MPI", 0, "MPI", "Use MPI for acceleration"},
 	{ 0, NULL, 0, NULL, NULL}
 };
 #endif
@@ -473,6 +475,10 @@ static EnumPropertyItem *rna_userdef_compute_device_type_itemf(bContext *UNUSED(
 		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_CUDA);
 	if (CCL_compute_device_list(1))
 		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_OPENCL);
+	if (CCL_compute_device_list(4))  /* 4: id used for the OMP device list */
+		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_OMP);
+	if (CCL_compute_device_list(5))  /* 5: id used for the MPI device list */
+		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_MPI);
 
 	RNA_enum_item_end(&item, &totitem);
 	*r_free = true;
@@ -504,11 +510,38 @@ static EnumPropertyItem *rna_userdef_compute_device_itemf(bContext *UNUSED(C), P
 	}
 	else {
 		/* get device list from cycles. it would be good to make this generic
 		 * once we have more subsystems using opencl, for now this is easiest */
-		int opencl = (U.compute_device_type == USER_COMPUTE_DEVICE_OPENCL);
-		CCLDeviceInfo *devices = CCL_compute_device_list(opencl);
+		CCLDeviceInfo *devices = NULL;
+		int deviceType = -1;
 		int a;
 
+		/* Map the user preference to the integer id passed to
+		 * CCL_compute_device_list(); these ids are not the same as the
+		 * USER_COMPUTE_DEVICE_* values (e.g. CUDA is 2 there but 0 here). */
+		switch (U.compute_device_type) {
+			case USER_COMPUTE_DEVICE_CUDA:
+				deviceType = 0;
+				break;
+			case USER_COMPUTE_DEVICE_OPENCL:
+				deviceType = 1;
+				break;
+			case USER_COMPUTE_DEVICE_OMP:
+				deviceType = 4;
+				break;
+			case USER_COMPUTE_DEVICE_MPI:
+				deviceType = 5;
+				break;
+			default:
+				/* Unrecognized preference value: leave deviceType at -1. */
+				break;
+		}
+
+		/* Only query Cycles for a recognized type. Otherwise 'devices' stays
+		 * NULL and the block below adds no items. */
+		if (deviceType >= 0) {
+			devices = CCL_compute_device_list(deviceType);
+		}
+
 		if (devices) {
 			for (a = 0; devices[a].identifier[0]; a++) {
 				tmp.value = devices[a].value;
diff --git a/source/creator/CMakeLists.txt b/source/creator/CMakeLists.txt
index ff6544cf0e36e2b2ffb2b86469cacbe5163230e2..11eabf8e45ca810d26176d1e810a7a5422ffd5e1 100644
--- a/source/creator/CMakeLists.txt
+++ b/source/creator/CMakeLists.txt
@@ -48,6 +48,11 @@ if(WIN32)
 	blender_include_dirs(../../intern/utfconv)
 endif()
 
+# creator.c includes <mpi.h> and sets up MPI in main() when this is defined.
+if(WITH_IT4I_MPI)
+	add_definitions(-DWITH_IT4I_MPI)
+endif()
+
 if(WITH_LIBMV)
 	blender_include_dirs(../../intern/libmv)
 	add_definitions(-DWITH_LIBMV)
diff --git a/source/creator/creator.c b/source/creator/creator.c
index bf8347d59bba29af1a70bbd192c40aa9ac93d429..24d51d65c6d4e4b4ede7a64ddd14c3502e66d87e 100644
--- a/source/creator/creator.c
+++ b/source/creator/creator.c
@@ -123,6 +123,10 @@
 #  define BUILD_DATE
 #endif
 
+#ifdef WITH_IT4I_MPI
+#  include <mpi.h>
+#endif
+
 /* for passing information between creator and gameengine */
 #ifdef WITH_GAMEENGINE
 #  include "BL_System.h"
@@ -1796,7 +1800,8 @@ int main(
 #ifdef WIN32
         const char **UNUSED(argv_c)
 #else
-        const char **argv
+        /* not const: '&argv' is passed to MPI_Init_thread() when WITH_IT4I_MPI is defined */
+        char **argv
 #endif
         )
 {
@@ -1814,6 +1819,15 @@ int main(
 
 	/* --- end declarations --- */
 
+#ifdef WITH_IT4I_MPI
+	/* Initialize the MPI environment, requesting MPI_THREAD_FUNNELED.
+	 * The thread level actually granted is returned in 'provided'. */
+	int provided;
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+	if (provided < MPI_THREAD_FUNNELED) {
+		fprintf(stderr, "Warning: MPI thread support level %d is lower than the requested MPI_THREAD_FUNNELED\n", provided);
+	}
+#endif
 
 #ifdef WIN32
 	/* We delay loading of openmp so we can set the policy here. */
@@ -2100,6 +2114,12 @@ int main(
 	}
 
 	WM_main(C);
+
+#ifdef WITH_IT4I_MPI
+	/* Finalize the MPI environment.
+	 * NOTE(review): only reached if WM_main() returns -- confirm that every exit path calls MPI_Finalize(). */
+	MPI_Finalize();
+#endif
 
 	return 0;
 } /* end of int main(argc, argv)	*/