diff --git a/.gitmodules b/.gitmodules index 8a04f82d9d..1b3a3671df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -84,7 +84,7 @@ url = git@github.com:Devsh-Graphics-Programming/Nabla-Continous-Integration-Python-Framework.git [submodule "3rdparty/boost/superproject"] path = 3rdparty/boost/superproject - url = ../boost.git + url = git@github.com:Devsh-Graphics-Programming/boost.git [submodule "3rdparty/argparse"] path = 3rdparty/argparse url = git@github.com:p-ranav/argparse.git @@ -117,7 +117,7 @@ url = git@github.com:Devsh-Graphics-Programming/glm.git [submodule "docker/msvc-winsdk"] path = docker/msvc-winsdk - url = ../docker-nanoserver-msvc-winsdk + url = git@github.com:Devsh-Graphics-Programming/docker-nanoserver-msvc-winsdk.git [submodule "3rdparty/gtl"] path = 3rdparty/gtl url = https://github.com/greg7mdp/gtl.git diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index a6228b01de..4ef2cd887f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -95,7 +95,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() # boost +set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) #forcing boost to be in release add_subdirectory(boost boost EXCLUDE_FROM_ALL) +set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "" FORCE) #restoring config from boost set(SPIRV_HEADERS_SKIP_INSTALL ON CACHE INTERNAL "Skip SPIRV-Headers install") set(SPIRV_HEADERS_SKIP_EXAMPLES ON CACHE INTERNAL "Skip SPIRV-Headers examples") diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index ecd3f93521..d76c7890b1 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit ecd3f93521f1aceabff64b14857f47f9a32c9958 +Subproject commit d76c7890b19ce0b344ee0ce116dbc1c92220ccea diff --git a/3rdparty/gli b/3rdparty/gli index c4e6446d3b..2749a197e8 160000 --- a/3rdparty/gli +++ b/3rdparty/gli @@ -1 +1 @@ -Subproject commit c4e6446d3b646538026fd5a95533daed952878d4 +Subproject commit 2749a197e88f94858f4108732824b3790064f6ec diff --git a/3rdparty/glm 
b/3rdparty/glm index 2d4c4b4dd3..8f6213d379 160000 --- a/3rdparty/glm +++ b/3rdparty/glm @@ -1 +1 @@ -Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f +Subproject commit 8f6213d379a904f5ae910e09a114e066e25faf57 diff --git a/CMakeLists.txt b/CMakeLists.txt index 2235512d1f..84c9a99dc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,7 @@ option(NBL_FAST_MATH "Enable fast low-precision math" OFF) # the reason OFF is b option(NBL_BUILD_EXAMPLES "Enable building examples" ON) option(NBL_BUILD_MITSUBA_LOADER "Enable nbl::ext::MitsubaLoader?" OFF) # TODO: once it compies turn this ON by default! option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) +option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF) if(NBL_COMPILE_WITH_CUDA) diff --git a/docker/compiler-explorer b/docker/compiler-explorer index 45866dfa87..27318d12f8 160000 --- a/docker/compiler-explorer +++ b/docker/compiler-explorer @@ -1 +1 @@ -Subproject commit 45866dfa8782404fc121f25ce15ad0626b474db0 +Subproject commit 27318d12f88cf34bd0444101e6e260b12f5063a0 diff --git a/docs/nsc-prebuilds.md b/docs/nsc-prebuilds.md new file mode 100644 index 0000000000..4d57d7a8de --- /dev/null +++ b/docs/nsc-prebuilds.md @@ -0,0 +1,386 @@ +# NSC prebuilds (build-time HLSL -> SPIR-V) + +This document explains how to use `NBL_CREATE_NSC_COMPILE_RULES` together with `NBL_CREATE_RESOURCE_ARCHIVE` to: + +- Compile HLSL to SPIR-V at **build time** (via the `nsc` tool). +- Optionally generate **device-cap permutations** (limits/features "CAPS"). +- Generate a small C++ header with **type-safe key getters** (`get_spirv_key<...>()`). +- Make the same code work with `NBL_EMBED_BUILTIN_RESOURCES` **ON** (embedded virtual archive) and **OFF** (mounted build directory) when loading your precompiled SPIR-V at runtime. + +Definitions live in `cmake/common.cmake` (`NBL_CREATE_NSC_COMPILE_RULES`, `NBL_CREATE_RESOURCE_ARCHIVE`). 
+ +## Runtime mounting requirement (important) + +All of this assumes your app mounts the directory/archive containing the NSC outputs (i.e. `BINARY_DIR`) into Nabla's virtual filesystem, then loads files via keys that are relative to that mounted root (the examples use `app_resources`). + +The examples "just work" because they inherit from `nbl::examples::BuiltinResourcesApplication`, which mounts: + +- `NBL_EMBED_BUILTIN_RESOURCES=OFF`: `system::CMountDirectoryArchive(NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT, ...)` at `app_resources` +- `NBL_EMBED_BUILTIN_RESOURCES=ON`: the generated embedded archive (e.g. `nbl::this_example::builtin::build::CArchive`) at `app_resources` + +If you're writing your own app/extension and don't use `BuiltinResourcesApplication`, you must mount equivalently yourself (split by `NBL_EMBED_BUILTIN_RESOURCES`). Optionally set `IAssetLoader::SAssetLoadParams::workingDirectory` to whatever virtual root you want to load from. + +The `MOUNT_POINT_DEFINE` argument of `NBL_CREATE_NSC_COMPILE_RULES` defines a C/C++ macro whose value is the absolute path to the NSC output directory (`BINARY_DIR`) that you mount when builtins are off (in examples it's `NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT`). + +See `examples_tests/common/include/nbl/examples/common/BuiltinResourcesApplication.hpp` for the exact mounting logic. + +## Why build-time NSC instead of runtime compilation? + +Build-time compilation is usually preferable because it: + +- Uses your build system's parallelism (Ninja/MSBuild jobs) to compile shaders quickly. +- Writes **only into the build tree** (no source tree pollution, easy clean/reconfigure). +- Lets CI validate "shaders compile" as part of a normal build. +- Enables fast runtime iteration: at runtime you only **pick** the right SPIR-V, you don't compile it. +- Makes shader compilation deterministic and reproducible (toolchain + flags captured by the build). 
+ +Runtime compilation is still useful for prototyping, but (assuming you don't use a runtime shader cache) it can make startup slower and shift failures to runtime instead of CI/build (a cache can hide the repeated cost on subsequent runs; our current one has some rough edges: it writes into the source tree and has issues when compiling many inputs from the same source directory). + +## What `NBL_CREATE_NSC_COMPILE_RULES` produces + +For each registered input it generates: + +- One `.spv` output **per CMake configuration** (`Debug/`, `Release/`, `RelWithDebInfo/`). +- If you use `CAPS`, it generates a **cartesian product** of permutations and emits a `.spv` for each. +- A generated header (you choose the path via `INCLUDE`) containing: + - a primary template `get_spirv_key(limits, features)` and `get_spirv_key(device)` + - explicit specializations for each registered base `KEY` + - the returned key already includes the build config prefix (compiled into the header). + +Keys are strings that match the output layout: + +``` +/(._)(._)....spv +``` + +## The JSON "INPUTS" format + +`INPUTS` is a JSON array of objects. Each object supports: + +- `INPUT` (string, required): path to `.hlsl` (relative to `CMAKE_CURRENT_SOURCE_DIR` or absolute). +- `KEY` (string, required): base key (prefer without `.spv`; it is always appended, so using `foo.spv` will result in `foo.spv.spv`). +- `COMPILE_OPTIONS` (array of strings, optional): per-input extra options (e.g. `["-T","cs_6_8"]`). +- `DEPENDS` (array of strings, optional): per-input dependencies (extra files that should trigger rebuild). +- `CAPS` (array, optional): permutation caps (see below). + +You can register many rules in a single call, and you can call the function multiple times to append rules to the same `TARGET`. 
+ +## Compile options (generator expressions, defaults, debug info) + +`NBL_CREATE_NSC_COMPILE_RULES` combines options from multiple sources: + +- Built-in defaults from the helper (see `cmake/common.cmake`): HLSL version, Vulkan SPIR-V target env, scalar layout, warnings, and per-config optimization flags (e.g. `-O0` for Debug, `-O3` for Release) implemented via CMake generator expressions. +- Global extra options via `COMMON_OPTIONS` (CMake list). +- Per-input extra options via JSON `COMPILE_OPTIONS` (array of strings). + +Both `COMMON_OPTIONS` and JSON `COMPILE_OPTIONS` support CMake generator expressions like `$<$:...>` (the helper uses them itself), so you can make flags configuration-dependent when needed. + +### Debug info for RenderDoc + +The helper also exposes CMake options that append NSC debug flags **only for Debug config** (via generator expressions). Enable them if you want RenderDoc to show source/line information instead of just raw disassembly: + +- `NSC_DEBUG_EDIF_FILE_BIT` (default `ON`) -> `-fspv-debug=file` +- `NSC_DEBUG_EDIF_TOOL_BIT` (default `ON`) -> `-fspv-debug=tool` +- `NSC_DEBUG_EDIF_SOURCE_BIT` (default `OFF`) -> `-fspv-debug=source` +- `NSC_DEBUG_EDIF_LINE_BIT` (default `OFF`) -> `-fspv-debug=line` +- `NSC_DEBUG_EDIF_NON_SEMANTIC_BIT` (default `OFF`) -> `-fspv-debug=vulkan-with-source` + +## Source files and rebuild dependencies (important) + +Make sure shader inputs and includes are: + +1. Marked as header-only on your target (so the IDE shows them, but the build system doesn't try to compile them with default HLSL rules like `fxc`): + +```cmake +target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) +``` + +2. Listed as dependencies of the NSC custom commands (so editing any of them triggers a rebuild of the `.spv` outputs). + +This is what the `DEPENDS` argument of `NBL_CREATE_NSC_COMPILE_RULES` (and/or per-input JSON `DEPENDS`) is for. 
Always include the main `INPUT` file itself and any files it includes; otherwise the build system might not re-run `nsc` when you change them. + +## Minimal usage (no permutations) + +Example pattern (as in `examples_tests/27_MPMCScheduler/CMakeLists.txt`): + +```cmake +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + app_resources/common.hlsl + app_resources/shader.comp.hlsl +) +target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(JSON [=[ +[ + { + "INPUT": "app_resources/shader.comp.hlsl", + "KEY": "shader", + "COMPILE_OPTIONS": ["-T", "cs_6_8"], + "DEPENDS": [], + "CAPS": [] + } +] +]=]) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) +``` + +Then include the generated header and use the key to load the SPIR-V: + +```cpp +#include "nbl/this_example/builtin/build/spirv/keys.hpp" +// ... +auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(device); +auto bundle = assetMgr->getAsset(key.c_str(), loadParams); +``` + +`OUTPUT_VAR` (here: `KEYS`) is assigned the list of **all** produced access keys (all configurations + all permutations). This list is intended to be fed into `NBL_CREATE_RESOURCE_ARCHIVE(BUILTINS ${KEYS})`. + +## Permutations via `CAPS` + +`CAPS` lets you prebuild multiple SPIR-V variants parameterized by device limits or features. + +Each `CAPS` entry looks like: + +- `kind` (string, optional): `"limits"` or `"features"` (defaults to `"limits"` if omitted/invalid). +- `name` (string, required): identifier used in both generated HLSL config and C++ key (must be a valid C/C++ identifier). 
+- `type` (string, required): `bool`, `uint16_t`, `uint32_t`, `uint64_t`. +- `values` (array of numbers, required): the values you want to prebuild. + - for `bool`, values must be `0` or `1`. + +At build time, NSC compiles each combination of values (cartesian product). At runtime, `get_spirv_key` appends suffixes using the `limits`/`features` you pass in. + +### Example: mixing `limits` and `features` + +This example permutes over one device limit and one device feature (order matters: the suffix order matches the `CAPS` array order): + +```cmake +set(JSON [=[ +[ + { + "INPUT": "app_resources/shader.hlsl", + "KEY": "shader", + "COMPILE_OPTIONS": ["-T", "lib_6_8"], + "DEPENDS": ["app_resources/common.hlsl"], + "CAPS": [ + { + "kind": "limits", + "name": "maxComputeSharedMemorySize", + "type": "uint32_t", + "values": [16384, 32768, 65536] + }, + { + "kind": "features", + "name": "shaderFloat64", + "type": "bool", + "values": [0, 1] + } + ] + } +] +]=]) + +NBL_CREATE_NSC_COMPILE_RULES( + # ... + OUTPUT_VAR KEYS + INPUTS ${JSON} +) +``` + +This produces `3 * 2 = 6` permutations per build configuration, and `KEYS` contains all of them (for example): + +``` +Debug/shader.maxComputeSharedMemorySize_16384.shaderFloat64_0.spv +Debug/shader.maxComputeSharedMemorySize_16384.shaderFloat64_1.spv +... +``` + +Practical tip: for numeric limits you often want to "bucket" real device values into one of the prebuilt values. 
The CountingSort example does exactly that: + +- CMake definition: `examples_tests/10_CountingSort/CMakeLists.txt` +- Runtime bucketing: `examples_tests/10_CountingSort/main.cpp` + +```cpp +auto limits = m_physicalDevice->getLimits(); +constexpr std::array AllowedMaxComputeSharedMemorySizes = { 16384, 32768, 65536 }; + +auto upperBoundSharedMemSize = std::upper_bound( + AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize +); +// devices which support less than 16KB of max compute shared memory size are not supported +if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin()) +{ + m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize); + exit(0); +} + +limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1); + +auto key = nbl::this_example::builtin::build::get_spirv_key<"prefix_sum_shader">(limits, m_physicalDevice->getFeatures()); +``` + +## Pairing with `NBL_CREATE_RESOURCE_ARCHIVE` (works with builtins ON/OFF) + +The recommended pattern is to always call `NBL_CREATE_RESOURCE_ARCHIVE` right after the NSC rules, using the produced `KEYS` list: + +```cmake +NBL_CREATE_RESOURCE_ARCHIVE( + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} + NAMESPACE nbl::this_example::builtin::build +) +``` + +### How `BINARY_DIR`, `MOUNT_POINT_DEFINE`, and `BIND` fit together + +- In `NBL_CREATE_NSC_COMPILE_RULES`, `BINARY_DIR` is the output directory where NSC writes the compiled files: + - `${BINARY_DIR}//....spv` +- In `NBL_CREATE_NSC_COMPILE_RULES`, `MOUNT_POINT_DEFINE` is the *name* of a C/C++ preprocessor define whose value is set to the **absolute path** of `BINARY_DIR`. + - Example: `MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT` results in something like `-DNBL_THIS_EXAMPLE_BUILD_MOUNT_POINT="C:/.../auto-gen"` on the target. 
+ - Keys returned by `get_spirv_key<...>()` are relative to that directory; the full path on disk is: + - `${NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT}/` +- In `NBL_CREATE_RESOURCE_ARCHIVE`, `BIND` should point at the same directory as `BINARY_DIR`. + - The `BUILTINS` list entries must be relative to `BIND`. + - This is why pairing it with `OUTPUT_VAR KEYS` works: `KEYS` is exactly the list of relative paths under `BINARY_DIR` that were generated by the NSC rules, so the archive generator knows what to serialize/embed. + +This is designed to work in both modes: + +- `NBL_EMBED_BUILTIN_RESOURCES=OFF`: + - `NBL_CREATE_RESOURCE_ARCHIVE` becomes a no-op (creates a dummy interface target). + - You load SPIR-V from the **build directory** mounted into the virtual filesystem. + - `MOUNT_POINT_DEFINE` provides an absolute path (e.g. `NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT`) for mounting. +- `NBL_EMBED_BUILTIN_RESOURCES=ON`: + - `NBL_CREATE_RESOURCE_ARCHIVE` generates a small library that embeds the listed files into a virtual archive and emits `.../CArchive.h` under the requested `NAMESPACE`. + - You mount the embedded archive instead of a directory; runtime loading code stays the same (keys don't change). + +## Notes / gotchas + +- `INCLUDE` must be a **relative** path (it is emitted under the build tree and added to include dirs automatically). +- Prefer not to include `.spv` in `KEY` (the extension is appended unconditionally); if you do, you'll just get `.spv.spv` in the final filename/key (not an error, just not what you want). +- You can mix: + - per-input `COMPILE_OPTIONS` (inside JSON), and + - global `COMMON_OPTIONS` (CMake list after `COMMON_OPTIONS`). + +## Troubleshooting (no logs / silent NSC failures) + +Sometimes an NSC compile rule fails during the build, but the build output doesn't show a useful log. In that case, run the failing command under a debugger: + +1. Open the generated Visual Studio solution and set the `nsc` project/target as the Startup Project. +2. 
Open the `nsc` project properties and set **Debugging -> Command Arguments**. +3. Copy the exact CLI from the failing "NSC Rules" custom command (the one that calls `nsc.exe`) into the Command Arguments field. +4. Start debugging (`F5`) and reproduce; if needed, put a breakpoint in the HLSL compiler/preprocessor codepath and step until you find the root cause. + +If the error looks like a preprocessing issue, note that we use Boost.Wave as the preprocessor; it can have quirky edge cases (e.g. needing a trailing newline/whitespace at the end of a file for correct parsing). + +## Best practices + +- Prefer compiling to a shader library (`-T lib_6_x`) and using multiple entry points when possible: fewer inputs means fewer compile rules and less build overhead; at runtime you still choose the entry point from the same `.spv`. +- Treat `CAPS` as a build-time cost multiplier (cartesian product). If the permutation count gets too large (thousands+), prebuilding usually stops paying off; an example of such workload is `examples_tests/23_Arithmetic2UnitTest`. + +## Complete example (expand) + +
+NSC rules + archive + runtime key usage + +### CMake (`CMakeLists.txt`) + +```cmake +include(common) + +nbl_create_executable_project("" "" "" "") + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + app_resources/common.hlsl + app_resources/shader.hlsl +) +target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(JSON [=[ +[ + { + "INPUT": "app_resources/shader.hlsl", + "KEY": "shader", + "COMPILE_OPTIONS": ["-T", "lib_6_8"], + "DEPENDS": [], + "CAPS": [ + { + "kind": "limits", + "name": "maxComputeSharedMemorySize", + "type": "uint32_t", + "values": [16384, 32768, 65536] + }, + { + "kind": "features", + "name": "shaderFloat64", + "type": "bool", + "values": [0, 1] + } + ] + } +] +]=]) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +# Works for both NBL_EMBED_BUILTIN_RESOURCES=ON/OFF +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) +``` + +### Runtime usage (C++) + +```cpp +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +// Load relative to the VFS mount (examples mount it at "app_resources") +asset::IAssetLoader::SAssetLoadParams lp = {}; +lp.workingDirectory = "app_resources"; + +auto limits = device->getPhysicalDevice()->getLimits(); +limits.maxComputeSharedMemorySize = 32768; // one of the prebuilt values; real code should bucket/clamp with std::upper_bound (see the CountingSort snippet above) + +auto key = 
nbl::this_example::builtin::build::get_spirv_key<"shader">(limits, device->getEnabledFeatures()); +auto bundle = assetMgr->getAsset(key.c_str(), lp); +const auto assets = bundle.getContents(); +auto spvShader = asset::IAsset::castDown(assets[0]); + +// params.shader.shader = spvShader.get(); + +// If you compiled with `-T lib_6_x`, pick the entry point at pipeline creation time (e.g. `params.shader.entryPoint = "main";`). +``` + +
diff --git a/examples_tests b/examples_tests index 4ab1de2235..159d1533e8 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4ab1de2235365833db2d089259000bec2bcce3e3 +Subproject commit 159d1533e8d82e3c5e82165e8b79ea67c0f23111 diff --git a/include/nbl/application_templates/MonoDeviceApplication.hpp b/include/nbl/application_templates/MonoDeviceApplication.hpp index a3a169d7b7..4e0e6c759a 100644 --- a/include/nbl/application_templates/MonoDeviceApplication.hpp +++ b/include/nbl/application_templates/MonoDeviceApplication.hpp @@ -74,6 +74,8 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication const auto supportedPreferredFormats = getPreferredDeviceFeatures().intersectWith(m_physicalDevice->getFeatures()); params.featuresToEnable = getRequiredDeviceFeatures().unionWith(supportedPreferredFormats); + params.featuresToEnable.meshShader = true; + params.featuresToEnable.taskShader = true; m_device = m_physicalDevice->createLogicalDevice(std::move(params)); if (!m_device) diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index a691fa6af6..6c3935d302 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -95,6 +95,7 @@ class IAsset : virtual public core::IReferenceCounted ET_PIPELINE_CACHE = 1ull<<21, //!< asset::ICPUPipelineCache ET_SCENE = 1ull<<22, //!< reserved, to implement later ET_RAYTRACING_PIPELINE = 1ull << 23, //!< asset::ICPURayTracingPipeline + ET_MESH_PIPELINE = 1ull << 24, ET_IMPLEMENTATION_SPECIFIC_METADATA = 1ull<<31u, //!< lights, etc. //! 
Reserved special value used for things like terminating lists of this enum diff --git a/include/nbl/asset/ICPUMeshPipeline.h b/include/nbl/asset/ICPUMeshPipeline.h new file mode 100644 index 0000000000..b21a44b82c --- /dev/null +++ b/include/nbl/asset/ICPUMeshPipeline.h @@ -0,0 +1,145 @@ +#ifndef _NBL_I_CPU_MESH_PIPELINE_H_INCLUDED_ +#define _NBL_I_CPU_MESH_PIPELINE_H_INCLUDED_ + + +#include "nbl/asset/IMeshPipeline.h" +#include "nbl/asset/ICPURenderpass.h" +#include "nbl/asset/ICPUPipeline.h" + + +namespace nbl::asset +{ + +class ICPUMeshPipeline final : public ICPUPipeline> +{ + using pipeline_base_t = IMeshPipeline; + using base_t = ICPUPipeline; + + public: + + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout, ICPURenderpass* renderpass = nullptr) + { + auto retval = new ICPUMeshPipeline(layout, renderpass); + return core::smart_refctd_ptr(retval,core::dont_grab); + } + + constexpr static inline auto AssetType = ET_MESH_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + + inline SCachedCreationParams& getCachedCreationParams() + { + assert(isMutable()); + return m_params; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final + { + switch (stage) { + case hlsl::ShaderStage::ESS_TASK: return { &m_specInfos[0], 1 }; + case hlsl::ShaderStage::ESS_MESH: return { &m_specInfos[1], 1 }; + case hlsl::ShaderStage::ESS_FRAGMENT: return { &m_specInfos[2], 1 }; + } + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + switch (stage) { + case hlsl::ShaderStage::ESS_TASK: return &m_specInfos[0]; + case hlsl::ShaderStage::ESS_MESH: return &m_specInfos[1]; + case 
hlsl::ShaderStage::ESS_FRAGMENT: return &m_specInfos[2]; + } + return nullptr; + } + + const SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) const + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + inline bool valid() const override + { + if (!m_layout) return false; + if (!m_layout->valid())return false; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!m_renderpass || m_params.subpassIx >= m_renderpass->getSubpassCount()) return false; + + core::bitflag stagePresence = {}; + for (auto shader_i = 0u; shader_i < m_specInfos.size(); shader_i++) + { + const auto& info = m_specInfos[shader_i]; + if (info.shader) + stagePresence |= indexToStage(shader_i); + } + return hasRequiredStages(stagePresence); + } + + protected: + using base_t::base_t; + virtual ~ICPUMeshPipeline() override = default; + + std::array m_specInfos; + + private: + explicit ICPUMeshPipeline(ICPUPipelineLayout* layout, ICPURenderpass* renderpass) + : base_t(layout, {}, renderpass) + {} + + static inline int8_t stageToIndex(const hlsl::ShaderStage stage) + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx < 0 || stageIx >= MESH_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + switch (index) { + case 0: return hlsl::ShaderStage::ESS_TASK; + case 1: return hlsl::ShaderStage::ESS_MESH; + case 2: return hlsl::ShaderStage::ESS_FRAGMENT; + } + return hlsl::ShaderStage::ESS_UNKNOWN; + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUMeshPipeline(layout.get(), m_renderpass.get()); + newPipeline->m_params = m_params; + + for (auto specInfo_i = 0u; specInfo_i < 
m_specInfos.size(); specInfo_i++) + { + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); + } + + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_renderpass.get())) return; + for (const auto& info : m_specInfos) + if (!visit(info.shader.get())) return; + } +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/asset/IMeshPipeline.h b/include/nbl/asset/IMeshPipeline.h new file mode 100644 index 0000000000..1b19e89f37 --- /dev/null +++ b/include/nbl/asset/IMeshPipeline.h @@ -0,0 +1,59 @@ +#ifndef _NBL_ASSET_I_MESH_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_MESH_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IShader.h" +#include "nbl/asset/RasterizationStates.h" +#include "nbl/asset/IPipeline.h" + + +namespace nbl::asset { + class IMeshPipelineBase : public virtual core::IReferenceCounted { + public: + constexpr static inline uint8_t MESH_SHADER_STAGE_COUNT = 3u; //i dont know what this is going to be used for yet, might be redundant + struct SCachedCreationParams final { + SRasterizationParams rasterization = {}; + SBlendParams blend = {}; + uint32_t subpassIx = 0u; //this subpass stuff is eluding me rn. 
i might just need to crack open the vulkan documentation + uint8_t requireFullSubgroups = false; + }; + + }; + + template + class IMeshPipeline : public IPipeline, public IMeshPipelineBase { + protected: + using renderpass_t = RenderpassType; + //using base_creation_params_t = IPipeline;//compute uses this, idk if its necessary yet + public: + + static inline bool hasRequiredStages(const core::bitflag& stagePresence) + { + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_MESH)) { + return false; + } + //i dont quite understand why igraphicspipeline doesnt require a fragment shader. is it not required by vulkan? + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_FRAGMENT)) { + return false; + } + + return true; + } + + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + + protected: + explicit IMeshPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) + { + } + + SCachedCreationParams m_params = {}; + core::smart_refctd_ptr m_renderpass = nullptr; + }; + +} + + +#endif diff --git a/include/nbl/builtin/glsl/utils/morton.glsl b/include/nbl/builtin/glsl/utils/morton.glsl index de3be8b9c7..fd07a9cad8 100644 --- a/include/nbl/builtin/glsl/utils/morton.glsl +++ b/include/nbl/builtin/glsl/utils/morton.glsl @@ -22,6 +22,18 @@ uint nbl_glsl_morton_decode2d8bComponent(in uint x) return x; } +uint nbl_glsl_morton_decode2d32bComponent(in uint x) +{ + x &= 0x55555555u; + x = (x ^ (x >> 1u)) & 0x33333333u; + x = (x ^ (x >> 2u)) & 0x0f0f0f0fu; + x = (x ^ (x >> 4u)) & 0x00ff00ffu; + x = (x ^ (x >> 8u)) & 0x0000ffffu; + x = (x ^ (x >> 16u)); + return x; +} + + uvec2 nbl_glsl_morton_decode2d4b(in uint x) { return 
uvec2(nbl_glsl_morton_decode2d4bComponent(x), nbl_glsl_morton_decode2d4bComponent(x >> 1u)); @@ -32,4 +44,9 @@ uvec2 nbl_glsl_morton_decode2d8b(in uint x) return uvec2(nbl_glsl_morton_decode2d8bComponent(x), nbl_glsl_morton_decode2d8bComponent(x >> 1u)); } +uvec2 nbl_glsl_morton_decode2d32b(in uint x) +{ + return uvec2(nbl_glsl_morton_decode2d32bComponent(x), nbl_glsl_morton_decode2d32bComponent(x >> 1u)); +} + #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/bxdf/base/cook_torrance_base.hlsl b/include/nbl/builtin/hlsl/bxdf/base/cook_torrance_base.hlsl index a185dc8d98..d70e8823da 100644 --- a/include/nbl/builtin/hlsl/bxdf/base/cook_torrance_base.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/base/cook_torrance_base.hlsl @@ -8,6 +8,7 @@ #include "nbl/builtin/hlsl/bxdf/config.hlsl" #include "nbl/builtin/hlsl/bxdf/ndf.hlsl" #include "nbl/builtin/hlsl/bxdf/fresnel.hlsl" +#include "nbl/builtin/hlsl/sampling/basic.hlsl" #include "nbl/builtin/hlsl/bxdf/ndf/microfacet_to_light_transform.hlsl" namespace nbl @@ -130,19 +131,22 @@ struct SCookTorrance template, typename C=bool_constant NBL_FUNC_REQUIRES(C::value && !fresnel_type::ReturnsMonochrome) - static scalar_type __getScaledReflectance(NBL_CONST_REF_ARG(fresnel_type) orientedFresnel, NBL_CONST_REF_ARG(Interaction) interaction, scalar_type clampedVdotH) + static scalar_type __getScaledReflectance(NBL_CONST_REF_ARG(fresnel_type) orientedFresnel, NBL_CONST_REF_ARG(Interaction) interaction, scalar_type clampedVdotH, bool transmitted, NBL_REF_ARG(spectral_type) outFresnelVal) { spectral_type throughputWeights = interaction.getLuminosityContributionHint(); - return hlsl::dot(impl::__implicit_promote::__call(orientedFresnel(clampedVdotH)), throughputWeights); + spectral_type reflectance = orientedFresnel(clampedVdotH); + outFresnelVal = hlsl::mix(reflectance, hlsl::promote(1.0)-reflectance, transmitted); + return hlsl::dot(outFresnelVal, throughputWeights); } template, typename C=bool_constant 
NBL_FUNC_REQUIRES(C::value && fresnel_type::ReturnsMonochrome) - static scalar_type __getScaledReflectance(NBL_CONST_REF_ARG(fresnel_type) orientedFresnel, NBL_CONST_REF_ARG(Interaction) interaction, scalar_type clampedVdotH) + static scalar_type __getScaledReflectance(NBL_CONST_REF_ARG(fresnel_type) orientedFresnel, NBL_CONST_REF_ARG(Interaction) interaction, scalar_type clampedVdotH, bool transmitted, NBL_REF_ARG(spectral_type) outFresnelVal) { - return orientedFresnel(clampedVdotH)[0]; + scalar_type reflectance = orientedFresnel(clampedVdotH)[0]; + return hlsl::mix(reflectance, scalar_type(1.0)-reflectance, transmitted); } - bool __dotIsUnity(const vector3_type a, const vector3_type b, const scalar_type value) + bool __dotIsValue(const vector3_type a, const vector3_type b, const scalar_type value) { const scalar_type ab = hlsl::dot(a, b); return hlsl::max(ab, value / ab) <= scalar_type(value + 1e-3); @@ -209,11 +213,11 @@ struct SCookTorrance ray_dir_info_type V = interaction.getV(); const matrix3x3_type fromTangent = interaction.getFromTangentSpace(); // tangent frame orthonormality - assert(__dotIsUnity(fromTangent[0],fromTangent[1],0.0)); - assert(__dotIsUnity(fromTangent[1],fromTangent[2],0.0)); - assert(__dotIsUnity(fromTangent[2],fromTangent[0],0.0)); + assert(__dotIsValue(fromTangent[0],fromTangent[1],0.0)); + assert(__dotIsValue(fromTangent[1],fromTangent[2],0.0)); + assert(__dotIsValue(fromTangent[2],fromTangent[0],0.0)); // NDF sampling produced a unit length direction - assert(__dotIsUnity(localH,localH,1.0)); + assert(__dotIsValue(localH,localH,1.0)); const vector3_type H = hlsl::mul(interaction.getFromTangentSpace(), localH); Refract r = Refract::create(V.getDirection(), H); @@ -276,7 +280,7 @@ struct SCookTorrance const scalar_type NdotV = localV.z; fresnel_type _f = __getOrientedFresnel(fresnel, NdotV); - fresnel::OrientedEtaRcps rcpEta = _f.getOrientedEtaRcps(); + fresnel::OrientedEtaRcps rcpEta = _f.getRefractionOrientedEtaRcps(); const 
vector3_type upperHemisphereV = ieee754::flipSignIfRHSNegative(localV, hlsl::promote(NdotV)); const vector3_type localH = ndf.generateH(upperHemisphereV, u.xy); @@ -294,11 +298,14 @@ struct SCookTorrance assert(NdotV*VdotH >= scalar_type(0.0)); } - const scalar_type reflectance = __getScaledReflectance(_f, interaction, hlsl::abs(VdotH)); + spectral_type dummy; + const scalar_type reflectance = __getScaledReflectance(_f, interaction, hlsl::abs(VdotH), false, dummy); scalar_type rcpChoiceProb; scalar_type z = u.z; - bool transmitted = math::partitionRandVariable(reflectance, z, rcpChoiceProb); + sampling::PartitionRandVariable partitionRandVariable; + partitionRandVariable.leftProb = reflectance; + bool transmitted = partitionRandVariable(z, rcpChoiceProb); const scalar_type LdotH = hlsl::mix(VdotH, ieee754::copySign(hlsl::sqrt(rcpEta.value2[0]*VdotH*VdotH + scalar_type(1.0) - rcpEta.value2[0]), -VdotH), transmitted); bool valid; @@ -337,8 +344,9 @@ struct SCookTorrance NBL_IF_CONSTEXPR(IsBSDF) { - const scalar_type reflectance = __getScaledReflectance(_f, interaction, hlsl::abs(cache.getVdotH())); - return hlsl::mix(reflectance, scalar_type(1.0) - reflectance, cache.isTransmission()) * DG1.projectedLightMeasure; + spectral_type dummy; + const scalar_type reflectance = __getScaledReflectance(_f, interaction, hlsl::abs(cache.getVdotH()), cache.isTransmission(), dummy); + return reflectance * DG1.projectedLightMeasure; } else { @@ -389,10 +397,9 @@ struct SCookTorrance quo = hlsl::promote(G2_over_G1); else { - const scalar_type scaled_reflectance = __getScaledReflectance(_f, interaction, hlsl::abs(cache.getVdotH())); - spectral_type reflectance = impl::__implicit_promote::__call(_f(hlsl::abs(cache.getVdotH()))); - quo = hlsl::mix(reflectance / scaled_reflectance, - (hlsl::promote(1.0) - reflectance) / (scalar_type(1.0) - scaled_reflectance), cache.isTransmission()) * G2_over_G1; + spectral_type reflectance; + const scalar_type scaled_reflectance = 
__getScaledReflectance(_f, interaction, hlsl::abs(cache.getVdotH()), cache.isTransmission(), reflectance); + quo = reflectance / scaled_reflectance * G2_over_G1; } } else @@ -409,6 +416,18 @@ struct SCookTorrance fresnel_type fresnel; // always front-facing }; + +template +struct traits > +{ + using __type = SCookTorrance; + + NBL_CONSTEXPR_STATIC_INLINE BxDFType type = conditional_value<__type::IsBSDF, BxDFType, BxDFType::BT_BSDF, BxDFType::BT_BRDF>::value; + NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; + NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = !__type::IsBSDF; + NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = !__type::IsBSDF; +}; + } } } diff --git a/include/nbl/builtin/hlsl/bxdf/common.hlsl b/include/nbl/builtin/hlsl/bxdf/common.hlsl index ebad0a925c..6af3b4c01b 100644 --- a/include/nbl/builtin/hlsl/bxdf/common.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/common.hlsl @@ -374,19 +374,17 @@ NBL_CONCEPT_END( ((NBL_CONCEPT_REQ_TYPE)(T::scalar_type)) ((NBL_CONCEPT_REQ_TYPE)(T::vector3_type)) ((NBL_CONCEPT_REQ_TYPE)(T::matrix3x3_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getL()), ::nbl::hlsl::is_same_v, typename T::ray_dir_info_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getTdotL()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getTdotL2()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getBdotL()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getBdotL2()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getNdotL(clampMode)), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getNdotL2()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.isValid()), ::nbl::hlsl::is_same_v, bool)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::createFromTangentSpace(rdirinfo,frame)), 
::nbl::hlsl::is_same_v, T)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::create(rdirinfo,pV)), ::nbl::hlsl::is_same_v, T)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::create(rdirinfo,pV,pV,pV)), ::nbl::hlsl::is_same_v, T)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::create(rdirinfo,pV,pV,pNdotL)), ::nbl::hlsl::is_same_v, T)) - // ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::template create >(pV,inter)), ::nbl::hlsl::is_same_v, T)) // NOTE: temporarily commented out due to dxc bug https://github.com/microsoft/DirectXShaderCompiler/issues/7154 + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.L), ::nbl::hlsl::is_same_v, typename T::ray_dir_info_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.VdotL), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.TdotL), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.BdotL), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.NdotL), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.NdotL2), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::createFromTangentSpace(pV,rdirinfo,frame)), ::nbl::hlsl::is_same_v, T)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::create(rdirinfo,pVdotL,pV)), ::nbl::hlsl::is_same_v, T)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::create(rdirinfo,pVdotL,pV,pV,pV)), ::nbl::hlsl::is_same_v, T)) + //((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::template create(pV,iso)), ::nbl::hlsl::is_same_v, T)) // NOTE: temporarily commented out due to dxc bug https://github.com/microsoft/DirectXShaderCompiler/issues/7154 + //((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::template create(pV,aniso)), ::nbl::hlsl::is_same_v, T)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sample.getTangentSpaceL()), ::nbl::hlsl::is_same_v, typename T::vector3_type)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((T::createInvalid()), ::nbl::hlsl::is_same_v, T)) 
((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(ray_dir_info::Basic, typename T::ray_dir_info_type)) @@ -447,32 +445,22 @@ struct SLightSample return retval; } - static this_t create(NBL_CONST_REF_ARG(ray_dir_info_type) L, const vector3_type T, const vector3_type B, const scalar_type NdotL) - { - this_t retval; - - retval.L = L; - retval.TdotL = nbl::hlsl::dot(T,L.getDirection()); - retval.BdotL = nbl::hlsl::dot(B,L.getDirection()); - retval.NdotL = NdotL; - retval.NdotL2 = NdotL * NdotL; - - return retval; - } - - template) - static this_t create(const vector3_type L, NBL_CONST_REF_ARG(SurfaceInteraction) interaction) - { - const vector3_type V = interaction.V.getDirection(); - const scalar_type VdotL = nbl::hlsl::dot(V,L); - this_t retval; - NBL_IF_CONSTEXPR(surface_interactions::Anisotropic) - retval = create(L,interaction.T,interaction.B,interaction.N); - else - retval = create(L,interaction.N); - return retval; - } - + // overloads for surface_interactions, NOTE: temporarily commented out due to dxc bug https://github.com/microsoft/DirectXShaderCompiler/issues/7154 + // template + // static this_t create(NBL_CONST_REF_ARG(vector3_type) L, NBL_CONST_REF_ARG(surface_interactions::SIsotropic) interaction) + // { + // const vector3_type V = interaction.V.getDirection(); + // const scalar_type VdotL = nbl::hlsl::dot(V,L); + // return create(L, VdotL, interaction.N); + // } + // template + // static this_t create(NBL_CONST_REF_ARG(vector3_type) L, NBL_CONST_REF_ARG(surface_interactions::SAnisotropic) interaction) + // { + // const vector3_type V = interaction.V.getDirection(); + // const scalar_type VdotL = nbl::hlsl::dot(V,L); + // return create(L,VdotL,interaction.T,interaction.B,interaction.N); + // } + // vector3_type getTangentSpaceL() NBL_CONST_MEMBER_FUNC { return vector3_type(TdotL, BdotL, NdotL); @@ -615,7 +603,7 @@ struct SIsotropicMicrofacetCache // not coming from the medium (reflected) OR // exiting at the macro scale AND ( (not L outside the cone of possible 
directions given IoR with constraint VdotH*LdotH<0.0) OR (microfacet not facing toward the macrosurface, i.e. non heightfield profile of microsurface) ) - const bool valid = ComputeMicrofacetNormal::isValidMicrofacet(transmitted, VdotL, retval.absNdotH, computeMicrofacetNormal.orientedEta); + const bool valid = ComputeMicrofacetNormal::isValidMicrofacet(transmitted, VdotL, retval.absNdotH, fresnel::OrientedEtas::create(1.0, computeMicrofacetNormal.orientedEta)); if (valid) { retval.VdotH = hlsl::dot(computeMicrofacetNormal.V,H); @@ -638,7 +626,7 @@ struct SIsotropicMicrofacetCache const bool transmitted = ComputeMicrofacetNormal::isTransmissionPath(NdotV,NdotL); ComputeMicrofacetNormal computeMicrofacetNormal = ComputeMicrofacetNormal::create(V,L,N,1.0); - computeMicrofacetNormal.orientedEta = orientedEtas; + computeMicrofacetNormal.orientedEta = orientedEtas.value[0]; return create(transmitted, computeMicrofacetNormal, VdotL, N, H); } @@ -664,7 +652,7 @@ struct SIsotropicMicrofacetCache const bool transmitted = ComputeMicrofacetNormal::isTransmissionPath(interaction.getNdotV(),_sample.getNdotL()); ComputeMicrofacetNormal computeMicrofacetNormal = ComputeMicrofacetNormal::create(V,L,N,1.0); - computeMicrofacetNormal.orientedEta = orientedEtas; + computeMicrofacetNormal.orientedEta = orientedEtas.value[0]; return create(transmitted, computeMicrofacetNormal, hlsl::dot(V, L), N, H); } @@ -759,7 +747,11 @@ struct SAnisotropicMicrofacetCache using scalar_type = typename IsoCache::scalar_type; using vector3_type = vector; using matrix3x3_type = matrix; - using monochrome_type = vector; + + using ray_dir_info_type = ray_dir_info::SBasic; + using anisotropic_type = surface_interactions::SAnisotropic; + using isocache_type = SIsotropicMicrofacetCache; + using sample_type = SLightSample; // always valid by construction static this_t createForReflection(const vector3_type tangentSpaceV, const vector3_type tangentSpaceH) @@ -819,11 +811,15 @@ struct SAnisotropicMicrofacetCache 
NBL_CONST_REF_ARG(fresnel::OrientedEtas) orientedEtas, NBL_REF_ARG(vector3_type) H ) { - this_t retval; - retval.iso_cache = isocache_type::create(V,L,N,orientedEtas,H); - retval.TdotH = nbl::hlsl::dot(T,H); - retval.BdotH = nbl::hlsl::dot(B,H); - return retval; + isocache_type iso = (isocache_type)retval; + const bool valid = isocache_type::compute(iso,transmitted,V,L,N,NdotL,VdotL,orientedEta,rcpOrientedEta,H); + retval = (this_t)iso; + if (valid) + { + retval.TdotH = nbl::hlsl::dot(T,H); + retval.BdotH = nbl::hlsl::dot(B,H); + } + return valid; } template && LightSample) static this_t create( @@ -832,27 +828,15 @@ struct SAnisotropicMicrofacetCache NBL_CONST_REF_ARG(fresnel::OrientedEtas) orientedEtas ) { - this_t retval; vector3_type H; - retval.iso_cache = isocache_type::template create(interaction.isotropic,_sample,orientedEtas,H); - retval.TdotH = nbl::hlsl::dot(interaction.getT(),H); - retval.BdotH = nbl::hlsl::dot(interaction.getB(),H); - return retval; - } - static this_t createPartial( - const scalar_type VdotH, const scalar_type LdotH, const scalar_type NdotH, - bool transmitted, NBL_CONST_REF_ARG(fresnel::OrientedEtaRcps) rcpOrientedEta - ) - { - this_t retval; - retval.iso_cache.VdotH = VdotH; - retval.iso_cache.LdotH = LdotH; - retval.iso_cache.VdotL = hlsl::mix(scalar_type(2.0) * VdotH * VdotH - scalar_type(1.0), - VdotH * (VdotH * rcpOrientedEta.value[0] + LdotH) - rcpOrientedEta.value[0], transmitted); - assert(NdotH > scalar_type(0.0)); - retval.iso_cache.absNdotH = hlsl::abs(NdotH); - retval.iso_cache.NdotH2 = NdotH * NdotH; - return retval; + const bool valid = isocache_type::compute(iso,interaction,_sample,eta,H); + retval = (this_t)iso; + if (valid) + { + retval.TdotH = nbl::hlsl::dot(interaction.T,H); + retval.BdotH = nbl::hlsl::dot(interaction.B,H); + } + return valid; } void fillTangents(const vector3_type T, const vector3_type B, const vector3_type H) @@ -1097,27 +1081,11 @@ NBL_CONCEPT_END( #include } -#define NBL_CONCEPT_NAME 
MicrofacetBRDF -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (bxdf, T) -#define NBL_CONCEPT_PARAM_1 (aniso, typename T::anisotropic_interaction_type) -#define NBL_CONCEPT_PARAM_2 (u, vector) -#define NBL_CONCEPT_PARAM_3 (anisocache, typename T::anisocache_type) -NBL_CONCEPT_BEGIN(4) -#define bxdf NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define aniso NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -#define anisocache NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(impl::microfacet_bxdf_common, T)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((bxdf.generate(aniso,u,anisocache)), ::nbl::hlsl::is_same_v, typename T::sample_type)) -); -#undef anisocache -#undef u -#undef aniso -#undef bxdf -#include +// unified param struct for calls to BxDF::eval, BxDF::pdf, BxDF::quotient_and_pdf +template) +struct SBxDFParams +{ + using this_t = SBxDFParams; #define NBL_CONCEPT_NAME MicrofacetBSDF #define NBL_CONCEPT_TPLT_PRM_KINDS (typename) @@ -1141,27 +1109,21 @@ NBL_CONCEPT_END( #undef bxdf #include -#define NBL_CONCEPT_NAME IsotropicMicrofacetBRDF -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (bxdf, T) -#define NBL_CONCEPT_PARAM_1 (iso, typename T::isotropic_interaction_type) -#define NBL_CONCEPT_PARAM_2 (u, vector) -#define NBL_CONCEPT_PARAM_3 (isocache, typename T::isocache_type) -NBL_CONCEPT_BEGIN(4) -#define bxdf NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define iso NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -#define isocache NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(impl::iso_microfacet_bxdf_common, T)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((bxdf.generate(iso,u,isocache)), ::nbl::hlsl::is_same_v, typename T::sample_type)) -); -#undef isocache -#undef u 
-#undef iso -#undef bxdf -#include + template && surface_interactions::Anisotropic) + static this_t create(LightSample _sample, Aniso interaction, BxDFClampMode clamp = BCM_NONE) + { + this_t retval; + retval.NdotV = clamp == BCM_ABS ? abs(interaction.NdotV) : + clamp == BCM_MAX ? max(interaction.NdotV, 0.0) : + interaction.NdotV; + retval.uNdotV = interaction.NdotV; + retval.NdotV2 = interaction.NdotV2; + retval.NdotL = clamp == BCM_ABS ? abs(_sample.NdotL) : + clamp == BCM_MAX ? max(_sample.NdotL, 0.0) : + _sample.NdotL; + retval.uNdotL = _sample.NdotL; + retval.NdotL2 = _sample.NdotL2; + retval.VdotL = _sample.VdotL; #define NBL_CONCEPT_NAME IsotropicMicrofacetBSDF #define NBL_CONCEPT_TPLT_PRM_KINDS (typename) @@ -1185,6 +1147,106 @@ NBL_CONCEPT_END( #undef bxdf #include + template && surface_interactions::Isotropic && IsotropicMicrofacetCache) + static this_t create(LightSample _sample, Iso interaction, Cache cache, BxDFClampMode clamp = BCM_NONE) + { + this_t retval; + retval.NdotH = cache.NdotH; + retval.NdotH2 = cache.NdotH2; + retval.NdotV = clamp == BCM_ABS ? abs(interaction.NdotV) : + clamp == BCM_MAX ? max(interaction.NdotV, 0.0) : + interaction.NdotV; + retval.uNdotV = interaction.NdotV; + retval.NdotV2 = interaction.NdotV2; + retval.NdotL = clamp == BCM_ABS ? abs(_sample.NdotL) : + clamp == BCM_MAX ? max(_sample.NdotL, 0.0) : + _sample.NdotL; + retval.uNdotL = _sample.NdotL; + retval.NdotL2 = _sample.NdotL2; + retval.VdotH = cache.VdotH; + retval.LdotH = cache.LdotH; + retval.VdotL = _sample.VdotL; + retval.is_aniso = false; + return retval; + } + + template && surface_interactions::Anisotropic && AnisotropicMicrofacetCache) + static this_t create(LightSample _sample, Aniso interaction, Cache cache, BxDFClampMode clamp = BCM_NONE) + { + this_t retval; + retval.NdotH = cache.NdotH; + retval.NdotH2 = cache.NdotH2; + retval.NdotV = clamp == BCM_ABS ? abs(interaction.NdotV) : + clamp == BCM_MAX ? 
max(interaction.NdotV, 0.0) : + interaction.NdotV; + retval.uNdotV = interaction.NdotV; + retval.NdotV2 = interaction.NdotV2; + retval.NdotL = clamp == BCM_ABS ? abs(_sample.NdotL) : + clamp == BCM_MAX ? max(_sample.NdotL, 0.0) : + _sample.NdotL; + retval.uNdotL = _sample.NdotL; + retval.NdotL2 = _sample.NdotL2; + retval.VdotH = cache.VdotH; + retval.LdotH = cache.LdotH; + retval.VdotL = _sample.VdotL; + + retval.is_aniso = true; + retval.TdotH2 = cache.TdotH * cache.TdotH; + retval.BdotH2 = cache.BdotH * cache.BdotH; + retval.TdotL2 = _sample.TdotL * _sample.TdotL; + retval.BdotL2 = _sample.BdotL * _sample.BdotL; + retval.TdotV2 = interaction.TdotV * interaction.TdotV; + retval.BdotV2 = interaction.BdotV * interaction.BdotV; + return retval; + } + + Scalar getMaxNdotV() { return max(uNdotV, 0.0); } + Scalar getAbsNdotV() { return abs(uNdotV); } + + Scalar getMaxNdotL() { return max(uNdotL, 0.0); } + Scalar getAbsNdotL() { return abs(uNdotL); } + + // iso + Scalar NdotH; + Scalar NdotH2; + Scalar NdotV; + Scalar NdotV2; + Scalar NdotL; + Scalar NdotL2; + Scalar VdotH; + Scalar LdotH; + Scalar VdotL; + + // aniso + bool is_aniso; + Scalar TdotH2; + Scalar BdotH2; + Scalar TdotL2; + Scalar BdotL2; + Scalar TdotV2; + Scalar BdotV2; + + // original, unclamped + Scalar uNdotL; + Scalar uNdotV; +}; + +// unified param struct for calls to BxDF::create +template) +struct SBxDFCreationParams +{ + bool is_aniso; + vector A; // roughness + Spectrum ior0; // source ior + Spectrum ior1; // destination ior + Scalar eta; // in most cases, eta will be calculated from ior0 and ior1; see monochromeEta in pathtracer.hlsl + Spectrum eta2; + Spectrum luminosityContributionHint; +}; + +// fresnel stuff +namespace impl +{ template NBL_BOOL_CONCEPT MicrofacetBxDF = MicrofacetBRDF || MicrofacetBSDF; template diff --git a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl index 56ea88080c..33faa79efc 100644 --- 
a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl @@ -141,7 +141,7 @@ struct ComputeMicrofacetNormal vector_type unnormalized(const bool _refract) { assert(hlsl::dot(V, L) <= -hlsl::min(orientedEta, scalar_type(1.0) / orientedEta)); - const scalar_type etaFactor = hlsl::mix(scalar_type(1.0), orientedEta.value, _refract); + const scalar_type etaFactor = hlsl::mix(scalar_type(1.0), orientedEta, _refract); vector_type tmpH = V + L * etaFactor; tmpH = ieee754::flipSign(tmpH, _refract && orientedEta > scalar_type(1.0)); return tmpH; @@ -313,14 +313,18 @@ NBL_CONCEPT_BEGIN(2) NBL_CONCEPT_END( ((NBL_CONCEPT_REQ_TYPE)(T::scalar_type)) ((NBL_CONCEPT_REQ_TYPE)(T::vector_type)) - ((NBL_CONCEPT_REQ_TYPE)(T::eta_type)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((fresnel(cosTheta)), ::nbl::hlsl::is_same_v, typename T::vector_type)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((fresnel.getOrientedEtaRcps()), ::nbl::hlsl::is_same_v, OrientedEtaRcps)) ); #undef cosTheta #undef fresnel #include +namespace impl +{ +template +NBL_BOOL_CONCEPT VectorIsMonochrome = vector_traits::Dimension == 1; +} + #define NBL_CONCEPT_NAME TwoSidedFresnel #define NBL_CONCEPT_TPLT_PRM_KINDS (typename) #define NBL_CONCEPT_TPLT_PRM_NAMES (T) @@ -331,8 +335,11 @@ NBL_CONCEPT_BEGIN(2) #define cosTheta NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 NBL_CONCEPT_END( ((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(Fresnel, T)) + ((NBL_CONCEPT_REQ_TYPE)(T::eta_type)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((fresnel.getRefractionOrientedEta()), ::nbl::hlsl::is_same_v, typename T::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((fresnel.getRefractionOrientedEtaRcps()), ::nbl::hlsl::is_same_v, OrientedEtaRcps)) ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((fresnel.getReorientedFresnel(cosTheta)), ::nbl::hlsl::is_same_v, T)) + ((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(impl::VectorIsMonochrome, typename T::eta_type)) ); #undef cosTheta #undef fresnel @@ -362,7 +369,7 @@ struct Schlick return F0 + (1.0 - F0) * x*x*x*x*x; } - 
OrientedEtaRcps getOrientedEtaRcps() NBL_CONST_MEMBER_FUNC + OrientedEtaRcps getRefractionOrientedEtaRcps() NBL_CONST_MEMBER_FUNC { const eta_type sqrtF0 = hlsl::sqrt(F0); OrientedEtaRcps rcpEta; @@ -424,13 +431,13 @@ struct Conductor return (rs2 + rp2) * hlsl::promote(0.5); } - OrientedEtaRcps getOrientedEtaRcps() NBL_CONST_MEMBER_FUNC - { - OrientedEtaRcps rcpEta; - rcpEta.value = hlsl::promote(1.0) / eta; - rcpEta.value2 = rcpEta.value * rcpEta.value; - return rcpEta; - } + // OrientedEtaRcps getRefractionOrientedEtaRcps() NBL_CONST_MEMBER_FUNC + // { + // OrientedEtaRcps rcpEta; + // rcpEta.value = hlsl::promote(1.0) / eta; + // rcpEta.value2 = rcpEta.value * rcpEta.value; + // return rcpEta; + // } T eta; T etak2; @@ -484,7 +491,7 @@ struct Dielectric // default to monochrome, but it is possible to have RGB fresnel without dispersion fixing the refraction Eta // to be something else than the etas used to compute RGB reflectance or some sort of interpolation of them scalar_type getRefractionOrientedEta() NBL_CONST_MEMBER_FUNC { return orientedEta.value[0]; } - OrientedEtaRcps getOrientedEtaRcps() NBL_CONST_MEMBER_FUNC { return orientedEta.getReciprocals(); } + OrientedEtaRcps getRefractionOrientedEtaRcps() NBL_CONST_MEMBER_FUNC { return orientedEta.getReciprocals(); } Dielectric getReorientedFresnel(const scalar_type NdotI) NBL_CONST_MEMBER_FUNC { @@ -497,7 +504,7 @@ struct Dielectric }; // adapted from https://belcour.github.io/blog/research/publication/2017/05/01/brdf-thin-film.html -template +template struct Iridescent; namespace impl @@ -508,25 +515,26 @@ struct iridescent_helper using scalar_type = typename vector_traits::scalar_type; using vector_type = T; - // returns reflectance R = (rp, rs), phi is the phase shift for each plane of polarization (p,s) - static void phase_shift(const vector_type orientedEta, const vector_type orientedEtak, const vector_type cosTheta, NBL_REF_ARG(vector_type) phiS, NBL_REF_ARG(vector_type) phiP) + // returns phi, the 
phase shift for each plane of polarization (p,s) + static void phase_shift(const vector_type ior1, const vector_type ior2, const vector_type iork2, const vector_type cosTheta, NBL_REF_ARG(vector_type) phiS, NBL_REF_ARG(vector_type) phiP) { - vector_type cosTheta_2 = cosTheta * cosTheta; - vector_type sinTheta2 = hlsl::promote(1.0) - cosTheta_2; - const vector_type eta2 = orientedEta*orientedEta; - const vector_type etak2 = orientedEtak*orientedEtak; - - vector_type z = eta2 - etak2 - sinTheta2; - vector_type w = hlsl::sqrt(z * z + scalar_type(4.0) * eta2 * eta2 * etak2); - vector_type a2 = (z + w) * hlsl::promote(0.5); - vector_type b2 = (w - z) * hlsl::promote(0.5); - vector_type b = hlsl::sqrt(b2); + const vector_type cosTheta2 = cosTheta * cosTheta; + const vector_type sinTheta2 = hlsl::promote(1.0) - cosTheta2; + const vector_type ior1_2 = ior1*ior1; + const vector_type ior2_2 = ior2*ior2; + const vector_type iork2_2 = iork2*iork2; - const vector_type t0 = eta2 + etak2; - const vector_type t1 = t0 * cosTheta_2; + const vector_type z = ior2_2 * (hlsl::promote(1.0) - iork2_2) - ior1_2 * sinTheta2; + const vector_type w = hlsl::sqrt(z*z + scalar_type(4.0) * ior2_2 * ior2_2 * iork2_2); + const vector_type a2 = hlsl::max(z + w, hlsl::promote(0.0)) * hlsl::promote(0.5); + const vector_type b2 = hlsl::max(w - z, hlsl::promote(0.0)) * hlsl::promote(0.5); + const vector_type a = hlsl::sqrt(a2); + const vector_type b = hlsl::sqrt(b2); - phiS = hlsl::atan2(hlsl::promote(2.0) * b * cosTheta, a2 + b2 - cosTheta_2); - phiP = hlsl::atan2(hlsl::promote(2.0) * eta2 * cosTheta * (hlsl::promote(2.0) * orientedEtak * hlsl::sqrt(a2) - etak2 * b), t1 - a2 + b2); + phiS = hlsl::atan2(scalar_type(2.0) * ior1 * b * cosTheta, a2 + b2 - ior1_2*cosTheta2); + const vector_type k2_plus_one = hlsl::promote(1.0) + iork2_2; + phiP = hlsl::atan2(scalar_type(2.0) * ior1 * ior2_2 * cosTheta * (scalar_type(2.0) * iork2 * a - (hlsl::promote(1.0) - iork2_2) * b), + ior2_2 * cosTheta2 * k2_plus_one * 
k2_plus_one - ior1_2*(a2+b2)); } // Evaluation XYZ sensitivity curves in Fourier space @@ -543,55 +551,56 @@ struct iridescent_helper return xyz / scalar_type(1.0685e-7); } - template - static T __call(NBL_CONST_REF_ARG(Params) params, const scalar_type clampedCosTheta) + template + static T __call(const vector_type _D, const vector_type ior1, const vector_type ior2, const vector_type ior3, const vector_type iork3, + const vector_type eta12, const vector_type eta23, const vector_type etak23, const scalar_type clampedCosTheta) { - const vector_type wavelengths = vector_type(colorspace::scRGB::wavelength_R, colorspace::scRGB::wavelength_G, colorspace::scRGB::wavelength_B); - - const vector_type eta12 = params.getEta12(); - const vector_type eta23 = params.getEta23(); - const vector_type etak23 = params.getEtak23(); const scalar_type cosTheta_1 = clampedCosTheta; - vector_type cosTheta_2; - vector_type R12p, R23p, R12s, R23s; - const vector_type scale = scalar_type(1.0)/eta12; - const vector_type cosTheta2_2 = hlsl::promote(1.0) - hlsl::promote(1.0-cosTheta_1*cosTheta_1) * scale * scale; - - cosTheta_2 = hlsl::sqrt(hlsl::max(cosTheta2_2, hlsl::promote(0.0))); - Dielectric::__polarized(eta12, hlsl::promote(cosTheta_1), R12p, R12s); + vector_type cosTheta_2; + vector::Dimension> notTIR; + { + const vector_type scale = scalar_type(1.0)/eta12; + const vector_type cosTheta2_2 = hlsl::promote(1.0) - hlsl::promote(scalar_type(1.0)-cosTheta_1*cosTheta_1) * scale * scale; + notTIR = cosTheta2_2 > hlsl::promote(0.0); + cosTheta_2 = hlsl::sqrt(hlsl::max(cosTheta2_2, hlsl::promote(0.0))); + } - // Reflected part by the base - // if kappa==0, base material is dielectric - NBL_IF_CONSTEXPR(SupportsTransmission) - Dielectric::__polarized(eta23 * eta23, cosTheta_2, R23p, R23s); - else + if (hlsl::any(notTIR)) { - vector_type etaLen2 = eta23 * eta23 + etak23 * etak23; - Conductor::__polarized(eta23, etaLen2, cosTheta_2, R23p, R23s); + Dielectric::__polarized(eta12 * eta12, 
hlsl::promote(cosTheta_1), R12p, R12s); + + // Reflected part by the base + // if kappa==0, base material is dielectric + NBL_IF_CONSTEXPR(SupportsTransmission) + Dielectric::__polarized(eta23 * eta23, cosTheta_2, R23p, R23s); + else + { + vector_type etaLen2 = eta23 * eta23 + etak23 * etak23; + Conductor::__polarized(eta23, etaLen2, cosTheta_2, R23p, R23s); + } } // Check for total internal reflection - R12s = hlsl::mix(R12s, hlsl::promote(1.0), cosTheta2_2 <= hlsl::promote(0.0)); - R12p = hlsl::mix(R12p, hlsl::promote(1.0), cosTheta2_2 <= hlsl::promote(0.0)); - - R23s = hlsl::mix(R23s, hlsl::promote(0.0), cosTheta2_2 <= hlsl::promote(0.0)); - R23p = hlsl::mix(R23p, hlsl::promote(0.0), cosTheta2_2 <= hlsl::promote(0.0)); + const vector_type notTIRFactor = vector_type(notTIR); // 0 when TIR, 1 otherwise + R12s = R12s * notTIRFactor; + R12p = R12p * notTIRFactor; + R23s = R23s * notTIRFactor; + R23p = R23p * notTIRFactor; // Compute the transmission coefficients vector_type T121p = hlsl::promote(1.0) - R12p; vector_type T121s = hlsl::promote(1.0) - R12s; // Optical Path Difference - const vector_type D = hlsl::promote(2.0 * params.getDinc()) * params.getThinFilmIor() * cosTheta_2; - const vector_type Dphi = hlsl::promote(2.0 * numbers::pi) * D / wavelengths; + const vector_type D = _D * cosTheta_2; vector_type phi21p, phi21s, phi23p, phi23s, r123s, r123p, Rs; vector_type I = hlsl::promote(0.0); // Evaluate the phase shift - phase_shift(eta12, hlsl::promote(0.0), hlsl::promote(cosTheta_1), phi21p, phi21s); - phase_shift(eta23, etak23, cosTheta_2, phi23p, phi23s); + phase_shift(ior1, ior2, hlsl::promote(0.0), hlsl::promote(cosTheta_1), phi21s, phi21p); + phase_shift(ior2, ior3, iork3, cosTheta_2, phi23s, phi23p); phi21p = hlsl::promote(numbers::pi) - phi21p; phi21s = hlsl::promote(numbers::pi) - phi21s; @@ -612,7 +621,7 @@ struct iridescent_helper NBL_UNROLL for (int m=1; m<=2; ++m) { Cm *= r123p; - Sm = hlsl::promote(2.0) * evalSensitivity(hlsl::promote(m)*D, 
hlsl::promote(m)*(phi23p+phi21p)); + Sm = hlsl::promote(2.0) * evalSensitivity(hlsl::promote(scalar_type(m))*D, hlsl::promote(scalar_type(m))*(phi23p+phi21p)); I += Cm*Sm; } @@ -626,90 +635,135 @@ struct iridescent_helper NBL_UNROLL for (int m=1; m<=2; ++m) { Cm *= r123s; - Sm = hlsl::promote(2.0) * evalSensitivity(hlsl::promote(m)*D, hlsl::promote(m) *(phi23s+phi21s)); + Sm = hlsl::promote(2.0) * evalSensitivity(hlsl::promote(scalar_type(m))*D, hlsl::promote(scalar_type(m)) *(phi23s+phi21s)); I += Cm*Sm; } - return hlsl::max(colorspace::scRGB::FromXYZ(I), hlsl::promote(0.0)) * hlsl::promote(0.5); + return hlsl::max(Colorspace::FromXYZ(I) * hlsl::promote(0.5), hlsl::promote(0.0)); } }; -template) +template) struct iridescent_base { using scalar_type = typename vector_traits::scalar_type; using vector_type = T; - scalar_type getDinc() NBL_CONST_MEMBER_FUNC { return Dinc; } - vector_type getThinFilmIor() NBL_CONST_MEMBER_FUNC { return thinFilmIor; } - vector_type getEta12() NBL_CONST_MEMBER_FUNC { return eta12; } - vector_type getEta23() NBL_CONST_MEMBER_FUNC { return eta23; } - vector_type getEtak23() NBL_CONST_MEMBER_FUNC - { - NBL_IF_CONSTEXPR(SupportsTransmission) - return hlsl::promote(0.0); - else - return etak23; - } - - scalar_type Dinc; // thickness of thin film in nanometers, rec. 100-25000nm - vector_type thinFilmIor; + vector_type D; + vector_type ior1; + vector_type ior2; + vector_type ior3; + vector_type iork3; vector_type eta12; // outside (usually air 1.0) -> thin-film IOR vector_type eta23; // thin-film -> base material IOR - vector_type etak23; // thin-film -> complex component, k==0 makes dielectric + vector_type eta13; }; } -template +template NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial) -struct Iridescent) > +struct Iridescent) > : impl::iridescent_base { - using this_t = Iridescent; + using this_t = Iridescent; using scalar_type = typename vector_traits::scalar_type; using vector_type = T; // assert dim==3? 
using eta_type = vector_type; - using base_type = impl::iridescent_base; + using base_type = impl::iridescent_base; NBL_CONSTEXPR_STATIC_INLINE bool ReturnsMonochrome = vector_traits::Dimension == 1; + struct SCreationParams + { + scalar_type Dinc; // thickness of thin film in nanometers, rec. 100-25000nm + vector_type ior1; // outside (usually air 1.0) + vector_type ior2; // thin-film ior + vector_type ior3; // base mat ior + vector_type iork3; + }; + using creation_params_type = SCreationParams; + + static this_t create(NBL_CONST_REF_ARG(creation_params_type) params) + { + this_t retval; + retval.D = hlsl::promote(2.0 * params.Dinc) * params.ior2; + retval.ior1 = params.ior1; + retval.ior2 = params.ior2; + retval.ior3 = params.ior3; + retval.iork3 = params.iork3; + retval.eta12 = params.ior2/params.ior1; + retval.eta23 = params.ior3/params.ior2; + retval.etak23 = params.iork3/params.ior2; + retval.eta13 = params.ior3/params.ior1; + return retval; + } + T operator()(const scalar_type clampedCosTheta) NBL_CONST_MEMBER_FUNC { - return impl::iridescent_helper::template __call(__base, clampedCosTheta); + return impl::iridescent_helper::template __call(base_type::D, base_type::ior1, base_type::ior2, base_type::ior3, base_type::iork3, + base_type::eta12, base_type::eta23, getEtak23(), clampedCosTheta); } - OrientedEtaRcps getOrientedEtaRcps() NBL_CONST_MEMBER_FUNC + // OrientedEtaRcps getRefractionOrientedEtaRcps() NBL_CONST_MEMBER_FUNC + // { + // OrientedEtaRcps rcpEta; + // rcpEta.value = hlsl::promote(1.0) / base_type::eta13; + // rcpEta.value2 = rcpEta.value * rcpEta.value; + // return rcpEta; + // } + + vector_type getEtak23() NBL_CONST_MEMBER_FUNC { - OrientedEtaRcps rcpEta; - rcpEta.value = hlsl::promote(1.0) / __base.eta23; - rcpEta.value2 = rcpEta.value * rcpEta.value; - return rcpEta; + return etak23; } - base_type __base; + vector_type etak23; // thin-film -> complex component }; -template +template NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial) 
-struct Iridescent) > +struct Iridescent) > : impl::iridescent_base { - using this_t = Iridescent; + using this_t = Iridescent; using scalar_type = typename vector_traits::scalar_type; using vector_type = T; // assert dim==3? using eta_type = vector; - using base_type = impl::iridescent_base; + using base_type = impl::iridescent_base; NBL_CONSTEXPR_STATIC_INLINE bool ReturnsMonochrome = vector_traits::Dimension == 1; + struct SCreationParams + { + scalar_type Dinc; // thickness of thin film in nanometers, rec. 100-25000nm + vector_type ior1; // outside (usually air 1.0) + vector_type ior2; // thin-film ior + vector_type ior3; // base mat ior + }; + using creation_params_type = SCreationParams; + + static this_t create(NBL_CONST_REF_ARG(creation_params_type) params) + { + this_t retval; + retval.D = hlsl::promote(2.0 * params.Dinc) * params.ior2; + retval.ior1 = params.ior1; + retval.ior2 = params.ior2; + retval.ior3 = params.ior3; + retval.eta12 = params.ior2/params.ior1; + retval.eta23 = params.ior3/params.ior2; + retval.eta13 = params.ior3/params.ior1; + return retval; + } + T operator()(const scalar_type clampedCosTheta) NBL_CONST_MEMBER_FUNC { - return impl::iridescent_helper::template __call(__base, clampedCosTheta); + return impl::iridescent_helper::template __call(base_type::D, base_type::ior1, base_type::ior2, base_type::ior3, getEtak23(), + base_type::eta12, base_type::eta23, getEtak23(), clampedCosTheta); } - scalar_type getRefractionOrientedEta() NBL_CONST_MEMBER_FUNC { return __base.eta23[0]; } - OrientedEtaRcps getOrientedEtaRcps() NBL_CONST_MEMBER_FUNC + scalar_type getRefractionOrientedEta() NBL_CONST_MEMBER_FUNC { return base_type::eta13[0]; } + OrientedEtaRcps getRefractionOrientedEtaRcps() NBL_CONST_MEMBER_FUNC { OrientedEtaRcps rcpEta; - rcpEta.value = hlsl::promote(1.0) / __base.eta23[0]; + rcpEta.value = hlsl::promote(1.0) / hlsl::promote(base_type::eta13[0]); rcpEta.value2 = rcpEta.value * rcpEta.value; return rcpEta; } @@ -718,15 +772,20 @@ 
struct Iridescent(1.0)/__base.eta12, flip); - orientedFresnel.__base.eta23 = hlsl::mix(__base.eta23, hlsl::promote(1.0)/__base.eta23, flip); - orientedFresnel.__base.etak23 = hlsl::promote(0.0); + orientedFresnel.D = base_type::D; + orientedFresnel.ior1 = hlsl::mix(base_type::ior1, base_type::ior3, flip); + orientedFresnel.ior2 = base_type::ior2; + orientedFresnel.ior3 = hlsl::mix(base_type::ior3, base_type::ior1, flip); + orientedFresnel.eta12 = hlsl::mix(base_type::eta12, hlsl::promote(1.0)/base_type::eta23, flip); + orientedFresnel.eta23 = hlsl::mix(base_type::eta23, hlsl::promote(1.0)/base_type::eta12, flip); + orientedFresnel.eta13 = hlsl::mix(base_type::eta13, hlsl::promote(1.0)/base_type::eta13, flip); return orientedFresnel; } - base_type __base; + vector_type getEtak23() NBL_CONST_MEMBER_FUNC + { + return hlsl::promote(0.0); + } }; diff --git a/include/nbl/builtin/hlsl/bxdf/geom_smith.hlsl b/include/nbl/builtin/hlsl/bxdf/geom_smith.hlsl new file mode 100644 index 0000000000..5a6f6cdf26 --- /dev/null +++ b/include/nbl/builtin/hlsl/bxdf/geom_smith.hlsl @@ -0,0 +1,291 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_BXDF_GEOM_INCLUDED_ +#define _NBL_BUILTIN_HLSL_BXDF_GEOM_INCLUDED_ + +#include "nbl/builtin/hlsl/bxdf/ndf.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace bxdf +{ +namespace smith +{ + +template +typename NDF::scalar_type VNDF_pdf_wo_clamps(typename NDF::scalar_type ndf, typename NDF::scalar_type lambda_V, typename NDF::scalar_type maxNdotV, NBL_REF_ARG(typename NDF::scalar_type) onePlusLambda_V) +{ + onePlusLambda_V = 1.0 + lambda_V; + ndf::microfacet_to_light_measure_transform transform = ndf::microfacet_to_light_measure_transform::create(ndf / onePlusLambda_V, maxNdotV); + return transform(); +} + +template +typename NDF::scalar_type VNDF_pdf_wo_clamps(typename NDF::scalar_type ndf, typename NDF::scalar_type lambda_V, typename NDF::scalar_type absNdotV, bool transmitted, typename NDF::scalar_type VdotH, typename NDF::scalar_type LdotH, typename NDF::scalar_type VdotHLdotH, typename NDF::scalar_type orientedEta, typename NDF::scalar_type reflectance, NBL_REF_ARG(typename NDF::scalar_type) onePlusLambda_V) +{ + onePlusLambda_V = 1.0 + lambda_V; + ndf::microfacet_to_light_measure_transform transform + = ndf::microfacet_to_light_measure_transform::create((transmitted ? 
(1.0 - reflectance) : reflectance) * ndf / onePlusLambda_V, absNdotV, transmitted, VdotH, LdotH, VdotHLdotH, orientedEta); + return transform(); +} + +template) +T VNDF_pdf_wo_clamps(T ndf, T G1_over_2NdotV) +{ + return ndf * 0.5 * G1_over_2NdotV; +} + +template) +T FVNDF_pdf_wo_clamps(T fresnel_ndf, T G1_over_2NdotV, T absNdotV, bool transmitted, T VdotH, T LdotH, T VdotHLdotH, T orientedEta) +{ + T FNG = fresnel_ndf * G1_over_2NdotV; + T factor = 0.5; + if (transmitted) + { + const T VdotH_etaLdotH = (VdotH + orientedEta * LdotH); + // VdotHLdotH is negative under transmission, so this factor is negative + factor *= -2.0 * VdotHLdotH / (VdotH_etaLdotH * VdotH_etaLdotH); + } + return FNG * factor; +} + +template) +T VNDF_pdf_wo_clamps(T ndf, T G1_over_2NdotV, T absNdotV, bool transmitted, T VdotH, T LdotH, T VdotHLdotH, T orientedEta, T reflectance) +{ + T FN = (transmitted ? (1.0 - reflectance) : reflectance) * ndf; + return FVNDF_pdf_wo_clamps(FN, G1_over_2NdotV, absNdotV, transmitted, VdotH, LdotH, VdotHLdotH, orientedEta); +} + + +template) +struct SIsotropicParams +{ + using this_t = SIsotropicParams; + + static this_t create(T a2, T NdotV2, T NdotL2, T lambdaV_plus_one) // beckmann + { + this_t retval; + retval.a2 = a2; + retval.NdotV2 = NdotV2; + retval.NdotL2 = NdotL2; + retval.lambdaV_plus_one = lambdaV_plus_one; + return retval; + } + + static this_t create(T a2, T NdotV, T NdotV2, T NdotL, T NdotL2) // ggx + { + this_t retval; + retval.a2 = a2; + retval.NdotV = NdotV; + retval.NdotV2 = NdotV2; + retval.NdotL = NdotL; + retval.NdotL2 = NdotL2; + retval.one_minus_a2 = 1.0 - a2; + return retval; + } + + T a2; + T NdotV; + T NdotL; + T NdotV2; + T NdotL2; + T lambdaV_plus_one; + T one_minus_a2; +}; + +template) +struct SAnisotropicParams +{ + using this_t = SAnisotropicParams; + + static this_t create(T ax2, T ay2, T TdotV2, T BdotV2, T NdotV2, T TdotL2, T BdotL2, T NdotL2, T lambdaV_plus_one) // beckmann + { + this_t retval; + retval.ax2 = ax2; + 
retval.ay2 = ay2; + retval.TdotV2 = TdotV2; + retval.BdotV2 = BdotV2; + retval.NdotV2 = NdotV2; + retval.TdotL2 = TdotL2; + retval.BdotL2 = BdotL2; + retval.NdotL2 = NdotL2; + retval.lambdaV_plus_one = lambdaV_plus_one; + return retval; + } + + static this_t create(T ax2, T ay2, T NdotV, T TdotV2, T BdotV2, T NdotV2, T NdotL, T TdotL2, T BdotL2, T NdotL2) // ggx + { + this_t retval; + retval.ax2 = ax2; + retval.ay2 = ay2; + retval.NdotL = NdotL; + retval.NdotV = NdotV; + retval.TdotV2 = TdotV2; + retval.BdotV2 = BdotV2; + retval.NdotV2 = NdotV2; + retval.TdotL2 = TdotL2; + retval.BdotL2 = BdotL2; + retval.NdotL2 = NdotL2; + return retval; + } + + T ax2; + T ay2; + T NdotV; + T NdotL; + T TdotV2; + T BdotV2; + T NdotV2; + T TdotL2; + T BdotL2; + T NdotL2; + T lambdaV_plus_one; +}; + + +// beckmann +template) +struct Beckmann +{ + using scalar_type = T; + + scalar_type G1(scalar_type lambda) + { + return 1.0 / (1.0 + lambda); + } + + scalar_type C2(scalar_type NdotX2, scalar_type a2) + { + return NdotX2 / (a2 * (1.0 - NdotX2)); + } + + scalar_type C2(scalar_type TdotX2, scalar_type BdotX2, scalar_type NdotX2, scalar_type ax2, scalar_type ay2) + { + return NdotX2 / (TdotX2 * ax2 + BdotX2 * ay2); + } + + scalar_type Lambda(scalar_type c2) + { + scalar_type c = sqrt(c2); + scalar_type nom = 1.0 - 1.259 * c + 0.396 * c2; + scalar_type denom = 2.181 * c2 + 3.535 * c; + return nbl::hlsl::mix(0.0, nom / denom, c < 1.6); + } + + scalar_type Lambda(scalar_type NdotX2, scalar_type a2) + { + return Lambda(C2(NdotX2, a2)); + } + + scalar_type Lambda(scalar_type TdotX2, scalar_type BdotX2, scalar_type NdotX2, scalar_type ax2, scalar_type ay2) + { + return Lambda(C2(TdotX2, BdotX2, NdotX2, ax2, ay2)); + } + + scalar_type correlated(SIsotropicParams params) + { + scalar_type c2 = C2(params.NdotV2, params.a2); + scalar_type L_v = Lambda(c2); + c2 = C2(params.NdotL2, params.a2); + scalar_type L_l = Lambda(c2); + return G1(L_v + L_l); + } + + scalar_type correlated(SAnisotropicParams 
params) + { + scalar_type c2 = C2(params.TdotV2, params.BdotV2, params.NdotV2, params.ax2, params.ay2); + scalar_type L_v = Lambda(c2); + c2 = C2(params.TdotL2, params.BdotL2, params.NdotL2, params.ax2, params.ay2); + scalar_type L_l = Lambda(c2); + return G1(L_v + L_l); + } + + scalar_type G2_over_G1(SIsotropicParams params) + { + scalar_type lambdaL = Lambda(params.NdotL2, params.a2); + return params.lambdaV_plus_one / (params.lambdaV_plus_one + lambdaL); + } + + scalar_type G2_over_G1(SAnisotropicParams params) + { + scalar_type c2 = C2(params.TdotL2, params.BdotL2, params.NdotL2, params.ax2, params.ay2); + scalar_type lambdaL = Lambda(c2); + return params.lambdaV_plus_one / (params.lambdaV_plus_one + lambdaL); + } +}; + + +// ggx +template) +struct GGX +{ + using scalar_type = T; + + scalar_type devsh_part(scalar_type NdotX2, scalar_type a2, scalar_type one_minus_a2) + { + return sqrt(a2 + one_minus_a2 * NdotX2); + } + + scalar_type devsh_part(scalar_type TdotX2, scalar_type BdotX2, scalar_type NdotX2, scalar_type ax2, scalar_type ay2) + { + return sqrt(TdotX2 * ax2 + BdotX2 * ay2 + NdotX2); + } + + scalar_type G1_wo_numerator(scalar_type NdotX, scalar_type NdotX2, scalar_type a2, scalar_type one_minus_a2) + { + return 1.0 / (NdotX + devsh_part(NdotX2,a2,one_minus_a2)); + } + + scalar_type G1_wo_numerator(scalar_type NdotX, scalar_type TdotX2, scalar_type BdotX2, scalar_type NdotX2, scalar_type ax2, scalar_type ay2) + { + return 1.0 / (NdotX + devsh_part(TdotX2, BdotX2, NdotX2, ax2, ay2)); + } + + scalar_type G1_wo_numerator(scalar_type NdotX, scalar_type devsh_part) + { + return 1.0 / (NdotX + devsh_part); + } + + scalar_type correlated_wo_numerator(SIsotropicParams params) + { + scalar_type Vterm = params.NdotL * devsh_part(params.NdotV2, params.a2, params.one_minus_a2); + scalar_type Lterm = params.NdotV * devsh_part(params.NdotL2, params.a2, params.one_minus_a2); + return 0.5 / (Vterm + Lterm); + } + + scalar_type correlated_wo_numerator(SAnisotropicParams 
params) + { + scalar_type Vterm = params.NdotL * devsh_part(params.TdotV2, params.BdotV2, params.NdotV2, params.ax2, params.ay2); + scalar_type Lterm = params.NdotV * devsh_part(params.TdotL2, params.BdotL2, params.NdotL2, params.ax2, params.ay2); + return 0.5 / (Vterm + Lterm); + } + + scalar_type G2_over_G1(SIsotropicParams params) + { + scalar_type devsh_v = devsh_part(params.NdotV2, params.a2, params.one_minus_a2); + scalar_type G2_over_G1 = params.NdotL * (devsh_v + params.NdotV); // alternative `Vterm+NdotL*NdotV /// NdotL*NdotV could come as a parameter + G2_over_G1 /= params.NdotV * devsh_part(params.NdotL2, params.a2, params.one_minus_a2) + params.NdotL * devsh_v; + + return G2_over_G1; + } + + scalar_type G2_over_G1(SAnisotropicParams params) + { + scalar_type devsh_v = devsh_part(params.TdotV2, params.BdotV2, params.NdotV2, params.ax2, params.ay2); + scalar_type G2_over_G1 = params.NdotL * (devsh_v + params.NdotV); + G2_over_G1 /= params.NdotV * devsh_part(params.TdotL2, params.BdotL2, params.NdotL2, params.ax2, params.ay2) + params.NdotL * devsh_v; + + return G2_over_G1; + } + +}; + +} +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/bxdf/ndf/beckmann.hlsl b/include/nbl/builtin/hlsl/bxdf/ndf/beckmann.hlsl index 1406bc8d4f..c719bbfd4e 100644 --- a/include/nbl/builtin/hlsl/bxdf/ndf/beckmann.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/ndf/beckmann.hlsl @@ -339,8 +339,8 @@ struct Beckmann if (isInfinity) { quant_type dmq; - dmq.microfacetMeasure = scalar_type(0.0); - dmq.projectedLightMeasure = scalar_type(0.0); + dmq.microfacetMeasure = bit_cast(numeric_limits::infinity); + dmq.projectedLightMeasure = bit_cast(numeric_limits::infinity); return dmq; } scalar_type dg1 = D / (scalar_type(1.0) + query.getLambdaV()); diff --git a/include/nbl/builtin/hlsl/bxdf/ndf/ggx.hlsl b/include/nbl/builtin/hlsl/bxdf/ndf/ggx.hlsl index 58f697e19c..c64f6e3b84 100644 --- a/include/nbl/builtin/hlsl/bxdf/ndf/ggx.hlsl +++ 
b/include/nbl/builtin/hlsl/bxdf/ndf/ggx.hlsl @@ -279,8 +279,8 @@ struct GGX quant_type dmq; if (isInfinity) { - dmq.microfacetMeasure = scalar_type(0.0); - dmq.projectedLightMeasure = scalar_type(0.0); + dmq.microfacetMeasure = bit_cast(numeric_limits::infinity); + dmq.projectedLightMeasure = bit_cast(numeric_limits::infinity); return dmq; } @@ -337,8 +337,8 @@ struct GGX if (isInfinity) { quant_type dmq; - dmq.microfacetMeasure = scalar_type(0.0); - dmq.projectedLightMeasure = scalar_type(0.0); + dmq.microfacetMeasure = bit_cast(numeric_limits::infinity); + dmq.projectedLightMeasure = bit_cast(numeric_limits::infinity); return dmq; } dg *= correlated_wo_numerator(query, _sample, interaction, cache); diff --git a/include/nbl/builtin/hlsl/bxdf/reflection.hlsl b/include/nbl/builtin/hlsl/bxdf/reflection.hlsl index c5d4b019c8..dd2e4e60a8 100644 --- a/include/nbl/builtin/hlsl/bxdf/reflection.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/reflection.hlsl @@ -16,9 +16,752 @@ namespace nbl namespace hlsl { -// After Clang-HLSL introduces https://en.cppreference.com/w/cpp/language/namespace_alias -// namespace brdf = bxdf::reflection; +// still need these? 
+template && surface_interactions::Isotropic && surface_interactions::Anisotropic && ray_dir_info::Basic && is_scalar_v) +LightSample cos_generate(NBL_CONST_REF_ARG(Iso) interaction) +{ + return LightSample(interaction.V.reflect(interaction.N,interaction.NdotV),interaction.NdotV,interaction.N); +} +template && surface_interactions::Isotropic && surface_interactions::Anisotropic && ray_dir_info::Basic && is_scalar_v) +LightSample cos_generate(NBL_CONST_REF_ARG(Aniso) interaction) +{ + return LightSample(interaction.V.reflect(interaction.N,interaction.NdotV),interaction.NdotV,interaction.T,interaction.B,interaction.N); +} + +// for information why we don't check the relation between `V` and `L` or `N` and `H`, see comments for `nbl::hlsl::transmission::cos_quotient_and_pdf` +template && is_floating_point_v) +quotient_and_pdf cos_quotient_and_pdf() +{ + return quotient_and_pdf::create(SpectralBins(1.f), numeric_limits::infinity); +} + +// basic bxdfs +template && surface_interactions::Isotropic && surface_interactions::Anisotropic) +struct SLambertianBxDF +{ + using this_t = SLambertianBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using isotropic_type = Iso; + using anisotropic_type = Aniso; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using params_t = SBxDFParams; + + static this_t create() + { + this_t retval; + // nothing here, just keeping in convention with others + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + return create(); + } + + void init(SBxDFCreationParams params) + { + // do nothing + } + + scalar_type __eval_pi_factored_out(scalar_type maxNdotL) + { + return maxNdotL; + } + + scalar_type eval(NBL_CONST_REF_ARG(params_t) params) + { + return __eval_pi_factored_out(params.NdotL) * numbers::inv_pi; + } + + sample_type generate_wo_clamps(anisotropic_type interaction, 
vector u) + { + ray_dir_info_type L; + L.direction = projected_hemisphere_generate(u); + return sample_type::createFromTangentSpace(interaction.getTangentSpaceV(), L, interaction.getFromTangentSpace()); + } + + sample_type generate(anisotropic_type interaction, vector u) + { + return generate_wo_clamps(interaction, u); + } + + scalar_type pdf(NBL_CONST_REF_ARG(params_t) params) + { + return projected_hemisphere_pdf(params.NdotL); + } + + quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(params_t) params) + { + scalar_type _pdf; + scalar_type q = projected_hemisphere_quotient_and_pdf(_pdf, params.NdotL); + return quotient_pdf_type::create((spectral_type)(q), _pdf); + } +}; + + +template && surface_interactions::Isotropic && surface_interactions::Anisotropic) +struct SOrenNayarBxDF +{ + using this_t = SOrenNayarBxDF; + using scalar_type = typename LightSample::scalar_type; + using vector2_type = vector; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + + using isotropic_type = Iso; + using anisotropic_type = Aniso; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using params_t = SBxDFParams; + + static this_t create(scalar_type A) + { + this_t retval; + retval.A = A; + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + return create(params.A.x); + } + + void init(SBxDFCreationParams params) + { + A = params.A.x; + } + + scalar_type __rec_pi_factored_out_wo_clamps(scalar_type VdotL, scalar_type maxNdotL, scalar_type maxNdotV) + { + scalar_type A2 = A * 0.5; + vector2_type AB = vector2_type(1.0, 0.0) + vector2_type(-0.5, 0.45) * vector2_type(A2, A2) / vector2_type(A2 + 0.33, A2 + 0.09); + scalar_type C = 1.0 / max(maxNdotL, maxNdotV); + + scalar_type cos_phi_sin_theta = max(VdotL - maxNdotL * maxNdotV, 0.0); + return (AB.x + AB.y * cos_phi_sin_theta * C); + } + + scalar_type eval(params_t params) + { + return params.NdotL * numbers::inv_pi * 
__rec_pi_factored_out_wo_clamps(params.VdotL, params.NdotL, params.NdotV); + } + + sample_type generate_wo_clamps(anisotropic_type interaction, vector2_type u) + { + ray_dir_info_type L; + L.direction = projected_hemisphere_generate(u); + return sample_type::createFromTangentSpace(interaction.getTangentSpaceV(), L, interaction.getFromTangentSpace()); + } + + sample_type generate(anisotropic_type interaction, vector2_type u) + { + return generate_wo_clamps(interaction, u); + } + + scalar_type pdf(params_t params) + { + return projected_hemisphere_pdf(params.NdotL); + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + scalar_type _pdf; + projected_hemisphere_quotient_and_pdf(_pdf, params.NdotL); + scalar_type q = __rec_pi_factored_out_wo_clamps(params.VdotL, params.NdotL, params.NdotV); + return quotient_pdf_type::create((spectral_type)(q), _pdf); + } + + scalar_type A; +}; + + +// microfacet bxdfs + +// do not use, not tested, also shit +template && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) +struct SBlinnPhongBxDF +{ + using this_t = SBlinnPhongBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector2_type = vector; + using vector3_type = vector; + using matrix2x3_type = matrix; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + static this_t create(vector2_type n, spectral_type ior0, spectral_type ior1) + { + this_t retval; + retval.n = n; + retval.ior0 = ior0; + retval.ior1 = ior1; + return retval; + } + + template + static T phong_exp_to_alpha2(T n) + { + return 2.0 / (n + 2.0); + } + + template + static T alpha2_to_phong_exp(T a2) + { + return 2.0 / a2 
- 2.0; + } + + template // this or specialize? + scalar_type __eval_DG_wo_clamps(params_t params, vector2_type a2) + { + if (aniso) + { + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(params.NdotH, 1.0 / (1.0 - params.NdotH2), params.TdotH2, params.BdotH2, n.x, n.y); + ndf::BlinnPhong blinn_phong; + scalar_type DG = blinn_phong(ndfparams); + if (any >(a2 > (vector2_type)numeric_limits::min)) + { + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(a2.x, a2.y, params.TdotV2, params.BdotV2, params.NdotV2, params.TdotL2, params.BdotL2, params.NdotL2, 0); + smith::Beckmann beckmann; + DG *= beckmann.correlated(smithparams); + } + return DG; + } + else + { + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(n, params.NdotH, params.NdotH2); + ndf::BlinnPhong blinn_phong; + scalar_type NG = blinn_phong(ndfparams); + if (any >(a2 > (vector2_type)numeric_limits::min)) + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(a2.x, params.NdotV2, params.NdotL2, 0); + smith::Beckmann beckmann; + NG *= beckmann.correlated(smithparams); + } + return NG; + } + } + + template + vector3_type __eval_wo_clamps(params_t params) + { + scalar_type scalar_part; + if (aniso) + { + vector2_type a2 = phong_exp_to_alpha2(n); + scalar_part = __eval_DG_wo_clamps(params, a2); + } + else + { + vector2_type a2 = (vector2_type)phong_exp_to_alpha2(n); + scalar_part = __eval_DG_wo_clamps(params, a2); + } + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT> microfacet_transform = ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT>::create(scalar_part, params.NdotV); + return fresnelConductor(ior0, ior1, params.VdotH) * microfacet_transform(); + } + + vector3_type eval(sample_type _sample, isotropic_type interaction, isocache_type cache) + { + if (interaction.NdotV > numeric_limits::min) + { + params_t params = params_t::template create(_sample, interaction, cache); + return __eval_wo_clamps(params); + } + 
else + return (vector3_type)0.0; + } + + vector3_type eval(sample_type _sample, anisotropic_type interaction, anisocache_type cache) + { + if (interaction.NdotV > numeric_limits::min) + { + params_t params = params_t::template create(_sample, interaction, cache); + return __eval_wo_clamps(params); + } + else + return (vector3_type)0.0; + } + + vector3_type generate(vector2_type u, scalar_type n) + { + scalar_type phi = 2.0 * numbers::pi * u.y; + scalar_type cosTheta = pow(u.x, 1.0/(n+1.0)); + scalar_type sinTheta = sqrt(1.0 - cosTheta * cosTheta); + scalar_type cosPhi = cos(phi); + scalar_type sinPhi = sin(phi); + return vector3_type(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta); + } + + sample_type generate(anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + { + const vector3_type H = generate(u, n.x); + const vector3_type localV = interaction.getTangentSpaceV(); + + cache = anisocache_type::create(localV, H); + ray_dir_info_type localL; + localL.direction = math::reflect(localV, H, cache.VdotH); + + return sample_type::createFromTangentSpace(localV, localL, interaction.getFromTangentSpace()); + } + + // where pdf? 
+ + vector2_type n; + spectral_type ior0, ior1; +}; + +template && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) +struct SBeckmannBxDF +{ + using this_t = SBeckmannBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector2_type = vector; + using vector3_type = vector; + using matrix2x3_type = matrix; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + // iso + static this_t create(scalar_type A, spectral_type ior0, spectral_type ior1) + { + this_t retval; + retval.A = vector2_type(A,A); + retval.ior0 = ior0; + retval.ior1 = ior1; + return retval; + } + + // aniso + static this_t create(scalar_type ax, scalar_type ay, spectral_type ior0, spectral_type ior1) + { + this_t retval; + retval.A = vector2_type(ax,ay); + retval.ior0 = ior0; + retval.ior1 = ior1; + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + if (params.is_aniso) + return create(params.A.x, params.A.y, params.ior0, params.ior1); + else + return create(params.A.x, params.ior0, params.ior1); + } + + void init(SBxDFCreationParams params) + { + A = params.A; + ior0 = params.ior0; + ior1 = params.ior1; + } + + scalar_type __eval_DG_wo_clamps(params_t params) + { + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, ax2, ay2, params.TdotH2, params.BdotH2, params.NdotH2); + ndf::Beckmann beckmann_ndf; + scalar_type NG = beckmann_ndf(ndfparams); + if (any >(A > (vector2_type)numeric_limits::min)) + { + smith::SAnisotropicParams smithparams = 
smith::SAnisotropicParams::create(ax2, ay2, params.TdotV2, params.BdotV2, params.NdotV2, params.TdotL2, params.BdotL2, params.NdotL2, 0); + smith::Beckmann beckmann_smith; + NG *= beckmann_smith.correlated(smithparams); + } + return NG; + } + else + { + scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + ndf::Beckmann beckmann_ndf; + scalar_type NG = beckmann_ndf(ndfparams); + if (a2 > numeric_limits::min) + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(a2, params.NdotV2, params.NdotL2, 0); + smith::Beckmann beckmann_smith; + NG *= beckmann_smith.correlated(smithparams); + } + return NG; + } + } + + spectral_type eval(params_t params) + { + if (params.uNdotV > numeric_limits::min) + { + scalar_type scalar_part = __eval_DG_wo_clamps(params); + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT> microfacet_transform = ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT>::create(scalar_part, params.uNdotV); + return fresnelConductor(ior0, ior1, params.VdotH) * microfacet_transform(); + } + else + return (spectral_type)0.0; + } + + vector3_type __generate(vector3_type localV, vector2_type u) + { + //stretch + vector3_type V = nbl::hlsl::normalize(vector3_type(A.x * localV.x, A.y * localV.y, localV.z)); + + vector2_type slope; + if (V.z > 0.9999)//V.z=NdotV=cosTheta in tangent space + { + scalar_type r = sqrt(-log(1.0 - u.x)); + scalar_type sinPhi = sin(2.0 * numbers::pi * u.y); + scalar_type cosPhi = cos(2.0 * numbers::pi * u.y); + slope = (vector2_type)r * vector2_type(cosPhi,sinPhi); + } + else + { + scalar_type cosTheta = V.z; + scalar_type sinTheta = sqrt(1.0 - cosTheta * cosTheta); + scalar_type tanTheta = sinTheta / cosTheta; + scalar_type cotTheta = 1.0 / tanTheta; + + scalar_type a = -1.0; + scalar_type c = erf(cosTheta); + scalar_type sample_x = max(u.x, 1.0e-6); + scalar_type theta = acos(cosTheta); + scalar_type fit = 1.0 + theta * (-0.876 
+ theta * (0.4265 - 0.0594*theta)); + scalar_type b = c - (1.0 + c) * pow(1.0-sample_x, fit); + + scalar_type normalization = 1.0 / (1.0 + c + numbers::inv_sqrtpi * tanTheta * exp(-cosTheta*cosTheta)); + + const int ITER_THRESHOLD = 10; + const float MAX_ACCEPTABLE_ERR = 1.0e-5; + int it = 0; + float value=1000.0; + while (++it < ITER_THRESHOLD && nbl::hlsl::abs(value) > MAX_ACCEPTABLE_ERR) + { + if (!(b >= a && b <= c)) + b = 0.5 * (a + c); + + float invErf = erfInv(b); + value = normalization * (1.0 + b + numbers::inv_sqrtpi * tanTheta * exp(-invErf * invErf)) - sample_x; + float derivative = normalization * (1.0 - invErf * cosTheta); + + if (value > 0.0) + c = b; + else + a = b; + + b -= value/derivative; + } + // TODO: investigate if we can replace these two erf^-1 calls with a box muller transform + slope.x = erfInv(b); + slope.y = erfInv(2.0 * max(u.y, 1.0e-6) - 1.0); + } + + scalar_type sinTheta = sqrt(1.0 - V.z*V.z); + scalar_type cosPhi = sinTheta==0.0 ? 1.0 : clamp(V.x/sinTheta, -1.0, 1.0); + scalar_type sinPhi = sinTheta==0.0 ? 
0.0 : clamp(V.y/sinTheta, -1.0, 1.0); + //rotate + scalar_type tmp = cosPhi*slope.x - sinPhi*slope.y; + slope.y = sinPhi*slope.x + cosPhi*slope.y; + slope.x = tmp; + + //unstretch + slope = vector2_type(A.x,A.y)*slope; + + return nbl::hlsl::normalize(vector3_type(-slope, 1.0)); + } + + sample_type generate(anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + { + const vector3_type localV = interaction.getTangentSpaceV(); + const vector3_type H = __generate(localV, u); + + cache = anisocache_type::create(localV, H); + ray_dir_info_type localL; + localL.direction = math::reflect(localV, H, cache.VdotH); + + return sample_type::createFromTangentSpace(localV, localL, interaction.getFromTangentSpace()); + } + + scalar_type pdf(params_t params, NBL_REF_ARG(scalar_type) onePlusLambda_V) + { + scalar_type ndf, lambda; + if (params.is_aniso) + { + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, A.x*A.x, A.y*A.y, params.TdotH2, params.BdotH2, params.NdotH2); + ndf::Beckmann beckmann_ndf; + ndf = beckmann_ndf(ndfparams); + + smith::Beckmann beckmann_smith; + const scalar_type c2 = beckmann_smith.C2(params.TdotV2, params.BdotV2, params.NdotV2, A.x, A.y); + lambda = beckmann_smith.Lambda(c2); + } + else + { + scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + ndf::Beckmann beckmann_ndf; + ndf = beckmann_ndf(ndfparams); + + smith::Beckmann beckmann_smith; + lambda = beckmann_smith.Lambda(params.NdotV2, a2); + } + + return smith::VNDF_pdf_wo_clamps >(ndf, lambda, params.uNdotV, onePlusLambda_V); + } + + scalar_type pdf(params_t params) + { + scalar_type dummy; + return pdf(params, dummy); + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + scalar_type onePlusLambda_V; + scalar_type _pdf = pdf(params, onePlusLambda_V); + + smith::Beckmann beckmann_smith; + spectral_type quo = (spectral_type)0.0; + if (params.uNdotL > 
numeric_limits::min && params.uNdotV > numeric_limits::min) + { + scalar_type G2_over_G1; + if (params.is_aniso) + { + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(A.x*A.x, A.y*A.y, params.TdotV2, params.BdotV2, params.NdotV2, params.TdotL2, params.BdotL2, params.NdotL2, onePlusLambda_V); + G2_over_G1 = beckmann_smith.G2_over_G1(smithparams); + } + else + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(A.x*A.x, params.NdotV2, params.NdotL2, onePlusLambda_V); + G2_over_G1 = beckmann_smith.G2_over_G1(smithparams); + } + const spectral_type reflectance = fresnelConductor(ior0, ior1, params.VdotH); + quo = reflectance * G2_over_G1; + } + + return quotient_pdf_type::create(quo, _pdf); + } + + vector2_type A; + spectral_type ior0, ior1; +}; + +template && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) +struct SGGXBxDF +{ + using this_t = SGGXBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector2_type = vector; + using vector3_type = vector; + using matrix2x3_type = matrix; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + // iso + static this_t create(scalar_type A, spectral_type ior0, spectral_type ior1) + { + this_t retval; + retval.A = vector2_type(A,A); + retval.ior0 = ior0; + retval.ior1 = ior1; + return retval; + } + + // aniso + static this_t create(scalar_type ax, scalar_type ay, spectral_type ior0, spectral_type ior1) + { + this_t retval; + retval.A = vector2_type(ax,ay); + retval.ior0 = ior0; + retval.ior1 = ior1; + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + if 
(params.is_aniso) + return create(params.A.x, params.A.y, params.ior0, params.ior1); + else + return create(params.A.x, params.ior0, params.ior1); + } + + void init(SBxDFCreationParams params) + { + A = params.A; + ior0 = params.ior0; + ior1 = params.ior1; + } + + scalar_type __eval_DG_wo_clamps(params_t params) + { + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, ax2, ay2, params.TdotH2, params.BdotH2, params.NdotH2); + ndf::GGX ggx_ndf; + scalar_type NG = ggx_ndf(ndfparams); + if (any >(A > (vector2_type)numeric_limits::min)) + { + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(ax2, ay2, params.NdotV, params.TdotV2, params.BdotV2, params.NdotV2, params.NdotL, params.TdotL2, params.BdotL2, params.NdotL2); + smith::GGX ggx_smith; + NG *= ggx_smith.correlated_wo_numerator(smithparams); + } + return NG; + } + else + { + scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + ndf::GGX ggx_ndf; + scalar_type NG = ggx_ndf(ndfparams); + if (a2 > numeric_limits::min) + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(a2, params.NdotV, params.NdotV2, params.NdotL, params.NdotL2); + smith::GGX ggx_smith; + NG *= ggx_smith.correlated_wo_numerator(smithparams); + } + return NG; + } + } + + spectral_type eval(params_t params) + { + if (params.uNdotL > numeric_limits::min && params.uNdotV > numeric_limits::min) + { + scalar_type scalar_part = __eval_DG_wo_clamps(params); + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT> microfacet_transform = ndf::microfacet_to_light_measure_transform,ndf::REFLECT_BIT>::create(scalar_part, params.NdotL); + return fresnelConductor(ior0, ior1, params.VdotH) * microfacet_transform(); + } + else + return (spectral_type)0.0; + } + + vector3_type __generate(vector3_type localV, vector2_type 
u) + { + vector3_type V = nbl::hlsl::normalize(vector3_type(A.x*localV.x, A.y*localV.y, localV.z));//stretch view vector so that we're sampling as if roughness=1.0 + + scalar_type lensq = V.x*V.x + V.y*V.y; + vector3_type T1 = lensq > 0.0 ? vector3_type(-V.y, V.x, 0.0) * rsqrt(lensq) : vector3_type(1.0,0.0,0.0); + vector3_type T2 = cross(V,T1); + + scalar_type r = sqrt(u.x); + scalar_type phi = 2.0 * numbers::pi * u.y; + scalar_type t1 = r * cos(phi); + scalar_type t2 = r * sin(phi); + scalar_type s = 0.5 * (1.0 + V.z); + t2 = (1.0 - s)*sqrt(1.0 - t1*t1) + s*t2; + + //reprojection onto hemisphere + //TODO try it wothout the max(), not sure if -t1*t1-t2*t2>-1.0 + vector3_type H = t1*T1 + t2*T2 + sqrt(max(0.0, 1.0-t1*t1-t2*t2))*V; + //unstretch + return nbl::hlsl::normalize(vector3_type(A.x*H.x, A.y*H.y, H.z)); + } + + sample_type generate(anisotropic_type interaction, vector2_type u, NBL_REF_ARG(anisocache_type) cache) + { + const vector3_type localV = interaction.getTangentSpaceV(); + const vector3_type H = __generate(localV, u); + + cache = anisocache_type::create(localV, H); + ray_dir_info_type localL; + localL.direction = math::reflect(localV, H, cache.VdotH); + + return sample_type::createFromTangentSpace(localV, localL, interaction.getFromTangentSpace()); + } + + scalar_type pdf(params_t params) + { + scalar_type ndf, G1_over_2NdotV; + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, ax2, ay2, params.TdotH2, params.BdotH2, params.NdotH2); + ndf::GGX ggx_ndf; + ndf = ggx_ndf(ndfparams); + + smith::GGX ggx_smith; + const scalar_type devsh_v = ggx_smith.devsh_part(params.TdotV2, params.BdotV2, params.NdotV2, ax2, ay2); + G1_over_2NdotV = ggx_smith.G1_wo_numerator(params.uNdotV, devsh_v); + } + else + { + const scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + 
ndf::GGX ggx_ndf; + ndf = ggx_ndf(ndfparams); + + smith::GGX ggx_smith; + const scalar_type devsh_v = ggx_smith.devsh_part(params.NdotV2, a2, 1.0-a2); + G1_over_2NdotV = ggx_smith.G1_wo_numerator(params.uNdotV, devsh_v); + } + return smith::VNDF_pdf_wo_clamps(ndf, G1_over_2NdotV); + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + scalar_type _pdf = pdf(params); + + spectral_type quo = (spectral_type)0.0; + if (params.uNdotL > numeric_limits::min && params.uNdotV > numeric_limits::min) + { + scalar_type G2_over_G1; + smith::GGX ggx_smith; + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(ax2, ay2, params.uNdotV, params.TdotV2, params.BdotV2, params.NdotV2, params.uNdotL, params.TdotL2, params.BdotL2, params.NdotL2); + G2_over_G1 = ggx_smith.G2_over_G1(smithparams); + } + else + { + const scalar_type a2 = A.x*A.x; + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(a2, params.uNdotV, params.NdotV2, params.uNdotL, params.NdotL2); + G2_over_G1 = ggx_smith.G2_over_G1(smithparams); + } + const spectral_type reflectance = fresnelConductor(ior0, ior1, params.VdotH); + quo = reflectance * G2_over_G1; + } + + return quotient_pdf_type::create(quo, _pdf); + } + + vector2_type A; + spectral_type ior0, ior1; +}; + +} +} } } diff --git a/include/nbl/builtin/hlsl/bxdf/reflection/beckmann.hlsl b/include/nbl/builtin/hlsl/bxdf/reflection/beckmann.hlsl index f37d0d9fd8..cb7743e02d 100644 --- a/include/nbl/builtin/hlsl/bxdf/reflection/beckmann.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/reflection/beckmann.hlsl @@ -27,23 +27,7 @@ using SBeckmannAnisotropic = SCookTorrance -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BRDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; - -template -struct 
traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BRDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/reflection/ggx.hlsl b/include/nbl/builtin/hlsl/bxdf/reflection/ggx.hlsl index 049480afab..0f49d0be43 100644 --- a/include/nbl/builtin/hlsl/bxdf/reflection/ggx.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/reflection/ggx.hlsl @@ -27,23 +27,7 @@ using SGGXAnisotropic = SCookTorrance -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BRDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; - -template -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BRDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/reflection/iridescent.hlsl b/include/nbl/builtin/hlsl/bxdf/reflection/iridescent.hlsl index 07762d1298..a6120233bb 100644 --- a/include/nbl/builtin/hlsl/bxdf/reflection/iridescent.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/reflection/iridescent.hlsl @@ -16,18 +16,11 @@ namespace reflection { template -using SIridescent = SCookTorrance, fresnel::Iridescent >; +using SIridescent = SCookTorrance, fresnel::Iridescent >; } -template -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BRDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/transmission.hlsl 
b/include/nbl/builtin/hlsl/bxdf/transmission.hlsl index b5b6e101c1..4087b715c3 100644 --- a/include/nbl/builtin/hlsl/bxdf/transmission.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/transmission.hlsl @@ -17,9 +17,654 @@ namespace nbl namespace hlsl { -// After Clang-HLSL introduces https://en.cppreference.com/w/cpp/language/namespace_alias -// namespace bsdf = bxdf::transmission; +template && surface_interactions::Isotropic && surface_interactions::Anisotropic && ray_dir_info::Basic && is_scalar_v) +LightSample cos_generate(NBL_CONST_REF_ARG(Iso) interaction) +{ + return LightSample(interaction.V.transmit(),-1.f,interaction.N); +} +template && surface_interactions::Isotropic && surface_interactions::Anisotropic && ray_dir_info::Basic && is_scalar_v) +LightSample cos_generate(NBL_CONST_REF_ARG(Aniso) interaction) +{ + return LightSample(interaction.V.transmit(),-1.f,interaction.T,interaction.B,interaction.N); +} + +// Why don't we check that the incoming and outgoing directions equal each other +// (or similar for other delta distributions such as reflect, or smooth [thin] dielectrics): +// - The `quotient_and_pdf` functions are meant to be used with MIS and RIS +// - Our own generator can never pick an improbable path, so no checking necessary +// - For other generators the estimator will be `f_BSDF*f_Light*f_Visibility*clampedCos(theta)/(1+(p_BSDF^alpha+p_otherNonChosenGenerator^alpha+...)/p_ChosenGenerator^alpha)` +// therefore when `p_BSDF` equals `nbl_glsl_FLT_INF` it will drive the overall MIS estimator for the other generators to 0 so no checking necessary +template && is_floating_point_v) +quotient_and_pdf cos_quotient_and_pdf() +{ + return quotient_and_pdf::create(SpectralBins(1.f), numeric_limits::infinity); +} + +// basic bxdf +template && surface_interactions::Isotropic && surface_interactions::Anisotropic) +struct SLambertianBxDF +{ + using this_t = SLambertianBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename 
LightSample::ray_dir_info_type; + using isotropic_type = Iso; + using anisotropic_type = Aniso; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using params_t = SBxDFParams; + + static this_t create() + { + this_t retval; + // nothing here, just keeping convention with others + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + return create(); + } + + void init(SBxDFCreationParams params) + { + // do nothing + } + + scalar_type __eval_pi_factored_out(scalar_type absNdotL) + { + return absNdotL; + } + + scalar_type eval(params_t params) + { + return __eval_pi_factored_out(params.NdotL) * numbers::inv_pi * 0.5; + } + + sample_type generate_wo_clamps(anisotropic_type interaction, vector u) + { + ray_dir_info_type L; + L.direction = projected_sphere_generate(u); + return sample_type::createFromTangentSpace(interaction.getTangentSpaceV(), L, interaction.getFromTangentSpace()); + } + + sample_type generate(anisotropic_type interaction, vector u) + { + return generate_wo_clamps(interaction, u); + } + + scalar_type pdf(params_t params) + { + return projected_sphere_pdf(params.NdotL); + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + scalar_type _pdf; + scalar_type q = projected_sphere_quotient_and_pdf(_pdf, params.NdotL); + return quotient_pdf_type::create((spectral_type)(q), _pdf); + } +}; + + +// microfacet bxdfs +template // NBL_FUNC_REQUIRES(Sample && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) // dxc won't let me put this in +struct SSmoothDielectricBxDF; + +template +struct SSmoothDielectricBxDF +{ + using this_t = SSmoothDielectricBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector3_type = vector; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename 
AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + static this_t create(scalar_type eta) + { + this_t retval; + retval.eta = eta; + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + return create(params.eta); + } + + void init(SBxDFCreationParams params) + { + eta = params.eta; + } + + spectral_type eval(params_t params) + { + return (spectral_type)0; + } + + sample_type __generate_wo_clamps(vector3_type V, vector3_type T, vector3_type B, vector3_type N, bool backside, scalar_type NdotV, scalar_type absNdotV, scalar_type NdotV2, NBL_REF_ARG(vector3_type) u, scalar_type rcpOrientedEta, scalar_type orientedEta2, scalar_type rcpOrientedEta2, NBL_REF_ARG(bool) transmitted) + { + const scalar_type reflectance = fresnelDielectric_common(orientedEta2, absNdotV); + + scalar_type rcpChoiceProb; + transmitted = math::partitionRandVariable(reflectance, u.z, rcpChoiceProb); + + ray_dir_info_type L; + L.direction = math::reflectRefract(transmitted, V, N, backside, NdotV, NdotV2, rcpOrientedEta, rcpOrientedEta2); + return sample_type::create(L, nbl::hlsl::dot(V, L.direction), T, B, N); + } + + sample_type generate_wo_clamps(anisotropic_type interaction, NBL_REF_ARG(vector) u) + { + scalar_type orientedEta, rcpOrientedEta; + const bool backside = math::getOrientedEtas(orientedEta, rcpOrientedEta, interaction.NdotV, eta); + bool dummy; + return __generate_wo_clamps(interaction.V.direction, interaction.T, interaction.B, interaction.N, backside, interaction.NdotV, + interaction.NdotV, interaction.NdotV*interaction.NdotV, u, rcpOrientedEta, orientedEta*orientedEta, rcpOrientedEta*rcpOrientedEta, dummy); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector) u) + { + scalar_type orientedEta, rcpOrientedEta; + const bool backside = math::getOrientedEtas(orientedEta, 
rcpOrientedEta, interaction.NdotV, eta); + bool dummy; + return __generate_wo_clamps(interaction.V.direction, interaction.T, interaction.B, interaction.N, backside, interaction.NdotV, + nbl::hlsl::abs(interaction.NdotV), interaction.NdotV*interaction.NdotV, u, rcpOrientedEta, orientedEta*orientedEta, rcpOrientedEta*rcpOrientedEta, dummy); + } + + // eval and pdf return 0 because smooth dielectric/conductor BxDFs are dirac delta distributions, model perfectly specular objects that scatter light to only one outgoing direction + scalar_type pdf(params_t params) + { + return 0; + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + const bool transmitted = isTransmissionPath(params.uNdotV, params.uNdotL); + + scalar_type dummy, rcpOrientedEta; + const bool backside = math::getOrientedEtas(dummy, rcpOrientedEta, params.NdotV, eta); + + const scalar_type _pdf = numeric_limits::infinity; + scalar_type quo = transmitted ? rcpOrientedEta : 1.0; + return quotient_pdf_type::create((spectral_type)(quo), _pdf); + } + + scalar_type eta; +}; + +template +struct SSmoothDielectricBxDF +{ + using this_t = SSmoothDielectricBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector3_type = vector; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + static this_t create(spectral_type eta2, spectral_type luminosityContributionHint) + { + this_t retval; + retval.eta2 = eta2; + retval.luminosityContributionHint = luminosityContributionHint; + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + return create(params.eta2, params.luminosityContributionHint); + } + + void 
init(SBxDFCreationParams params) + { + eta2 = params.eta2; + luminosityContributionHint = params.luminosityContributionHint; + } + + spectral_type eval(params_t params) + { + return (spectral_type)0; + } + + // usually `luminosityContributionHint` would be the Rec.709 luma coefficients (the Y row of the RGB to CIE XYZ matrix) + // its basically a set of weights that determine + // assert(1.0==luminosityContributionHint.r+luminosityContributionHint.g+luminosityContributionHint.b); + // `remainderMetadata` is a variable which the generator function returns byproducts of sample generation that would otherwise have to be redundantly calculated `quotient_and_pdf` + sample_type __generate_wo_clamps(vector3_type V, vector3_type T, vector3_type B, vector3_type N, scalar_type NdotV, scalar_type absNdotV, NBL_REF_ARG(vector3_type) u, spectral_type eta2, spectral_type luminosityContributionHint, NBL_REF_ARG(spectral_type) remainderMetadata) + { + // we will only ever intersect from the outside + const spectral_type reflectance = thindielectricInfiniteScatter(fresnelDielectric_common(eta2,absNdotV)); + + // we are only allowed one choice for the entire ray, so make the probability a weighted sum + const scalar_type reflectionProb = nbl::hlsl::dot(reflectance, luminosityContributionHint); + + scalar_type rcpChoiceProb; + const bool transmitted = math::partitionRandVariable(reflectionProb, u.z, rcpChoiceProb); + remainderMetadata = (transmitted ? ((spectral_type)(1.0) - reflectance) : reflectance) * rcpChoiceProb; + + ray_dir_info_type L; + L.direction = (transmitted ? 
(vector3_type)(0.0) : N * 2.0f * NdotV) - V; + return sample_type::create(L, nbl::hlsl::dot(V, L.direction), T, B, N); + } + + sample_type generate_wo_clamps(anisotropic_type interaction, NBL_REF_ARG(vector) u) + { + vector3_type dummy; + return __generate_wo_clamps(interaction.V.direction, interaction.T, interaction.B, interaction.N, interaction.NdotV, interaction.NdotV, u, eta2, luminosityContributionHint, dummy); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector) u) + { + vector3_type dummy; + return __generate_wo_clamps(interaction.V.direction, interaction.T, interaction.B, interaction.N, interaction.NdotV, nbl::hlsl::abs(interaction.NdotV), u, eta2, luminosityContributionHint, dummy); + } + + scalar_type pdf(params_t params) + { + return 0; + } + + quotient_pdf_type quotient_and_pdf(params_t params) // isotropic + { + const bool transmitted = isTransmissionPath(params.uNdotV, params.uNdotL); + const spectral_type reflectance = thindielectricInfiniteScatter(fresnelDielectric_common(eta2, params.NdotV)); + const spectral_type sampleValue = transmitted ? 
((spectral_type)(1.0) - reflectance) : reflectance; + + const scalar_type sampleProb = nbl::hlsl::dot(sampleValue,luminosityContributionHint); + + const scalar_type _pdf = numeric_limits::infinity; + return quotient_pdf_type::create((spectral_type)(sampleValue / sampleProb), _pdf); + } + + spectral_type eta2; + spectral_type luminosityContributionHint; +}; + +template && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) +struct SBeckmannDielectricBxDF +{ + using this_t = SBeckmannDielectricBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector2_type = vector; + using vector3_type = vector; + using matrix3x3_type = matrix; + using params_t = SBxDFParams; + + using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + static this_t create(scalar_type eta, scalar_type A) + { + this_t retval; + retval.eta = eta; + retval.A = vector2_type(A, A); + return retval; + } + + static this_t create(scalar_type eta, scalar_type ax, scalar_type ay) + { + this_t retval; + retval.eta = eta; + retval.A = vector2_type(ax, ay); + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + if (params.is_aniso) + return create(params.eta, params.A.x, params.A.y); + else + return create(params.eta, params.A.x); + } + + void init(SBxDFCreationParams params) + { + A = params.A; + eta = params.eta; + } + + spectral_type eval(params_t params) + { + scalar_type orientedEta, dummy; + const bool backside = math::getOrientedEtas(orientedEta, dummy, params.VdotH, eta); + const scalar_type orientedEta2 = orientedEta * orientedEta; + const scalar_type VdotHLdotH = params.VdotH * params.LdotH; + const bool transmitted = VdotHLdotH < 
0.0; + + spectral_type dummyior; + reflection::SBeckmannBxDF beckmann; + if (params.is_aniso) + beckmann = reflection::SBeckmannBxDF::create(A.x, A.y, dummyior, dummyior); + else + beckmann = reflection::SBeckmannBxDF::create(A.x, dummyior, dummyior); + const scalar_type scalar_part = beckmann.__eval_DG_wo_clamps(params); + + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_REFRACT_BIT> microfacet_transform = + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_REFRACT_BIT>::create(scalar_part,params.NdotV,transmitted,params.VdotH,params.LdotH,VdotHLdotH,orientedEta); + return (spectral_type)fresnelDielectric_common(orientedEta2, nbl::hlsl::abs(params.VdotH)) * microfacet_transform(); + } + + sample_type __generate_wo_clamps(vector3_type localV, bool backside, vector3_type H, matrix3x3_type m, NBL_REF_ARG(vector3_type) u, scalar_type rcpOrientedEta, scalar_type orientedEta2, scalar_type rcpOrientedEta2, NBL_REF_ARG(anisocache_type) cache) + { + const scalar_type localVdotH = nbl::hlsl::dot(localV,H); + const scalar_type reflectance = fresnelDielectric_common(orientedEta2,nbl::hlsl::abs(localVdotH)); + + scalar_type rcpChoiceProb; + bool transmitted = math::partitionRandVariable(reflectance, u.z, rcpChoiceProb); + + cache = anisocache_type::create(localV, H); + + const scalar_type VdotH = cache.VdotH; + cache.LdotH = transmitted ? 
math::reflectRefract_computeNdotT(VdotH < 0.0, VdotH * VdotH, rcpOrientedEta2) : VdotH; + ray_dir_info_type localL; + localL.direction = math::reflectRefract_impl(transmitted, localV, H, VdotH, cache.LdotH, rcpOrientedEta); + + return sample_type::createFromTangentSpace(localV, localL, m); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) + { + const vector3_type localV = interaction.getTangentSpaceV(); + + scalar_type orientedEta, rcpOrientedEta; + const bool backside = math::getOrientedEtas(orientedEta, rcpOrientedEta, interaction.NdotV, eta); + + const vector3_type upperHemisphereV = backside ? -localV : localV; + + spectral_type dummyior; + reflection::SBeckmannBxDF beckmann = reflection::SBeckmannBxDF::create(A.x, A.y, dummyior, dummyior); + const vector3_type H = beckmann.__generate(upperHemisphereV, u.xy); + + return __generate_wo_clamps(localV, backside, H, interaction.getFromTangentSpace(), u, rcpOrientedEta, orientedEta*orientedEta, rcpOrientedEta*rcpOrientedEta, cache); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector3_type) u) + { + anisocache_type dummycache; + return generate(interaction, u, dummycache); + } + + scalar_type pdf(params_t params, NBL_REF_ARG(scalar_type) onePlusLambda_V) + { + scalar_type orientedEta, dummy; + const bool backside = math::getOrientedEtas(orientedEta, dummy, params.VdotH, eta); + const scalar_type orientedEta2 = orientedEta * orientedEta; + + const scalar_type VdotHLdotH = params.VdotH * params.LdotH; + const bool transmitted = VdotHLdotH < 0.0; + + const scalar_type reflectance = fresnelDielectric_common(orientedEta2, nbl::hlsl::abs(params.VdotH)); + + scalar_type ndf, lambda; + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, ax2, ay2, params.TdotH2, params.BdotH2, params.NdotH2); + 
ndf::Beckmann beckmann_ndf; + ndf = beckmann_ndf(ndfparams); + + smith::Beckmann beckmann_smith; + scalar_type c2 = beckmann_smith.C2(params.TdotV2, params.BdotV2, params.NdotV2, ax2, ay2); + lambda = beckmann_smith.Lambda(c2); + } + else + { + const scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + ndf::Beckmann beckmann_ndf; + ndf = beckmann_ndf(ndfparams); + + smith::Beckmann beckmann_smith; + lambda = beckmann_smith.Lambda(params.NdotV2, a2); + } + + return smith::VNDF_pdf_wo_clamps >(ndf,lambda,params.NdotV,transmitted,params.VdotH,params.LdotH,VdotHLdotH,orientedEta,reflectance,onePlusLambda_V); + } + + scalar_type pdf(params_t params) + { + scalar_type dummy; + return pdf(params, dummy); + } + + quotient_pdf_type quotient_and_pdf(params_t params) + { + scalar_type onePlusLambda_V; + scalar_type _pdf = pdf(params, onePlusLambda_V); + + scalar_type quo; + if (params.is_aniso) + { + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(A.x*A.x, A.y*A.y, params.TdotV2, params.BdotV2, params.NdotV2, params.TdotL2, params.BdotL2, params.NdotL2, onePlusLambda_V); + smith::Beckmann beckmann_smith; + quo = beckmann_smith.G2_over_G1(smithparams); + } + else + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(A.x*A.x, params.NdotV2, params.NdotL2, onePlusLambda_V); + smith::Beckmann beckmann_smith; + quo = beckmann_smith.G2_over_G1(smithparams); + } + + return quotient_pdf_type::create((spectral_type)(quo), _pdf); + } + + vector2_type A; + scalar_type eta; +}; + +template && IsotropicMicrofacetCache && AnisotropicMicrofacetCache) +struct SGGXDielectricBxDF +{ + using this_t = SGGXDielectricBxDF; + using scalar_type = typename LightSample::scalar_type; + using ray_dir_info_type = typename LightSample::ray_dir_info_type; + using vector2_type = vector; + using vector3_type = vector; + using matrix3x3_type = matrix; + using params_t = SBxDFParams; + + 
using isotropic_type = typename IsoCache::isotropic_type; + using anisotropic_type = typename AnisoCache::anisotropic_type; + using sample_type = LightSample; + using spectral_type = Spectrum; + using quotient_pdf_type = quotient_and_pdf; + using isocache_type = IsoCache; + using anisocache_type = AnisoCache; + + static this_t create(scalar_type eta, scalar_type A) + { + this_t retval; + retval.eta = eta; + retval.A = vector2_type(A, A); + return retval; + } + + static this_t create(scalar_type eta, scalar_type ax, scalar_type ay) + { + this_t retval; + retval.eta = eta; + retval.A = vector2_type(ax, ay); + return retval; + } + + static this_t create(SBxDFCreationParams params) + { + if (params.is_aniso) + return create(params.eta, params.A.x, params.A.y); + else + return create(params.eta, params.A.x); + } + + void init(SBxDFCreationParams params) + { + A = params.A; + eta = params.eta; + } + + spectral_type eval(params_t params) + { + scalar_type orientedEta, dummy; + const bool backside = math::getOrientedEtas(orientedEta, dummy, params.VdotH, eta); + const scalar_type orientedEta2 = orientedEta * orientedEta; + + const scalar_type VdotHLdotH = params.VdotH * params.LdotH; + const bool transmitted = VdotHLdotH < 0.0; + + scalar_type NG_already_in_reflective_dL_measure; + if (params.is_aniso) + { + spectral_type dummyior; + reflection::SGGXBxDF ggx = reflection::SGGXBxDF::create(A.x, A.y, dummyior, dummyior); + NG_already_in_reflective_dL_measure = ggx.__eval_DG_wo_clamps(params); + } + else + { + spectral_type dummyior; + reflection::SGGXBxDF ggx = reflection::SGGXBxDF::create(A.x, dummyior, dummyior); + NG_already_in_reflective_dL_measure = ggx.__eval_DG_wo_clamps(params); + } + + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_REFRACT_BIT> microfacet_transform = + ndf::microfacet_to_light_measure_transform,ndf::REFLECT_REFRACT_BIT>::create(NG_already_in_reflective_dL_measure,params.NdotL,transmitted,params.VdotH,params.LdotH,VdotHLdotH,orientedEta); + 
return (spectral_type)fresnelDielectric_common(orientedEta2, nbl::hlsl::abs(params.VdotH)) * microfacet_transform(); + } + + sample_type __generate_wo_clamps(vector3_type localV, bool backside, vector3_type H, matrix3x3_type m, NBL_REF_ARG(vector3_type) u, scalar_type rcpOrientedEta, scalar_type orientedEta2, scalar_type rcpOrientedEta2, NBL_REF_ARG(anisocache_type) cache) + { + const scalar_type localVdotH = nbl::hlsl::dot(localV,H); + const scalar_type reflectance = fresnelDielectric_common(orientedEta2,nbl::hlsl::abs(localVdotH)); + + scalar_type rcpChoiceProb; + bool transmitted = math::partitionRandVariable(reflectance, u.z, rcpChoiceProb); + + cache = anisocache_type::create(localV, H); + + const scalar_type VdotH = cache.VdotH; + cache.LdotH = transmitted ? math::reflectRefract_computeNdotT(VdotH < 0.0, VdotH * VdotH, rcpOrientedEta2) : VdotH; + ray_dir_info_type localL; + localL.direction = math::reflectRefract_impl(transmitted, localV, H, VdotH, cache.LdotH, rcpOrientedEta); + + return sample_type::createFromTangentSpace(localV, localL, m); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector3_type) u, NBL_REF_ARG(anisocache_type) cache) + { + const vector3_type localV = interaction.getTangentSpaceV(); + + scalar_type orientedEta, rcpOrientedEta; + const bool backside = math::getOrientedEtas(orientedEta, rcpOrientedEta, interaction.NdotV, eta); + + const vector3_type upperHemisphereV = backside ? 
-localV : localV; + + spectral_type dummyior; + reflection::SGGXBxDF ggx = reflection::SGGXBxDF::create(A.x, A.y, dummyior, dummyior); + const vector3_type H = ggx.__generate(upperHemisphereV, u.xy); + + return __generate_wo_clamps(localV, backside, H, interaction.getFromTangentSpace(), u, rcpOrientedEta, orientedEta*orientedEta, rcpOrientedEta*rcpOrientedEta, cache); + } + + sample_type generate(anisotropic_type interaction, NBL_REF_ARG(vector3_type) u) + { + anisocache_type dummycache; + return generate(interaction, u, dummycache); + } + + scalar_type pdf(params_t params) + { + scalar_type orientedEta, dummy; + const bool backside = math::getOrientedEtas(orientedEta, dummy, params.VdotH, eta); + const scalar_type orientedEta2 = orientedEta * orientedEta; + + const scalar_type VdotHLdotH = params.VdotH * params.LdotH; + const bool transmitted = VdotHLdotH < 0.0; + + const scalar_type reflectance = fresnelDielectric_common(orientedEta2, nbl::hlsl::abs(params.VdotH)); + + scalar_type ndf, devsh_v; + if (params.is_aniso) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + + ndf::SAnisotropicParams ndfparams = ndf::SAnisotropicParams::create(A.x, A.y, ax2, ay2, params.TdotH2, params.BdotH2, params.NdotH2); + ndf::GGX ggx_ndf; + ndf = ggx_ndf(ndfparams); + + smith::GGX ggx_smith; + devsh_v = ggx_smith.devsh_part(params.TdotV2, params.BdotV2, params.NdotV2, ax2, ay2); + } + else + { + const scalar_type a2 = A.x*A.x; + ndf::SIsotropicParams ndfparams = ndf::SIsotropicParams::create(a2, params.NdotH, params.NdotH2); + ndf::GGX ggx_ndf; + ndf = ggx_ndf(ndfparams); + + smith::GGX ggx_smith; + devsh_v = ggx_smith.devsh_part(params.NdotV2, a2, 1.0-a2); + } + + smith::GGX ggx_smith; + const scalar_type lambda = ggx_smith.G1_wo_numerator(params.NdotV, devsh_v); + return smith::VNDF_pdf_wo_clamps(ndf, lambda, params.NdotV, transmitted, params.VdotH, params.LdotH, VdotHLdotH, orientedEta, reflectance); + } + + quotient_pdf_type quotient_and_pdf(params_t 
params) + { + const scalar_type ax2 = A.x*A.x; + const scalar_type ay2 = A.y*A.y; + + scalar_type _pdf = pdf(params); + + smith::GGX ggx_smith; + scalar_type quo; + if (params.is_aniso) + { + smith::SAnisotropicParams smithparams = smith::SAnisotropicParams::create(ax2, ay2, params.NdotV, params.TdotV2, params.BdotV2, params.NdotV2, params.NdotL, params.TdotL2, params.BdotL2, params.NdotL2); + quo = ggx_smith.G2_over_G1(smithparams); + } + else + { + smith::SIsotropicParams smithparams = smith::SIsotropicParams::create(ax2, params.NdotV, params.NdotV2, params.NdotL, params.NdotL2); + quo = ggx_smith.G2_over_G1(smithparams); + } + + return quotient_pdf_type::create((spectral_type)(quo), _pdf); + } + + vector2_type A; + scalar_type eta; +}; + +} +} } } diff --git a/include/nbl/builtin/hlsl/bxdf/transmission/beckmann.hlsl b/include/nbl/builtin/hlsl/bxdf/transmission/beckmann.hlsl index fa315b40ea..8c61692c5c 100644 --- a/include/nbl/builtin/hlsl/bxdf/transmission/beckmann.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/transmission/beckmann.hlsl @@ -27,23 +27,7 @@ using SBeckmannDielectricAnisotropic = SCookTorrance -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BSDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; - -template -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BSDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/transmission/ggx.hlsl b/include/nbl/builtin/hlsl/bxdf/transmission/ggx.hlsl index 51f096532b..cdd4483c7f 100644 --- a/include/nbl/builtin/hlsl/bxdf/transmission/ggx.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/transmission/ggx.hlsl @@ -27,23 +27,7 @@ using SGGXDielectricAnisotropic = 
SCookTorrance -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BSDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; - -template -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BSDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/transmission/iridescent.hlsl b/include/nbl/builtin/hlsl/bxdf/transmission/iridescent.hlsl index 2e7aa0e56e..05b1753aca 100644 --- a/include/nbl/builtin/hlsl/bxdf/transmission/iridescent.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/transmission/iridescent.hlsl @@ -16,18 +16,11 @@ namespace transmission { template -using SIridescent = SCookTorrance, fresnel::Iridescent >; +using SIridescent = SCookTorrance, fresnel::Iridescent >; } -template -struct traits > -{ - NBL_CONSTEXPR_STATIC_INLINE BxDFType type = BT_BSDF; - NBL_CONSTEXPR_STATIC_INLINE bool IsMicrofacet = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotV = true; - NBL_CONSTEXPR_STATIC_INLINE bool clampNdotL = true; -}; +// inherit trait from cook torrance base } } diff --git a/include/nbl/builtin/hlsl/bxdf/transmission/smooth_dielectric.hlsl b/include/nbl/builtin/hlsl/bxdf/transmission/smooth_dielectric.hlsl index 17400adfe2..6d5744fb49 100644 --- a/include/nbl/builtin/hlsl/bxdf/transmission/smooth_dielectric.hlsl +++ b/include/nbl/builtin/hlsl/bxdf/transmission/smooth_dielectric.hlsl @@ -6,6 +6,7 @@ #include "nbl/builtin/hlsl/bxdf/common.hlsl" #include "nbl/builtin/hlsl/bxdf/bxdf_traits.hlsl" +#include "nbl/builtin/hlsl/sampling/basic.hlsl" #include "nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl" namespace nbl @@ -39,7 +40,9 @@ struct SSmoothDielectric const scalar_type reflectance = 
fresnel::Dielectric::__call(orientedEta.value*orientedEta.value, interaction.getNdotV(_clamp))[0]; scalar_type rcpChoiceProb; - bool transmitted = math::partitionRandVariable(reflectance, u.z, rcpChoiceProb); + sampling::PartitionRandVariable partitionRandVariable; + partitionRandVariable.leftProb = reflectance; + bool transmitted = partitionRandVariable(u.z, rcpChoiceProb); ray_dir_info_type V = interaction.getV(); Refract r = Refract::create(V.getDirection(), interaction.getN()); @@ -125,7 +128,9 @@ struct SThinSmoothDielectric scalar_type rcpChoiceProb; scalar_type z = u.z; - const bool transmitted = math::partitionRandVariable(reflectionProb, z, rcpChoiceProb); + sampling::PartitionRandVariable partitionRandVariable; + partitionRandVariable.leftProb = reflectionProb; + const bool transmitted = partitionRandVariable(z, rcpChoiceProb); remainderMetadata = hlsl::mix(reflectance, hlsl::promote(1.0) - reflectance, transmitted) * rcpChoiceProb; ray_dir_info_type V = interaction.getV(); diff --git a/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl index c272eeb1ab..8c7251214d 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/loadable_image.hlsl @@ -16,8 +16,15 @@ namespace concepts { namespace accessors { + +// concept `LoadableImage` translates to smth like this: +//template +//concept LoadableImage = requires(U a, vector uv, uint16_t layer) { +// ::nbl::hlsl::is_same_v().template get(uv,layer)), vector>; +//}; + // declare concept -#define NBL_CONCEPT_NAME StorableImage +#define NBL_CONCEPT_NAME LoadableImage #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) #define NBL_CONCEPT_TPLT_PRM_NAMES (U)(T)(Dims) // not the greatest syntax but works @@ -26,12 +33,12 @@ namespace accessors #define NBL_CONCEPT_PARAM_2 (layer,uint16_t) // start concept NBL_CONCEPT_BEGIN(3) -// need to be defined AFTER the 
cocnept begins +// need to be defined AFTER the concept begins #define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 #define uv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 #define layer NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer)) , ::nbl::hlsl::is_same_v, vector)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((a.template get(uv,layer)), ::nbl::hlsl::is_same_v, vector)) ); #undef layer #undef uv @@ -39,7 +46,7 @@ NBL_CONCEPT_END( #include // declare concept -#define NBL_CONCEPT_NAME MipmappedStorableImage +#define NBL_CONCEPT_NAME MipmappedLoadableImage #define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) #define NBL_CONCEPT_TPLT_PRM_NAMES (U)(T)(Dims) // not the greatest syntax but works diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index a5e48debbf..2b1f56d57d 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -268,6 +268,20 @@ struct mix_helper) > } }; +template +NBL_PARTIAL_REQ_TOP(spirv::SelectIsCallable) +struct mix_helper) > +{ + using return_t = conditional_t, vector::scalar_type, vector_traits::Dimension>, T>; + // for a component of a that is false, the corresponding component of x is returned + // for a component of a that is true, the corresponding component of y is returned + // so we make sure this is correct when calling the operation + static inline return_t __call(const T x, const T y, const U a) + { + return spirv::select(a, y, x); + } +}; + template NBL_PARTIAL_REQ_TOP(matrix_traits::Square) struct determinant_helper::Square) > { @@ -968,8 +982,19 @@ struct mix_helper NBL_PARTIAL_REQ_TOP((concepts::Vectorial || concepts::Scalar) && concepts::BooleanScalar && !impl::MixCallingBuiltins) -struct mix_helper || concepts::Scalar) && concepts::BooleanScalar && !impl::MixCallingBuiltins) > +namespace 
impl +{ +template +NBL_BOOL_CONCEPT MixCallingSelect = +#ifdef __HLSL_VERSION +spirv::SelectIsCallable; +#else +concepts::Boolean && (concepts::Scalar || (concepts::Vector && vector_traits::Dimension==vector_traits::Dimension)) && !MixCallingBuiltins; +#endif +} + +template NBL_PARTIAL_REQ_TOP(impl::MixCallingSelect) +struct mix_helper) > { using return_t = T; static return_t __call(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(U) a) diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl index 1ee5edf275..712ce5e979 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl @@ -44,6 +44,15 @@ struct matrix final : private glm::mat { return glm::operator*(reinterpret_cast(rhs), lhs); } + + inline friend bool operator==(matrix const& lhs, matrix const& rhs) + { + return glm::operator==(reinterpret_cast(lhs), reinterpret_cast(rhs)); + } + inline friend bool operator!=(matrix const& lhs, matrix const& rhs) + { + return glm::operator!=(reinterpret_cast(lhs), reinterpret_cast(rhs)); + } }; #endif diff --git a/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl b/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl index 6a8476e644..1887f4b51f 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl @@ -22,7 +22,7 @@ struct Promote }; template NBL_PARTIAL_REQ_TOP(concepts::Vectorial && (concepts::IntegralLikeScalar || concepts::FloatingPointLikeScalar) && is_same_v::scalar_type, From>) -struct Promote && is_scalar_v && is_same_v::scalar_type, From>) > +struct Promote && (concepts::IntegralLikeScalar || concepts::FloatingPointLikeScalar) && is_same_v::scalar_type, From>) > { NBL_CONSTEXPR_FUNC To operator()(const From v) { diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl index 4eb8b7bf06..d0c728a8c7 100644 --- 
a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl +++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl @@ -134,14 +134,6 @@ NBL_CONSTEXPR_FUNC this_t operator OP() NBL_CONST_MEMBER_FUNC \ } #define NBL_EMULATED_VECTOR_ARITHMETIC_OPERATOR(OP)\ -NBL_CONSTEXPR_FUNC this_t operator OP (component_t val) NBL_CONST_MEMBER_FUNC \ -{\ - this_t output;\ - [[unroll]]\ - for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, this_t::getComponent(i) OP val);\ - return output;\ -}\ NBL_CONSTEXPR_FUNC this_t operator OP (this_t other) NBL_CONST_MEMBER_FUNC \ {\ this_t output;\ @@ -183,6 +175,14 @@ NBL_CONSTEXPR_FUNC vector operator OP (vector;\ using component_t = ComponentType;\ +template\ +NBL_CONSTEXPR_STATIC this_t create(vector other)\ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, component_t::create(other[i]));\ + return output;\ +}\ NBL_CONSTEXPR_STATIC this_t create(this_t other)\ {\ CRTP output;\ @@ -209,6 +209,43 @@ NBL_CONSTEXPR_STATIC this_t create(vector other)\ return output;\ } +#define DEFINE_OPERATORS_FOR_TYPE(...)\ +NBL_CONSTEXPR_FUNC this_t operator+(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) + _static_cast(val));\ +\ + return output;\ +}\ +\ +NBL_CONSTEXPR_FUNC this_t operator-(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) - _static_cast(val));\ +\ + return output;\ +}\ +\ +NBL_CONSTEXPR_FUNC this_t operator*(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ + output.setComponent(i, CRTP::getComponent(i) * _static_cast(val));\ +\ + return output;\ +}\ +\ +NBL_CONSTEXPR_FUNC this_t operator/(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ +{\ + this_t output;\ + for (uint32_t i = 0u; i < CRTP::Dimension; 
++i)\ + output.setComponent(i, CRTP::getComponent(i) / _static_cast(val));\ +\ + return output;\ +}\ + // Fundamental, integral template NBL_PARTIAL_REQ_TOP(is_fundamental_v && concepts::IntegralLikeScalar) struct emulated_vector&& concepts::IntegralLikeScalar) > : CRTP @@ -232,6 +269,15 @@ struct emulated_vector) NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_COMPARISON_OPERATOR(>=) + + DEFINE_OPERATORS_FOR_TYPE(emulated_uint64_t) + DEFINE_OPERATORS_FOR_TYPE(emulated_int64_t) + DEFINE_OPERATORS_FOR_TYPE(uint16_t) + DEFINE_OPERATORS_FOR_TYPE(uint32_t) + DEFINE_OPERATORS_FOR_TYPE(uint64_t) + DEFINE_OPERATORS_FOR_TYPE(int16_t) + DEFINE_OPERATORS_FOR_TYPE(int32_t) + DEFINE_OPERATORS_FOR_TYPE(int64_t) }; // Fundamental, not integral @@ -253,6 +299,15 @@ struct emulated_vector) NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_COMPARISON_OPERATOR(>=) + + DEFINE_OPERATORS_FOR_TYPE(emulated_uint64_t) + DEFINE_OPERATORS_FOR_TYPE(emulated_int64_t) + DEFINE_OPERATORS_FOR_TYPE(uint16_t) + DEFINE_OPERATORS_FOR_TYPE(uint32_t) + DEFINE_OPERATORS_FOR_TYPE(uint64_t) + DEFINE_OPERATORS_FOR_TYPE(int16_t) + DEFINE_OPERATORS_FOR_TYPE(int32_t) + DEFINE_OPERATORS_FOR_TYPE(int64_t) }; // Not fundamental, integral @@ -278,6 +333,20 @@ struct emulated_vector) NBL_EMULATED_VECTOR_COMPARISON_OPERATOR(>=) + + DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + DEFINE_OPERATORS_FOR_TYPE(float16_t) + DEFINE_OPERATORS_FOR_TYPE(float32_t) + DEFINE_OPERATORS_FOR_TYPE(float64_t) + DEFINE_OPERATORS_FOR_TYPE(uint16_t) + DEFINE_OPERATORS_FOR_TYPE(uint32_t) + DEFINE_OPERATORS_FOR_TYPE(uint64_t) + DEFINE_OPERATORS_FOR_TYPE(int16_t) + DEFINE_OPERATORS_FOR_TYPE(int32_t) + DEFINE_OPERATORS_FOR_TYPE(int64_t) }; // Not fundamental, not integral @@ -299,107 +368,12 @@ struct emulated_vector) NBL_EMULATED_VECTOR_COMPARISON_OPERATOR(>=) -}; - -#undef 
NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_CREATION_AND_COMPONENT_SUM -#undef NBL_EMULATED_VECTOR_CREATION_AND_COMPONENT_SUM -#undef NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_COMPARISON_OPERATOR -#undef NBL_EMULATED_VECTOR_COMPARISON_OPERATOR -#undef NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_ARITHMETIC_OPERATOR -#undef NBL_EMULATED_VECTOR_ARITHMETIC_OPERATOR -#undef NBL_EMULATED_VECTOR_UNARY_OPERATOR - -// ----------------------------------------------------- EMULATED FLOAT SPECIALIZATION -------------------------------------------------------------------- - -#define DEFINE_OPERATORS_FOR_TYPE(...)\ -NBL_CONSTEXPR_FUNC this_t operator+(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ -{\ - this_t output;\ - for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) + component_t::create(val));\ -\ - return output;\ -}\ -\ -NBL_CONSTEXPR_FUNC this_t operator-(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ -{\ - this_t output;\ - for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) - component_t::create(val));\ -\ - return output;\ -}\ -\ -NBL_CONSTEXPR_FUNC this_t operator*(__VA_ARGS__ val) NBL_CONST_MEMBER_FUNC \ -{\ - this_t output;\ - for (uint32_t i = 0u; i < CRTP::Dimension; ++i)\ - output.setComponent(i, CRTP::getComponent(i) * component_t::create(val));\ -\ - return output;\ -}\ -\ - - -template -struct emulated_vector, CRTP> : CRTP -{ - using component_t = emulated_float64_t; - using this_t = emulated_vector; - - NBL_CONSTEXPR_STATIC this_t create(this_t other) - { - this_t output; - - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, other.getComponent(i)); - - return output; - } - - template - NBL_CONSTEXPR_STATIC this_t create(vector other) - { - this_t output; - - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, component_t::create(other[i])); - - return output; - } - - NBL_CONSTEXPR_FUNC this_t operator+(this_t other) NBL_CONST_MEMBER_FUNC - { - this_t 
output; - - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, CRTP::getComponent(i) + other.getComponent(i)); - - return output; - } - NBL_CONSTEXPR_FUNC this_t operator-(this_t other) NBL_CONST_MEMBER_FUNC - { - this_t output; - - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, CRTP::getComponent(i) - other.getComponent(i)); - - return output; - } - NBL_CONSTEXPR_FUNC this_t operator*(this_t other) NBL_CONST_MEMBER_FUNC - { - this_t output; - - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - output.setComponent(i, CRTP::getComponent(i) * other.getComponent(i)); - - return output; - } DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) DEFINE_OPERATORS_FOR_TYPE(emulated_float64_t) + DEFINE_OPERATORS_FOR_TYPE(float16_t) DEFINE_OPERATORS_FOR_TYPE(float32_t) DEFINE_OPERATORS_FOR_TYPE(float64_t) DEFINE_OPERATORS_FOR_TYPE(uint16_t) @@ -408,17 +382,15 @@ struct emulated_vector, CRTP> : DEFINE_OPERATORS_FOR_TYPE(int16_t) DEFINE_OPERATORS_FOR_TYPE(int32_t) DEFINE_OPERATORS_FOR_TYPE(int64_t) - - NBL_CONSTEXPR_FUNC component_t calcComponentSum() NBL_CONST_MEMBER_FUNC - { - component_t sum = component_t::create(0); - for (uint32_t i = 0u; i < CRTP::Dimension; ++i) - sum = sum + CRTP::getComponent(i); - - return sum; - } }; +#undef NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_CREATION_AND_COMPONENT_SUM +#undef NBL_EMULATED_VECTOR_CREATION_AND_COMPONENT_SUM +#undef NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_COMPARISON_OPERATOR +#undef NBL_EMULATED_VECTOR_COMPARISON_OPERATOR +#undef NBL_EMULATED_FUNDAMENTAL_TYPE_VECTOR_ARITHMETIC_OPERATOR +#undef NBL_EMULATED_VECTOR_ARITHMETIC_OPERATOR +#undef NBL_EMULATED_VECTOR_UNARY_OPERATOR #undef DEFINE_OPERATORS_FOR_TYPE template @@ -587,53 +559,6 @@ NBL_EMULATED_VEC_TO_EMULATED_VEC_STATIC_CAST(4) #undef NBL_EMULATED_VEC_TO_EMULATED_VEC_STATIC_CAST -#define NBL_EMULATED_VEC_PROMOTION(N) template\ -struct Promote, 
ComponentType>\ -{\ - using VecType = emulated_vector_t##N ;\ - NBL_CONSTEXPR_FUNC VecType operator()(NBL_CONST_REF_ARG(ComponentType) v)\ - {\ - array_set setter;\ - VecType promoted;\ - [[unroll]]\ - for (int i = 0; i < N; ++i)\ - setter(promoted, i, v);\ - return promoted;\ - }\ -}; - -NBL_EMULATED_VEC_PROMOTION(2) -NBL_EMULATED_VEC_PROMOTION(3) -NBL_EMULATED_VEC_PROMOTION(4) - -#undef NBL_EMULATED_VEC_PROMOTION - -#define NBL_EMULATED_VEC_TRUNCATION(N, M) template\ -struct Truncate, emulated_vector_t##M >\ -{\ - using OutputVecType = emulated_vector_t##N ;\ - using InputVecType = emulated_vector_t##M ;\ - NBL_CONSTEXPR_FUNC OutputVecType operator()(NBL_CONST_REF_ARG(InputVecType) vec)\ - {\ - array_get getter;\ - array_set setter;\ - OutputVecType output;\ - [[unroll]]\ - for (int i = 0; i < N; ++i)\ - setter(output, i, getter(vec, i));\ - return output;\ - }\ -}; - -NBL_EMULATED_VEC_TRUNCATION(2, 2) -NBL_EMULATED_VEC_TRUNCATION(2, 3) -NBL_EMULATED_VEC_TRUNCATION(2, 4) -NBL_EMULATED_VEC_TRUNCATION(3, 3) -NBL_EMULATED_VEC_TRUNCATION(3, 4) -NBL_EMULATED_VEC_TRUNCATION(4, 4) - -#undef NBL_EMULATED_VEC_TRUNCATION - } //namespace impl } diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl index 29c48a79d1..a3930a362a 100644 --- a/include/nbl/builtin/hlsl/ieee754.hlsl +++ b/include/nbl/builtin/hlsl/ieee754.hlsl @@ -251,6 +251,24 @@ NBL_CONSTEXPR_FUNC T flipSignIfRHSNegative(T val, T flip) return impl::flipSignIfRHSNegative_helper::__call(val, flip); } +template ) +NBL_CONSTEXPR_FUNC bool isSubnormal(T val) +{ + const uint32_t biasedExponent = extractBiasedExponent(val); + const typename unsigned_integer_of_size::type mantissa = extractMantissa(val); + return biasedExponent == 0 && mantissa != 0u; +} + +template ) +NBL_CONSTEXPR_FUNC bool isZero(T val) +{ + using traits_t = traits; + using AsUint = typename unsigned_integer_of_size::type; + + const AsUint exponentAndMantissaMask = ~traits_t::signMask; + return 
!(ieee754::impl::bitCastToUintType(val) & exponentAndMantissaMask); +} + } } } diff --git a/include/nbl/builtin/hlsl/indirect_commands.hlsl b/include/nbl/builtin/hlsl/indirect_commands.hlsl index ca8418bde7..79057fd4a2 100644 --- a/include/nbl/builtin/hlsl/indirect_commands.hlsl +++ b/include/nbl/builtin/hlsl/indirect_commands.hlsl @@ -37,6 +37,14 @@ struct DispatchIndirectCommand_t uint32_t num_groups_z; }; +// distinct struct, new name with the same data - https://docs.vulkan.org/refpages/latest/refpages/source/VkDrawMeshTasksIndirectCommandEXT.html +struct DrawMeshTasksIndirectCommand_t +{ + uint32_t num_groups_x; + uint32_t num_groups_y; + uint32_t num_groups_z; +}; + struct TraceRaysIndirectCommand_t { uint64_t raygenShaderRecordAddress; diff --git a/include/nbl/builtin/hlsl/limits.hlsl b/include/nbl/builtin/hlsl/limits.hlsl index ebc6f931e1..fa9edc3bde 100644 --- a/include/nbl/builtin/hlsl/limits.hlsl +++ b/include/nbl/builtin/hlsl/limits.hlsl @@ -146,7 +146,7 @@ struct num_base : type_identity // (TODO) think about what this means for HLSL // identifies floating-point types that can represent the special value "quiet not-a-number" (NaN) - NBL_CONSTEXPR_STATIC_INLINE bool has_quiet_NaN = !is_integer; + NBL_CONSTEXPR_STATIC_INLINE bool has_quiet_NaN = !is_integer; // identifies floating-point types that can represent the special value "signaling not-a-number" (NaN) NBL_CONSTEXPR_STATIC_INLINE bool has_signaling_NaN = !is_integer; // identifies the denormalization style used by the floating-point type diff --git a/include/nbl/builtin/hlsl/math/angle_adding.hlsl b/include/nbl/builtin/hlsl/math/angle_adding.hlsl index 27d4f2a465..5ab661facb 100644 --- a/include/nbl/builtin/hlsl/math/angle_adding.hlsl +++ b/include/nbl/builtin/hlsl/math/angle_adding.hlsl @@ -44,7 +44,7 @@ struct sincos_accumulator const T cosB = runningSum.real(); const T sinB = runningSum.imag(); // TODO: prove if we infer overflow from sign of `d` instead - const bool overflow = abs(min(a, 
cosB)) > max(a, cosB); + const bool overflow = abs(min(cosA, cosB)) > max(cosA, cosB); const T c = cosA * cosB - sinA * sinB; const T d = sinA * cosB + cosA * sinB; diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl index d3f5b167f6..a1c51d4e51 100644 --- a/include/nbl/builtin/hlsl/math/functions.hlsl +++ b/include/nbl/builtin/hlsl/math/functions.hlsl @@ -5,6 +5,7 @@ #define _NBL_BUILTIN_HLSL_MATH_FUNCTIONS_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tgmath.hlsl" #include "nbl/builtin/hlsl/numbers.hlsl" #include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl" #include "nbl/builtin/hlsl/concepts/vector.hlsl" @@ -120,25 +121,6 @@ void frisvad(NBL_CONST_REF_ARG(T) normal, NBL_REF_ARG(T) tangent, NBL_REF_ARG(T) } } -bool partitionRandVariable(float leftProb, NBL_REF_ARG(float) xi, NBL_REF_ARG(float) rcpChoiceProb) -{ -#ifdef __HLSL_VERSION - NBL_CONSTEXPR_FUNC_SCOPE_VAR float NEXT_ULP_AFTER_UNITY = asfloat(0x3f800001u); -#else - NBL_CONSTEXPR_FUNC_SCOPE_VAR float32_t NEXT_ULP_AFTER_UNITY = bit_cast(0x3f800001u); -#endif - const bool pickRight = xi >= leftProb * NEXT_ULP_AFTER_UNITY; - - // This is all 100% correct taking into account the above NEXT_ULP_AFTER_UNITY - xi -= pickRight ? leftProb : 0.0f; - - rcpChoiceProb = 1.0f / (pickRight ? (1.0f - leftProb) : leftProb); - xi *= rcpChoiceProb; - - return pickRight; -} - - namespace impl { template diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl new file mode 100644 index 0000000000..4a6cb5dfd3 --- /dev/null +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_MATH_MORTON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATH_MORTON_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace math +{ + +namespace impl +{ + +template +struct MortonComponent; + +template +struct MortonComponent +{ + static T decode2d(T x) + { + x &= 0x55555555u; + x = (x ^ (x >> 1u)) & 0x33333333u; + x = (x ^ (x >> 2u)) & 0x0f0f0f0fu; + x = (x ^ (x >> 4u)) & 0x00ff00ffu; + return x; + } +}; + +template +struct MortonComponent +{ + static T decode2d(T x) + { + x &= 0x55555555u; + x = (x ^ (x >> 1u)) & 0x33333333u; + x = (x ^ (x >> 2u)) & 0x0f0f0f0fu; + x = (x ^ (x >> 4u)) & 0x00ff00ffu; + x = (x ^ (x >> 8u)) & 0x0000ffffu; + x = (x ^ (x >> 16u)); + return x; + } +}; + +} + +template +struct Morton +{ + using vector2_type = vector; + using component_type = impl::MortonComponent; + + static vector2_type decode2d(T x) + { + return vector2_type(component_type::decode2d(x), component_type::decode2d(x >> 1u)); + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/math/quaternions.hlsl b/include/nbl/builtin/hlsl/math/quaternions.hlsl new file mode 100644 index 0000000000..8d50202f4e --- /dev/null +++ b/include/nbl/builtin/hlsl/math/quaternions.hlsl @@ -0,0 +1,104 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNIONS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MATH_QUATERNIONS_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tgmath.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace math +{ + +template +struct quaternion +{ + using this_t = quaternion; + using scalar_type = T; + using data_type = vector; + using vector3_type = vector; + using matrix_type = matrix; + + static this_t createFromTruncated(const vector3_type first3Components) + { + this_t retval; + retval.data.xyz = first3Components; + retval.data.w = hlsl::sqrt(scalar_type(1.0) - hlsl::dot(first3Components, first3Components)); + return retval; + } + + static this_t lerp(const this_t start, const this_t end, const scalar_type fraction, const scalar_type totalPseudoAngle) + { + using AsUint = typename unsigned_integer_of_size::type; + const AsUint negationMask = hlsl::bit_cast(totalPseudoAngle) & AsUint(0x80000000u); + const data_type adjEnd = hlsl::bit_cast(hlsl::bit_cast(end.data) ^ negationMask); + + this_t retval; + retval.data = hlsl::mix(start.data, adjEnd, fraction); + return retval; + } + + static this_t lerp(const this_t start, const this_t end, const scalar_type fraction) + { + return lerp(start, end, fraction, hlsl::dot(start.data, end.data)); + } + + static scalar_type __adj_interpolant(const scalar_type angle, const scalar_type fraction, const scalar_type interpolantPrecalcTerm2, const scalar_type interpolantPrecalcTerm3) + { + const scalar_type A = scalar_type(1.0904) + angle * (scalar_type(-3.2452) + angle * (scalar_type(3.55645) - angle * scalar_type(1.43519))); + const scalar_type B = scalar_type(0.848013) + angle * (scalar_type(-1.06021) + angle * scalar_type(0.215638)); + const scalar_type k = A * interpolantPrecalcTerm2 + B; + return fraction + interpolantPrecalcTerm3 * k; + } + + static this_t flerp(const this_t start, const this_t end, const 
scalar_type fraction) + { + const scalar_type pseudoAngle = hlsl::dot(start.data,end.data); + const scalar_type interpolantPrecalcTerm = fraction - scalar_type(0.5); + const scalar_type interpolantPrecalcTerm3 = fraction * interpolantPrecalcTerm * (fraction - scalar_type(1.0)); + const scalar_type adjFrac = __adj_interpolant(hlsl::abs(pseudoAngle),fraction,interpolantPrecalcTerm*interpolantPrecalcTerm,interpolantPrecalcTerm3); + + this_t retval = lerp(start,end,adjFrac,pseudoAngle); + retval.data = hlsl::normalize(retval.data); + return retval; + } + + matrix_type constructMatrix() + { + matrix_type mat; + mat[0] = data.yzx * data.ywz + data.zxy * data.zyw * vector3_type( 1.0, 1.0,-1.0); + mat[1] = data.yzx * data.xzw + data.zxy * data.wxz * vector3_type(-1.0, 1.0, 1.0); + mat[2] = data.yzx * data.wyx + data.zxy * data.xwy * vector3_type( 1.0,-1.0, 1.0); + mat[0][0] = scalar_type(0.5) - mat[0][0]; + mat[1][1] = scalar_type(0.5) - mat[1][1]; + mat[2][2] = scalar_type(0.5) - mat[2][2]; + mat *= scalar_type(2.0); + return hlsl::transpose(mat); // TODO: double check transpose? 
+ } + + static vector3_type slerp_delta(const vector3_type start, const vector3_type preScaledWaypoint, scalar_type cosAngleFromStart) + { + vector3_type planeNormal = hlsl::cross(start,preScaledWaypoint); + + cosAngleFromStart *= scalar_type(0.5); + const scalar_type sinAngle = hlsl::sqrt(scalar_type(0.5) - cosAngleFromStart); + const scalar_type cosAngle = hlsl::sqrt(scalar_type(0.5) + cosAngleFromStart); + + planeNormal *= sinAngle; + const vector3_type precompPart = hlsl::cross(planeNormal, start) * scalar_type(2.0); + + return precompPart * cosAngle + hlsl::cross(planeNormal, precompPart); + } + + data_type data; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl new file mode 100644 index 0000000000..1ad16dc28d --- /dev/null +++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl @@ -0,0 +1,235 @@ +#ifndef _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TRANSFORMATION_MATRIX_UTILS_INCLUDED_ + +#include + +namespace nbl +{ +namespace hlsl +{ +//TODO: stolen from cameraz branch, don't have epsilonEqual here, maybe uncomment when merging from imguizmo-lights branch +//// TODO: -> move somewhere else and nbl:: to implement it +//template +//bool isOrthoBase(const T& x, const T& y, const T& z, const E epsilon = 1e-6) +//{ +// auto isNormalized = [](const auto& v, const auto& epsilon) -> bool +// { +// return glm::epsilonEqual(glm::length(v), 1.0, epsilon); +// }; +// +// auto isOrthogonal = [](const auto& a, const auto& b, const auto& epsilon) -> bool +// { +// return glm::epsilonEqual(glm::dot(a, b), 0.0, epsilon); +// }; +// +// return isNormalized(x, epsilon) && isNormalized(y, epsilon) && isNormalized(z, epsilon) && +// isOrthogonal(x, y, epsilon) && isOrthogonal(x, z, epsilon) && isOrthogonal(y, z, epsilon); +//} +//// <- + +template +matrix getMatrix3x4As4x4(const matrix& 
mat) +{ + matrix output; + for (int i = 0; i < 3; ++i) + output[i] = mat[i]; + output[3] = float32_t4(0.0f, 0.0f, 0.0f, 1.0f); + + return output; +} + +template +matrix getMatrix3x3As4x4(const matrix& mat) +{ + matrix output; + for (int i = 0; i < 3; ++i) + output[i] = float32_t4(mat[i], 1.0f); + output[3] = float32_t4(0.0f, 0.0f, 0.0f, 1.0f); + + return output; +} + +template +inline vector getCastedVector(const vector& in) +{ + vector out; + + for (int i = 0; i < N; ++i) + out[i] = (Tout)(in[i]); + + return out; +} + +template +inline matrix getCastedMatrix(const matrix& in) +{ + matrix out; + + for (int i = 0; i < N; ++i) + out[i] = getCastedVector(in[i]); + + return out; +} + +// TODO: use portable_float when merged +//! multiplies matrices a and b, 3x4 matrices are treated as 4x4 matrices with 4th row set to (0, 0, 0 ,1) +template +inline matrix concatenateBFollowedByA(const matrix& a, const matrix& b) +{ + const auto a4x4 = getMatrix3x4As4x4(a); + const auto b4x4 = getMatrix3x4As4x4(b); + return matrix(mul(a4x4, b4x4)); +} + +// /Arek: glm:: for normalize till dot product is fixed (ambiguity with glm namespace + linker issues) + +template +inline matrix buildCameraLookAtMatrixLH( + const vector& position, + const vector& target, + const vector& upVector) +{ + const vector zaxis = glm::normalize(target - position); + const vector xaxis = glm::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); + + matrix r; + r[0] = vector(xaxis, -hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); + + return r; +} + +template +inline matrix buildCameraLookAtMatrixRH( + const vector& position, + const vector& target, + const vector& upVector) +{ + const vector zaxis = glm::normalize(position - target); + const vector xaxis = glm::normalize(hlsl::cross(upVector, zaxis)); + const vector yaxis = hlsl::cross(zaxis, xaxis); + + matrix r; + r[0] = vector(xaxis, 
-hlsl::dot(xaxis, position)); + r[1] = vector(yaxis, -hlsl::dot(yaxis, position)); + r[2] = vector(zaxis, -hlsl::dot(zaxis, position)); + + return r; +} + +// TODO: test, check if there is better implementation + // TODO: move quaternion to nbl::hlsl + // TODO: why NBL_REF_ARG(MatType) doesn't work????? + +//! Replaces current rotation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th column unchanged +template +inline void setRotation(matrix& outMat, NBL_CONST_REF_ARG(core::quaternion) quat) +{ + static_assert(N == 3 || N == 4); + + outMat[0] = vector( + 1 - 2 * (quat.y * quat.y + quat.z * quat.z), + 2 * (quat.x * quat.y - quat.z * quat.w), + 2 * (quat.x * quat.z + quat.y * quat.w), + outMat[0][3] + ); + + outMat[1] = vector( + 2 * (quat.x * quat.y + quat.z * quat.w), + 1 - 2 * (quat.x * quat.x + quat.z * quat.z), + 2 * (quat.y * quat.z - quat.x * quat.w), + outMat[1][3] + ); + + outMat[2] = vector( + 2 * (quat.x * quat.z - quat.y * quat.w), + 2 * (quat.y * quat.z + quat.x * quat.w), + 1 - 2 * (quat.x * quat.x + quat.y * quat.y), + outMat[2][3] + ); +} + +template +inline void setTranslation(matrix& outMat, NBL_CONST_REF_ARG(vector) translation) +{ + static_assert(N == 3 || N == 4); + + outMat[0].w = translation.x; + outMat[1].w = translation.y; + outMat[2].w = translation.z; +} + + +template +inline matrix buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +{ + const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); + _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero + const float w = h / aspectRatio; + + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero + + matrix m; + m[0] = vector(w, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -h, 0.f, 0.f); + m[2] = vector(0.f, 0.f, -zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = vector(0.f, 0.f, -1.f, 0.f); + + return m; +} +template +inline matrix buildProjectionMatrixPerspectiveFovLH(float
fieldOfViewRadians, float aspectRatio, float zNear, float zFar) +{ + const float h = core::reciprocal(tanf(fieldOfViewRadians * 0.5f)); + _NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero + const float w = h / aspectRatio; + + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero + + matrix m; + m[0] = vector(w, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -h, 0.f, 0.f); + m[2] = vector(0.f, 0.f, zFar / (zFar - zNear), -zNear * zFar / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 1.f, 0.f); + + return m; +} + +template +inline matrix buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) +{ + _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero + + matrix m; + m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = vector(0.f, 0.f, -1.f / (zFar - zNear), -zNear / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 0.f, 1.f); + + return m; +} + +template +inline matrix buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar) +{ + _NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero + _NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero + + matrix m; + m[0] = vector(2.f / widthOfViewVolume, 0.f, 0.f, 0.f); + m[1] = vector(0.f, -2.f / heightOfViewVolume, 0.f, 0.f); + m[2] = vector(0.f, 0.f, 1.f / (zFar - zNear), -zNear / (zFar - zNear)); + m[3] = vector(0.f, 0.f, 0.f, 1.f); + + return m; +} + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/rwmc/CascadeAccumulator.hlsl b/include/nbl/builtin/hlsl/rwmc/CascadeAccumulator.hlsl new file mode 100644 index 0000000000..9413bcee98 --- /dev/null +++ b/include/nbl/builtin/hlsl/rwmc/CascadeAccumulator.hlsl @@ -0,0 +1,97 @@ 
+#ifndef _NBL_HLSL_RWMC_CASCADE_ACCUMULATOR_INCLUDED_ +#define _NBL_HLSL_RWMC_CASCADE_ACCUMULATOR_INCLUDED_ +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace rwmc +{ + +template) +struct CascadeAccumulator +{ + struct CascadeEntry + { + uint32_t cascadeSampleCounter[CascadeCount]; + CascadeLayerType data[CascadeCount]; + + void addSampleIntoCascadeEntry(CascadeLayerType _sample, uint32_t lowerCascadeIndex, float lowerCascadeLevelWeight, float higherCascadeLevelWeight, uint32_t sampleCount) + { + const float reciprocalSampleCount = 1.0f / float(sampleCount); + + uint32_t lowerCascadeSampleCount = cascadeSampleCounter[lowerCascadeIndex]; + data[lowerCascadeIndex] += (_sample * lowerCascadeLevelWeight - (sampleCount - lowerCascadeSampleCount) * data[lowerCascadeIndex]) * reciprocalSampleCount; + cascadeSampleCounter[lowerCascadeIndex] = sampleCount; + + uint32_t higherCascadeIndex = lowerCascadeIndex + 1u; + if (higherCascadeIndex < CascadeCount) + { + uint32_t higherCascadeSampleCount = cascadeSampleCounter[higherCascadeIndex]; + data[higherCascadeIndex] += (_sample * higherCascadeLevelWeight - (sampleCount - higherCascadeSampleCount) * data[higherCascadeIndex]) * reciprocalSampleCount; + cascadeSampleCounter[higherCascadeIndex] = sampleCount; + } + } + }; + + using cascade_layer_scalar_type = typename vector_traits::scalar_type; + using this_t = CascadeAccumulator; + using input_sample_type = CascadeLayerType; + using output_storage_type = CascadeEntry; + using initialization_data = SplattingParameters; + output_storage_type accumulation; + + SplattingParameters splattingParameters; + + static this_t create(NBL_CONST_REF_ARG(SplattingParameters) settings) + { + this_t retval; + for (int i = 0; i < CascadeCount; ++i) + { + retval.accumulation.data[i] = promote(0.0f); + retval.accumulation.cascadeSampleCounter[i] = 0u; + } + retval.splattingParameters = settings; + + return retval; + } + + cascade_layer_scalar_type 
getLuma(NBL_CONST_REF_ARG(CascadeLayerType) col) + { + return hlsl::dot(hlsl::transpose(colorspace::scRGBtoXYZ)[1], col); + } + + // most of this code is stolen from https://cg.ivd.kit.edu/publications/2018/rwmc/tool/split.cpp + void addSample(uint32_t sampleCount, input_sample_type _sample) + { + const cascade_layer_scalar_type luma = getLuma(_sample); + const cascade_layer_scalar_type log2Luma = log2(luma); + const cascade_layer_scalar_type cascade = log2Luma * splattingParameters.rcpLog2Base - splattingParameters.baseRootOfStart; + const cascade_layer_scalar_type clampedCascade = clamp(cascade, 0, CascadeCount - 1); + // c<=0 -> 0, c>=Count-1 -> Count-1 + uint32_t lowerCascadeIndex = floor(cascade); + // 0 whenever clamped or `cascade` is integer (when `clampedCascade` is integer) + cascade_layer_scalar_type higherCascadeWeight = clampedCascade - floor(clampedCascade); + // never 0 thanks to magic of `1-fract(x)` + cascade_layer_scalar_type lowerCascadeWeight = cascade_layer_scalar_type(1) - higherCascadeWeight; + + // handle super bright sample case + if (cascade > CascadeCount - 1) + lowerCascadeWeight = splattingParameters.lastCascadeLuma / luma; + + accumulation.addSampleIntoCascadeEntry(_sample, lowerCascadeIndex, lowerCascadeWeight, higherCascadeWeight, sampleCount); + } + + +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/rwmc/ResolveParameters.hlsl b/include/nbl/builtin/hlsl/rwmc/ResolveParameters.hlsl new file mode 100644 index 0000000000..7509eac493 --- /dev/null +++ b/include/nbl/builtin/hlsl/rwmc/ResolveParameters.hlsl @@ -0,0 +1,45 @@ +#ifndef _NBL_BUILTIN_HLSL_RWMC_RESOLVE_PARAMETERS_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_RWMC_RESOLVE_PARAMETERS_HLSL_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace rwmc +{ + +struct ResolveParameters +{ + uint32_t lastCascadeIndex; + float initialEmin; // a minimum image brightness that we always consider reliable 
+ float reciprocalBase; + float reciprocalN; + float reciprocalKappa; + float colorReliabilityFactor; + float NOverKappa; +}; + +ResolveParameters computeResolveParameters(float base, uint32_t sampleCount, float minReliableLuma, float kappa, uint32_t cascadeSize) +{ + ResolveParameters retval; + retval.lastCascadeIndex = cascadeSize - 1u; + retval.initialEmin = minReliableLuma; + retval.reciprocalBase = 1.f / base; + const float N = float(sampleCount); + retval.reciprocalN = 1.f / N; + retval.reciprocalKappa = 1.f / kappa; + // if not interested in exact expected value estimation (kappa!=1.f), can usually accept a bit more variance relative to the image brightness we already have + // allow up to ~ more energy in one sample to lessen bias in some cases + retval.colorReliabilityFactor = base + (1.f - base) * retval.reciprocalKappa; + retval.NOverKappa = N * retval.reciprocalKappa; + + return retval; +} + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/rwmc/SplattingParameters.hlsl b/include/nbl/builtin/hlsl/rwmc/SplattingParameters.hlsl new file mode 100644 index 0000000000..a3a3520415 --- /dev/null +++ b/include/nbl/builtin/hlsl/rwmc/SplattingParameters.hlsl @@ -0,0 +1,38 @@ +#ifndef _NBL_BUILTIN_HLSL_RWMC_SPLATTING_PARAMETERS_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_RWMC_SPLATTING_PARAMETERS_HLSL_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tgmath.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace rwmc +{ + +struct SplattingParameters +{ + using scalar_t = float; + + static SplattingParameters create(const scalar_t base, const scalar_t start, const uint32_t cascadeCount) + { + SplattingParameters retval; + const scalar_t log2Base = hlsl::log2(base); + const scalar_t log2Start = hlsl::log2(start); + retval.lastCascadeLuma = hlsl::exp2(log2Start + log2Base * (cascadeCount - 1)); + retval.rcpLog2Base = scalar_t(1.0) / log2Base; + retval.baseRootOfStart = log2Start * retval.rcpLog2Base; 
+ return retval; + } + + scalar_t lastCascadeLuma; + scalar_t baseRootOfStart; + scalar_t rcpLog2Base; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/rwmc/resolve.hlsl b/include/nbl/builtin/hlsl/rwmc/resolve.hlsl new file mode 100644 index 0000000000..906cad512b --- /dev/null +++ b/include/nbl/builtin/hlsl/rwmc/resolve.hlsl @@ -0,0 +1,163 @@ +#ifndef _NBL_BUILTIN_HLSL_RWMC_RESOLVE_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_RWMC_RESOLVE_HLSL_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace rwmc +{ + // declare concept +#define NBL_CONCEPT_NAME ResolveAccessorBase +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(int32_t) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(VectorScalarType)(Dims) +// not the greatest syntax but works +#define NBL_CONCEPT_PARAM_0 (a,T) +#define NBL_CONCEPT_PARAM_1 (scalar,VectorScalarType) +// start concept + NBL_CONCEPT_BEGIN(2) +// need to be defined AFTER the concept begins +#define a NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define scalar NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR)((a.calcLuma(vector(scalar, scalar, scalar))))) +); +#undef a +#undef scalar +#include + +/* ResolveAccessor is required to: +* - satisfy `LoadableImage` concept requirements +* - implement function called `calcLuma` which calculates luma from a 3 component pixel value +*/ + +template +NBL_BOOL_CONCEPT ResolveAccessor = ResolveAccessorBase && concepts::accessors::LoadableImage; + +template && ResolveAccessor) +struct Resolver +{ + using output_type = OutputColorTypeVec; + using scalar_t = typename vector_traits::scalar_type; + + struct CascadeSample + { + float32_t3 centerValue; + float normalizedCenterLuma; + float normalizedNeighbourhoodAverageLuma; + }; + + static Resolver create(NBL_REF_ARG(ResolveParameters) resolveParameters) + { + Resolver retval; + retval.params = 
resolveParameters; + + return retval; + } + + output_type operator()(NBL_REF_ARG(CascadeAccessor) acc, const int16_t2 coord) + { + using scalar_t = typename vector_traits::scalar_type; + + scalar_t reciprocalBaseI = 1.f; + CascadeSample curr = __sampleCascade(acc, coord, 0u, reciprocalBaseI); + + output_type accumulation = output_type(0.0f, 0.0f, 0.0f); + scalar_t Emin = params.initialEmin; + + scalar_t prevNormalizedCenterLuma, prevNormalizedNeighbourhoodAverageLuma; + for (int16_t i = 0u; i <= params.lastCascadeIndex; i++) + { + const bool notFirstCascade = i != 0; + const bool notLastCascade = i != params.lastCascadeIndex; + + CascadeSample next; + if (notLastCascade) + { + reciprocalBaseI *= params.reciprocalBase; + next = __sampleCascade(acc, coord, int16_t(i + 1), reciprocalBaseI); + } + + scalar_t reliability = 1.f; + // sample counting-based reliability estimation + if (params.reciprocalKappa <= 1.f) + { + scalar_t localReliability = curr.normalizedCenterLuma; + // reliability in 3x3 pixel block (see robustness) + scalar_t globalReliability = curr.normalizedNeighbourhoodAverageLuma; + if (notFirstCascade) + { + localReliability += prevNormalizedCenterLuma; + globalReliability += prevNormalizedNeighbourhoodAverageLuma; + } + if (notLastCascade) + { + localReliability += next.normalizedCenterLuma; + globalReliability += next.normalizedNeighbourhoodAverageLuma; + } + // check if above minimum sampling threshold (avg 9 sample occurences in 3x3 neighbourhood), then use per-pixel reliability (NOTE: tertiary op is in reverse) + reliability = globalReliability < params.reciprocalN ? 
globalReliability : localReliability; + { + const scalar_t accumLuma = acc.calcLuma(accumulation); + if (accumLuma > Emin) + Emin = accumLuma; + + const scalar_t colorReliability = Emin * reciprocalBaseI * params.colorReliabilityFactor; + + reliability += colorReliability; + reliability *= params.NOverKappa; + reliability -= params.reciprocalKappa; + reliability = clamp(reliability * 0.5f, 0.f, 1.f); + } + } + accumulation += curr.centerValue * reliability; + + prevNormalizedCenterLuma = curr.normalizedCenterLuma; + prevNormalizedNeighbourhoodAverageLuma = curr.normalizedNeighbourhoodAverageLuma; + curr = next; + } + + return accumulation; + } + + ResolveParameters params; + + // pseudo private stuff: + + CascadeSample __sampleCascade(NBL_REF_ARG(CascadeAccessor) acc, int16_t2 coord, uint16_t cascadeIndex, scalar_t reciprocalBaseI) + { + output_type neighbourhood[9]; + neighbourhood[0] = acc.template get(coord + int16_t2(-1, -1), cascadeIndex).xyz; + neighbourhood[1] = acc.template get(coord + int16_t2(0, -1), cascadeIndex).xyz; + neighbourhood[2] = acc.template get(coord + int16_t2(1, -1), cascadeIndex).xyz; + neighbourhood[3] = acc.template get(coord + int16_t2(-1, 0), cascadeIndex).xyz; + neighbourhood[4] = acc.template get(coord + int16_t2(0, 0), cascadeIndex).xyz; + neighbourhood[5] = acc.template get(coord + int16_t2(1, 0), cascadeIndex).xyz; + neighbourhood[6] = acc.template get(coord + int16_t2(-1, 1), cascadeIndex).xyz; + neighbourhood[7] = acc.template get(coord + int16_t2(0, 1), cascadeIndex).xyz; + neighbourhood[8] = acc.template get(coord + int16_t2(1, 1), cascadeIndex).xyz; + + // numerical robustness + float32_t3 excl_hood_sum = ((neighbourhood[0] + neighbourhood[1]) + (neighbourhood[2] + neighbourhood[3])) + + ((neighbourhood[5] + neighbourhood[6]) + (neighbourhood[7] + neighbourhood[8])); + + CascadeSample retval; + retval.centerValue = neighbourhood[4]; + retval.normalizedNeighbourhoodAverageLuma = retval.normalizedCenterLuma = 
acc.calcLuma(neighbourhood[4]) * reciprocalBaseI; + retval.normalizedNeighbourhoodAverageLuma = (acc.calcLuma(excl_hood_sum) * reciprocalBaseI + retval.normalizedNeighbourhoodAverageLuma) / 9.f; + return retval; + } +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/sampling/basic.hlsl b/include/nbl/builtin/hlsl/sampling/basic.hlsl new file mode 100644 index 0000000000..9c575a22ce --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/basic.hlsl @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_BASIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_BASIC_INCLUDED_ + +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template) +struct PartitionRandVariable +{ + using floating_point_type = T; + using uint_type = unsigned_integer_of_size_t; + + bool operator()(NBL_REF_ARG(floating_point_type) xi, NBL_REF_ARG(floating_point_type) rcpChoiceProb) + { + const floating_point_type NextULPAfterUnity = bit_cast(bit_cast(floating_point_type(1.0)) + uint_type(1u)); + const bool pickRight = xi >= leftProb * NextULPAfterUnity; + + // This is all 100% correct taking into account the above NextULPAfterUnity + xi -= pickRight ? leftProb : floating_point_type(0.0); + + rcpChoiceProb = floating_point_type(1.0) / (pickRight ? (floating_point_type(1.0) - leftProb) : leftProb); + xi *= rcpChoiceProb; + + return pickRight; + } + + floating_point_type leftProb; +}; + + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl new file mode 100644 index 0000000000..a74869990f --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl @@ -0,0 +1,65 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. 
+// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_BILINEAR_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_BILINEAR_INCLUDED_ + +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct Bilinear +{ + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + static Bilinear create(const vector4_type bilinearCoeffs) + { + Bilinear retval; + retval.bilinearCoeffs = bilinearCoeffs; + retval.twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]); + return retval; + } + + vector2_type generate(NBL_REF_ARG(scalar_type) rcpPdf, const vector2_type _u) + { + vector2_type u; + Linear lineary = Linear::create(twiceAreasUnderXCurve); + u.y = lineary.generate(_u.y); + + const vector2_type ySliceEndPoints = vector2_type(nbl::hlsl::mix(bilinearCoeffs[0], bilinearCoeffs[2], u.y), nbl::hlsl::mix(bilinearCoeffs[1], bilinearCoeffs[3], u.y)); + Linear linearx = Linear::create(ySliceEndPoints); + u.x = linearx.generate(_u.x); + + rcpPdf = (twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1]) / (4.0 * nbl::hlsl::mix(ySliceEndPoints[0], ySliceEndPoints[1], u.x)); + + return u; + } + + scalar_type pdf(const vector2_type u) + { + return 4.0 * nbl::hlsl::mix(nbl::hlsl::mix(bilinearCoeffs[0], bilinearCoeffs[1], u.x), nbl::hlsl::mix(bilinearCoeffs[2], bilinearCoeffs[3], u.x), u.y) / (bilinearCoeffs[0] + bilinearCoeffs[1] + bilinearCoeffs[2] + bilinearCoeffs[3]); + } + + // unit square: x0y0 x1y0 + // x0y1 x1y1 + vector4_type bilinearCoeffs; // (x0y0, x0y1, x1y0, x1y1) + vector2_type twiceAreasUnderXCurve; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl new file mode 100644 index 0000000000..9474642f4c --- 
/dev/null +++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_BOX_MULLER_TRANSFORM_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_BOX_MULLER_TRANSFORM_INCLUDED_ + +#include "nbl/builtin/hlsl/math/functions.hlsl" +#include "nbl/builtin/hlsl/numbers.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template) +struct BoxMullerTransform +{ + using scalar_type = T; + using vector2_type = vector; + + vector2_type operator()(const vector2_type xi) + { + scalar_type sinPhi, cosPhi; + math::sincos(2.0 * numbers::pi * xi.y - numbers::pi, sinPhi, cosPhi); + return vector2_type(cosPhi, sinPhi) * nbl::hlsl::sqrt(-2.0 * nbl::hlsl::log(xi.x)) * stddev; + } + + T stddev; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl index 1a5c96b6df..c44b55449d 100644 --- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl +++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl @@ -17,14 +17,14 @@ namespace sampling { template -vector concentricMapping(vector _u) +vector concentricMapping(const vector _u) { //map [0;1]^2 to [-1;1]^2 vector u = 2.0f * _u - hlsl::promote >(1.0); vector p; - if (hlsl::all >(glsl::equal(u, hlsl::promote >(0.0)))) - p = hlsl::promote >(0.0); + if (nbl::hlsl::all >(u == (vector)(0.0))) + p = (vector)(0.0); else { T r; diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl index 9f95bf2ee5..ddbb961300 100644 --- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl @@ -22,26 +22,26 @@ struct ProjectedHemisphere using 
vector_t2 = vector; using vector_t3 = vector; - static vector_t3 generate(vector_t2 _sample) + static vector_t3 generate(const vector_t2 _sample) { vector_t2 p = concentricMapping(_sample * T(0.99999) + T(0.000005)); T z = hlsl::sqrt(hlsl::max(T(0.0), T(1.0) - p.x * p.x - p.y * p.y)); return vector_t3(p.x, p.y, z); } - static T pdf(T L_z) + static T pdf(const T L_z) { return L_z * numbers::inv_pi; } template > - static sampling::quotient_and_pdf quotient_and_pdf(T L) + static sampling::quotient_and_pdf quotient_and_pdf(const T L) { return sampling::quotient_and_pdf::create(hlsl::promote(1.0), pdf(L)); } template > - static sampling::quotient_and_pdf quotient_and_pdf(vector_t3 L) + static sampling::quotient_and_pdf quotient_and_pdf(const vector_t3 L) { return sampling::quotient_and_pdf::create(hlsl::promote(1.0), pdf(L.z)); } @@ -77,7 +77,7 @@ struct ProjectedSphere } template > - static sampling::quotient_and_pdf quotient_and_pdf(vector_t3 L) + static sampling::quotient_and_pdf quotient_and_pdf(const vector_t3 L) { return sampling::quotient_and_pdf::create(hlsl::promote(1.0), pdf(L.z)); } diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl new file mode 100644 index 0000000000..6c3cf1fad9 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_LINEAR_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_LINEAR_INCLUDED_ + +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct Linear +{ + using scalar_type = T; + using vector2_type = vector; + + static Linear create(const vector2_type linearCoeffs) // start and end importance values (start, end) + { + Linear retval; + retval.linearCoeffStart = linearCoeffs[0]; + retval.rcpDiff = 1.0 / (linearCoeffs[0] - linearCoeffs[1]); + vector2_type squaredCoeffs = linearCoeffs * linearCoeffs; + retval.squaredCoeffStart = squaredCoeffs[0]; + retval.squaredCoeffDiff = squaredCoeffs[1] - squaredCoeffs[0]; + return retval; + } + + scalar_type generate(const scalar_type u) + { + return hlsl::mix(u, (linearCoeffStart - hlsl::sqrt(squaredCoeffStart + u * squaredCoeffDiff)) * rcpDiff, hlsl::abs(rcpDiff) < numeric_limits::max); + } + + scalar_type linearCoeffStart; + scalar_type rcpDiff; + scalar_type squaredCoeffStart; + scalar_type squaredCoeffDiff; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl new file mode 100644 index 0000000000..e60fe28423 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl @@ -0,0 +1,97 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_PROJECTED_SPHERICAL_TRIANGLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_PROJECTED_SPHERICAL_TRIANGLE_INCLUDED_ + +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct ProjectedSphericalTriangle +{ + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + static ProjectedSphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) + { + ProjectedSphericalTriangle retval; + retval.tri = tri; + return retval; + } + + vector4_type computeBilinearPatch(const vector3_type receiverNormal, bool isBSDF) + { + const scalar_type minimumProjSolidAngle = 0.0; + + matrix m = matrix(tri.vertex0, tri.vertex1, tri.vertex2); + const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(isBSDF, nbl::hlsl::mul(m, receiverNormal), hlsl::promote(minimumProjSolidAngle)); + + return bxdfPdfAtVertex.yyxz; + } + + vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, scalar_type solidAngle, const vector3_type cos_vertices, const vector3_type sin_vertices, scalar_type cos_a, scalar_type cos_c, scalar_type csc_b, scalar_type csc_c, const vector3_type receiverNormal, bool isBSDF, const vector2_type _u) + { + vector2_type u; + // pre-warp according to proj solid angle approximation + vector4_type patch = computeBilinearPatch(receiverNormal, isBSDF); + Bilinear bilinear = Bilinear::create(patch); + u = bilinear.generate(rcpPdf, _u); + + // now warp the points onto a spherical triangle + const vector3_type L = sphtri.generate(solidAngle, cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c, u); + rcpPdf *= solidAngle; + + return L; + } + + vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, const vector3_type receiverNormal, bool isBSDF, const vector2_type u) + { + scalar_type cos_a, cos_c, csc_b, csc_c; + vector3_type 
cos_vertices, sin_vertices; + const scalar_type solidAngle = tri.solidAngleOfTriangle(cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c); + return generate(rcpPdf, solidAngle, cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c, receiverNormal, isBSDF, u); + } + + scalar_type pdf(scalar_type solidAngle, const vector3_type cos_vertices, const vector3_type sin_vertices, scalar_type cos_a, scalar_type cos_c, scalar_type csc_b, scalar_type csc_c, const vector3_type receiverNormal, bool receiverWasBSDF, const vector3_type L) + { + scalar_type pdf; + const vector2_type u = sphtri.generateInverse(pdf, solidAngle, cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c, L); + + vector4_type patch = computeBilinearPatch(receiverNormal, receiverWasBSDF); + Bilinear bilinear = Bilinear::create(patch); + return pdf * bilinear.pdf(u); + } + + scalar_type pdf(const vector3_type receiverNormal, bool receiverWasBSDF, const vector3_type L) + { + scalar_type pdf; + const vector2_type u = sphtri.generateInverse(pdf, L); + + vector4_type patch = computeBilinearPatch(receiverNormal, receiverWasBSDF); + Bilinear bilinear = Bilinear::create(patch); + return pdf * bilinear.pdf(u); + } + + shapes::SphericalTriangle tri; + sampling::SphericalTriangle sphtri; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/quantized_sequence.hlsl b/include/nbl/builtin/hlsl/sampling/quantized_sequence.hlsl new file mode 100644 index 0000000000..8929609c34 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/quantized_sequence.hlsl @@ -0,0 +1,309 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_QUANTIZED_SEQUENCE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_QUANTIZED_SEQUENCE_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts/vector.hlsl" +#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl" +#include "nbl/builtin/hlsl/random/pcg.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct QuantizedSequence; + + +namespace impl +{ +template +struct unorm_constant; +template<> +struct unorm_constant<4> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x3d888889u; }; +template<> +struct unorm_constant<5> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x3d042108u; }; +template<> +struct unorm_constant<8> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x3b808081u; }; +template<> +struct unorm_constant<10> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x3a802008u; }; +template<> +struct unorm_constant<16> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x37800080u; }; +template<> +struct unorm_constant<21> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x35000004u; }; +template<> +struct unorm_constant<32> { NBL_CONSTEXPR_STATIC_INLINE uint32_t value = 0x2f800004u; }; + +template +struct decode_helper; + +template +struct decode_helper +{ + using scalar_type = typename vector_traits::scalar_type; + using fp_type = typename float_of_size::type; + using uvec_type = vector; + using sequence_type = QuantizedSequence; + using return_type = vector; + NBL_CONSTEXPR_STATIC_INLINE scalar_type UNormConstant = unorm_constant<8u*sizeof(scalar_type)>::value; + + static return_type __call(NBL_CONST_REF_ARG(sequence_type) val, const uvec_type scrambleKey) + { + uvec_type seqVal; + NBL_UNROLL for(uint16_t i = 0; i < D; i++) + seqVal[i] = val.get(i) ^ scrambleKey[i]; + return return_type(seqVal) * bit_cast(UNormConstant); + } +}; +template +struct decode_helper +{ + using scalar_type = typename vector_traits::scalar_type; + using 
fp_type = typename float_of_size::type; + using uvec_type = vector; + using sequence_type = QuantizedSequence; + using sequence_store_type = typename sequence_type::store_type; + using sequence_scalar_type = typename vector_traits::scalar_type; + using return_type = vector; + NBL_CONSTEXPR_STATIC_INLINE scalar_type UNormConstant = sequence_type::UNormConstant; + + static return_type __call(NBL_CONST_REF_ARG(sequence_type) val, const uvec_type scrambleKey) + { + sequence_type scramble; + NBL_UNROLL for(uint16_t i = 0; i < D; i++) + scramble.set(i, scrambleKey[i]); + scramble.data ^= val.data; + + uvec_type seqVal; + NBL_UNROLL for(uint16_t i = 0; i < D; i++) + seqVal[i] = scramble.get(i); + return return_type(seqVal) * bit_cast(UNormConstant); + } +}; +} + +template +vector::scalar_type)>::type, D> decode(NBL_CONST_REF_ARG(QuantizedSequence) val, const vector::scalar_type, D> scrambleKey) +{ + return impl::decode_helper::__call(val, scrambleKey); +} + +#define SEQUENCE_SPECIALIZATION_CONCEPT concepts::UnsignedIntegral::scalar_type> && size_of_v::scalar_type> <= 4 + +// all Dim=1 +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT) +struct QuantizedSequence +{ + using store_type = T; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant<8u*sizeof(store_type)>::value; + + store_type get(const uint16_t idx) { assert(idx > 0 && idx < 1); return data; } + void set(const uint16_t idx, const store_type value) { assert(idx > 0 && idx < 1); data = value; } + + store_type data; +}; + +// uint16_t, uint32_t; Dim=2,3,4 +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == 1 && Dim > 1 && Dim < 5) +struct QuantizedSequence::Dimension == 1 && Dim > 1 && Dim < 5) > +{ + using store_type = T; + NBL_CONSTEXPR_STATIC_INLINE uint16_t StoreBits = uint16_t(8u) * size_of_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BitsPerComponent = StoreBits / Dim; + NBL_CONSTEXPR_STATIC_INLINE store_type Mask = (uint16_t(1u) << 
BitsPerComponent) - uint16_t(1u); + NBL_CONSTEXPR_STATIC_INLINE uint16_t DiscardBits = StoreBits - BitsPerComponent; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant::value; + + store_type get(const uint16_t idx) + { + assert(idx > 0 && idx < Dim); + return (data >> (BitsPerComponent * idx)) & Mask; + } + + void set(const uint16_t idx, const store_type value) + { + assert(idx > 0 && idx < Dim); + const uint16_t bits = (BitsPerComponent * idx); + data &= ~(Mask << bits); + data |= ((value >> DiscardBits) & Mask) << bits; + } + + store_type data; +}; + +// Dim 2,3,4 matches vector dim +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == Dim && Dim > 1 && Dim < 5) +struct QuantizedSequence::Dimension == Dim && Dim > 1 && Dim < 5) > +{ + using store_type = T; + using scalar_type = typename vector_traits::scalar_type; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant<8u*sizeof(scalar_type)>::value; + + scalar_type get(const uint16_t idx) { assert(idx > 0 && idx < Dim); return data[idx]; } + void set(const uint16_t idx, const scalar_type value) { assert(idx > 0 && idx < Dim); data[idx] = value; } + + store_type data; +}; + +// uint16_t2, uint32_t2; Dim=3 +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == 2 && Dim == 3) +struct QuantizedSequence::Dimension == 2 && Dim == 3) > +{ + using store_type = T; + using scalar_type = typename vector_traits::scalar_type; + NBL_CONSTEXPR_STATIC_INLINE uint16_t StoreBits = uint16_t(8u) * size_of_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BitsPerComponent = StoreBits / Dim; + NBL_CONSTEXPR_STATIC_INLINE scalar_type Mask = (scalar_type(1u) << BitsPerComponent) - scalar_type(1u); + NBL_CONSTEXPR_STATIC_INLINE uint16_t DiscardBits = (uint16_t(8u) * size_of_v) - BitsPerComponent; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant::value; + + scalar_type get(const uint16_t idx) + { + 
assert(idx >= 0 && idx < 3); + if (idx < 2) + { + return data[idx] & Mask; + } + else + { + const scalar_type zbits = scalar_type(DiscardBits); + const scalar_type zmask = (scalar_type(1u) << zbits) - scalar_type(1u); + scalar_type z = (data[0] >> BitsPerComponent) & zmask; + z |= ((data[1] >> BitsPerComponent) & zmask) << DiscardBits; + return z; + } + } + + void set(const uint16_t idx, const scalar_type value) + { + assert(idx >= 0 && idx < 3); + if (idx < 2) + { + const scalar_type trunc_val = value >> DiscardBits; + data[idx] &= ~Mask; + data[idx] |= trunc_val & Mask; + } + else + { + const scalar_type zbits = scalar_type(DiscardBits); + const scalar_type zmask = (scalar_type(1u) << zbits) - scalar_type(1u); + const scalar_type trunc_val = value >> DiscardBits; + data[0] &= Mask; + data[1] &= Mask; + data[0] |= (trunc_val & zmask) << BitsPerComponent; + data[1] |= ((trunc_val >> zbits) & zmask) << BitsPerComponent; + } + } + + store_type data; +}; + +// uint16_t2, uint32_t2; Dim=4 +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == 2 && Dim == 4) +struct QuantizedSequence::Dimension == 2 && Dim == 4) > +{ + using store_type = T; + using scalar_type = typename vector_traits::scalar_type; + NBL_CONSTEXPR_STATIC_INLINE uint16_t StoreBits = uint16_t(8u) * size_of_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BitsPerComponent = StoreBits / Dim; + NBL_CONSTEXPR_STATIC_INLINE scalar_type Mask = (uint16_t(1u) << BitsPerComponent) - uint16_t(1u); + NBL_CONSTEXPR_STATIC_INLINE uint16_t DiscardBits = (uint16_t(8u) * size_of_v) - BitsPerComponent; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant::value; + + scalar_type get(const uint16_t idx) + { + assert(idx >= 0 && idx < 4); + const uint16_t i = (idx & uint16_t(2u)) >> uint16_t(1u); + return (data[i] >> (BitsPerComponent * (idx & uint16_t(1u)))) & Mask; + } + + void set(const uint16_t idx, const scalar_type value) + { + assert(idx >= 0 && idx < 4); + const 
uint16_t i = (idx & uint16_t(2u)) >> uint16_t(1u); + const uint16_t odd = idx & uint16_t(1u); + data[i] &= hlsl::mix(~Mask, Mask, bool(odd)); + data[i] |= ((value >> DiscardBits) & Mask) << (BitsPerComponent * odd); + } + + store_type data; +}; + +// uint16_t4, uint32_t4; Dim=2 +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == 4 && Dim == 2) +struct QuantizedSequence::Dimension == 4 && Dim == 2) > +{ + using store_type = T; + using scalar_type = typename vector_traits::scalar_type; + using base_type = vector; + NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant<8u*sizeof(scalar_type)>::value; + + base_type get(const uint16_t idx) + { + assert(idx >= 0 && idx < 2); + base_type a; + a[0] = data[uint16_t(2u) * idx]; + a[1] = data[uint16_t(2u) * idx + 1]; + return a; + } + + void set(const uint16_t idx, const base_type value) + { + assert(idx >= 0 && idx < 2); + base_type a; + data[uint16_t(2u) * idx] = value[0]; + data[uint16_t(2u) * idx + 1] = value[1]; + } + + store_type data; +}; + +// uint16_t4, uint32_t4; Dim=3 +// uint16_t4 --> returns uint16_t2 - 21 bits per component: 16 in x, 5 in y +// uint16_t4 --> returns uint32_t2 - 42 bits per component: 32 in x, 10 in y +template NBL_PARTIAL_REQ_TOP(SEQUENCE_SPECIALIZATION_CONCEPT && vector_traits::Dimension == 4 && Dim == 3) +struct QuantizedSequence::Dimension == 4 && Dim == 3) > +{ + using store_type = T; + using scalar_type = typename vector_traits::scalar_type; + using base_type = vector; + NBL_CONSTEXPR_STATIC_INLINE uint16_t StoreBits = uint16_t(8u) * size_of_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t BitsPerComponent = StoreBits / Dim; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LeftoverBitsPerComponent = BitsPerComponent - uint16_t(8u) * size_of_v; + NBL_CONSTEXPR_STATIC_INLINE scalar_type Mask = (uint16_t(1u) << LeftoverBitsPerComponent) - uint16_t(1u); + NBL_CONSTEXPR_STATIC_INLINE uint16_t DiscardBits = (uint16_t(8u) * size_of_v) - BitsPerComponent; 
+ NBL_CONSTEXPR_STATIC_INLINE uint32_t UNormConstant = impl::unorm_constant<8u*sizeof(scalar_type)>::value; + + base_type get(const uint16_t idx) + { + assert(idx >= 0 && idx < 3); + base_type a; + a[0] = data[idx]; + a[1] = (data[3] >> (LeftoverBitsPerComponent * idx)) & Mask; + return a; + } + + void set(const uint16_t idx, const base_type value) + { + assert(idx >= 0 && idx < 3); + data[idx] = value[0]; + data[3] &= ~Mask; + data[3] |= ((value[1] >> DiscardBits) & Mask) << (LeftoverBitsPerComponent * idx); + } + + store_type data; +}; + +#undef SEQUENCE_SPECIALIZATION_CONCEPT + +} + +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl new file mode 100644 index 0000000000..f9e3d2f7ae --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl @@ -0,0 +1,90 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_RECTANGLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_RECTANGLE_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct SphericalRectangle +{ + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + static SphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) rect) + { + SphericalRectangle retval; + retval.rect = rect; + return retval; + } + + vector2_type generate(const vector2_type rectangleExtents, const vector2_type uv, NBL_REF_ARG(scalar_type) S) + { + const vector4_type denorm_n_z = vector4_type(-rect.r0.y, rect.r0.x + rectangleExtents.x, rect.r0.y + rectangleExtents.y, -rect.r0.x); + const vector4_type n_z = denorm_n_z / hlsl::sqrt(hlsl::promote(rect.r0.z * rect.r0.z) + denorm_n_z * 
denorm_n_z); + const vector4_type cosGamma = vector4_type( + -n_z[0] * n_z[1], + -n_z[1] * n_z[2], + -n_z[2] * n_z[3], + -n_z[3] * n_z[0] + ); + + math::sincos_accumulator angle_adder = math::sincos_accumulator::create(cosGamma[0]); + angle_adder.addCosine(cosGamma[1]); + scalar_type p = angle_adder.getSumofArccos(); + angle_adder = math::sincos_accumulator::create(cosGamma[2]); + angle_adder.addCosine(cosGamma[3]); + scalar_type q = angle_adder.getSumofArccos(); + + const scalar_type k = scalar_type(2.0) * numbers::pi - q; + const scalar_type b0 = n_z[0]; + const scalar_type b1 = n_z[2]; + S = p + q - scalar_type(2.0) * numbers::pi; + + const scalar_type CLAMP_EPS = 1e-5; + + // flip z axis if rect.r0.z > 0 + rect.r0.z = ieee754::flipSignIfRHSNegative(rect.r0.z, -rect.r0.z); + vector3_type r1 = rect.r0 + vector3_type(rectangleExtents.x, rectangleExtents.y, 0); + + const scalar_type au = uv.x * S + k; + const scalar_type fu = (hlsl::cos(au) * b0 - b1) / hlsl::sin(au); + const scalar_type cu_2 = hlsl::max(fu * fu + b0 * b0, 1.f); // forces `cu` to be in [-1,1] + const scalar_type cu = ieee754::flipSignIfRHSNegative(scalar_type(1.0) / hlsl::sqrt(cu_2), fu); + + scalar_type xu = -(cu * rect.r0.z) / hlsl::sqrt(scalar_type(1.0) - cu * cu); + xu = hlsl::clamp(xu, rect.r0.x, r1.x); // avoid Infs + const scalar_type d_2 = xu * xu + rect.r0.z * rect.r0.z; + const scalar_type d = hlsl::sqrt(d_2); + + const scalar_type h0 = rect.r0.y / hlsl::sqrt(d_2 + rect.r0.y * rect.r0.y); + const scalar_type h1 = r1.y / hlsl::sqrt(d_2 + r1.y * r1.y); + const scalar_type hv = h0 + uv.y * (h1 - h0); + const scalar_type hv2 = hv * hv; + const scalar_type yv = hlsl::mix(r1.y, (hv * d) / hlsl::sqrt(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - CLAMP_EPS); + + return vector2_type((xu - rect.r0.x) / rectangleExtents.x, (yv - rect.r0.y) / rectangleExtents.y); + } + + shapes::SphericalRectangle rect; +}; + +} +} +} + +#endif diff --git 
a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl new file mode 100644 index 0000000000..5770403cd2 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl @@ -0,0 +1,122 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_TRIANGLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_TRIANGLE_INCLUDED_ + +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct SphericalTriangle +{ + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + + static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) + { + SphericalTriangle retval; + retval.tri = tri; + return retval; + } + + // WARNING: can and will return NAN if one or three of the triangle edges are near zero length + vector3_type generate(scalar_type solidAngle, const vector3_type cos_vertices, const vector3_type sin_vertices, scalar_type cos_a, scalar_type cos_c, scalar_type csc_b, scalar_type csc_c, const vector2_type u) + { + scalar_type negSinSubSolidAngle,negCosSubSolidAngle; + math::sincos(solidAngle * u.x - numbers::pi, negSinSubSolidAngle, negCosSubSolidAngle); + + const scalar_type p = negCosSubSolidAngle * sin_vertices[0] - negSinSubSolidAngle * cos_vertices[0]; + const scalar_type q = -negSinSubSolidAngle * sin_vertices[0] - negCosSubSolidAngle * cos_vertices[0]; + + // TODO: we could optimize everything up and including to the first slerp, because precision here is just godawful + scalar_type u_ = q - cos_vertices[0]; + scalar_type v_ = p + sin_vertices[0] * cos_c; + + // the slerps could probably be optimized by sidestepping `normalize` calls and accumulating scaling factors + vector3_type C_s = 
tri.vertex0; + if (csc_b < numeric_limits::max) + { + const scalar_type cosAngleAlongAC = ((v_ * q - u_ * p) * cos_vertices[0] - v_) / ((v_ * p + u_ * q) * sin_vertices[0]); + if (nbl::hlsl::abs(cosAngleAlongAC) < 1.f) + C_s += math::quaternion::slerp_delta(tri.vertex0, tri.vertex2 * csc_b, cosAngleAlongAC); + } + + vector3_type retval = tri.vertex1; + const scalar_type cosBC_s = nbl::hlsl::dot(C_s, tri.vertex1); + const scalar_type csc_b_s = 1.0 / nbl::hlsl::sqrt(1.0 - cosBC_s * cosBC_s); + if (csc_b_s < numeric_limits::max) + { + const scalar_type cosAngleAlongBC_s = nbl::hlsl::clamp(1.0 + cosBC_s * u.y - u.y, -1.f, 1.f); + if (nbl::hlsl::abs(cosAngleAlongBC_s) < 1.f) + retval += math::quaternion::slerp_delta(tri.vertex1, C_s * csc_b_s, cosAngleAlongBC_s); + } + return retval; + } + + vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, const vector2_type u) + { + scalar_type cos_a, cos_c, csc_b, csc_c; + vector3_type cos_vertices, sin_vertices; + + rcpPdf = tri.solidAngleOfTriangle(cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c); + + return generate(rcpPdf, cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c, u); + } + + vector2_type generateInverse(NBL_REF_ARG(scalar_type) pdf, scalar_type solidAngle, const vector3_type cos_vertices, const vector3_type sin_vertices, scalar_type cos_a, scalar_type cos_c, scalar_type csc_b, scalar_type csc_c, const vector3_type L) + { + pdf = 1.0 / solidAngle; + + const scalar_type cosAngleAlongBC_s = nbl::hlsl::dot(L, tri.vertex1); + const scalar_type csc_a_ = 1.0 / nbl::hlsl::sqrt(1.0 - cosAngleAlongBC_s * cosAngleAlongBC_s); + const scalar_type cos_b_ = nbl::hlsl::dot(L, tri.vertex0); + + const scalar_type cosB_ = (cos_b_ - cosAngleAlongBC_s * cos_c) * csc_a_ * csc_c; + const scalar_type sinB_ = nbl::hlsl::sqrt(1.0 - cosB_ * cosB_); + + const scalar_type cosC_ = sin_vertices[0] * sinB_* cos_c - cos_vertices[0] * cosB_; + const scalar_type sinC_ = nbl::hlsl::sqrt(1.0 - cosC_ * cosC_); + + math::sincos_accumulator 
angle_adder = math::sincos_accumulator::create(cos_vertices[0], sin_vertices[0]); + angle_adder.addAngle(cosB_, sinB_); + angle_adder.addAngle(cosC_, sinC_); + const scalar_type subTriSolidAngleRatio = (angle_adder.getSumofArccos() - numbers::pi) * pdf; + const scalar_type u = subTriSolidAngleRatio > numeric_limits::min ? subTriSolidAngleRatio : 0.0; + + const scalar_type cosBC_s = (cos_vertices[0] + cosB_ * cosC_) / (sinB_ * sinC_); + const scalar_type v = (1.0 - cosAngleAlongBC_s) / (1.0 - (cosBC_s < bit_cast(0x3f7fffff) ? cosBC_s : cos_c)); + + return vector2_type(u,v); + } + + vector2_type generateInverse(NBL_REF_ARG(scalar_type) pdf, const vector3_type L) + { + scalar_type cos_a, cos_c, csc_b, csc_c; + vector3_type cos_vertices, sin_vertices; + + const scalar_type solidAngle = tri.solidAngleOfTriangle(cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c); + + return generateInverse(pdf, solidAngle, cos_vertices, sin_vertices, cos_a, cos_c, csc_b, csc_c, L); + } + + shapes::SphericalTriangle tri; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl index df4100db9b..5fc3bc7a0b 100644 --- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl @@ -23,7 +23,7 @@ struct UniformHemisphere using vector_t2 = vector; using vector_t3 = vector; - static vector_t3 generate(vector_t2 _sample) + static vector_t3 generate(const vector_t2 _sample) { T z = _sample.x; T r = hlsl::sqrt(hlsl::max(T(0.0), T(1.0) - z * z)); @@ -49,7 +49,7 @@ struct UniformSphere using vector_t2 = vector; using vector_t3 = vector; - static vector_t3 generate(vector_t2 _sample) + static vector_t3 generate(const vector_t2 _sample) { T z = T(1.0) - T(2.0) * _sample.x; T r = hlsl::sqrt(hlsl::max(T(0.0), T(1.0) - z * z)); diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl 
b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl new file mode 100644 index 0000000000..11442bef7c --- /dev/null +++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl @@ -0,0 +1,67 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SHAPES_SPHERICAL_RECTANGLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SHAPES_SPHERICAL_RECTANGLE_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace shapes +{ + +template +struct SphericalRectangle +{ + using scalar_type = Scalar; + using vector3_type = vector; + using vector4_type = vector; + using matrix3x3_type = matrix; + + static SphericalRectangle create(const vector3_type observer, const vector3_type rectangleOrigin, const matrix3x3_type basis) + { + SphericalRectangle retval; + retval.r0 = nbl::hlsl::mul(basis, rectangleOrigin - observer); + return retval; + } + + static SphericalRectangle create(const vector3_type observer, const vector3_type rectangleOrigin, const vector3_type T, vector3_type B, const vector3_type N) + { + SphericalRectangle retval; + matrix3x3_type TBN = nbl::hlsl::transpose(matrix3x3_type(T, B, N)); + retval.r0 = nbl::hlsl::mul(TBN, rectangleOrigin - observer); + return retval; + } + + scalar_type solidAngleOfRectangle(const vector rectangleExtents) + { + const vector4_type denorm_n_z = vector4_type(-r0.y, r0.x + rectangleExtents.x, r0.y + rectangleExtents.y, -r0.x); + const vector4_type n_z = denorm_n_z / nbl::hlsl::sqrt((vector4_type)(r0.z * r0.z) + denorm_n_z * denorm_n_z); + const vector4_type cosGamma = vector4_type( + -n_z[0] * n_z[1], + -n_z[1] * n_z[2], + -n_z[2] * n_z[3], + -n_z[3] * n_z[0] + ); + math::sincos_accumulator angle_adder = math::sincos_accumulator::create(cosGamma[0]); + angle_adder.addCosine(cosGamma[1]); + angle_adder.addCosine(cosGamma[2]); + 
angle_adder.addCosine(cosGamma[3]); + return angle_adder.getSumofArccos() - scalar_type(2.0) * numbers::pi; + } + + vector3_type r0; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl new file mode 100644 index 0000000000..f574b106ce --- /dev/null +++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl @@ -0,0 +1,109 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SHAPES_SPHERICAL_TRIANGLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SHAPES_SPHERICAL_TRIANGLE_INCLUDED_ + +#include +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace shapes +{ + +template +struct SphericalTriangle +{ + using scalar_type = T; + using vector3_type = vector; + + static SphericalTriangle create(const vector3_type vertex0, const vector3_type vertex1, const vector3_type vertex2, const vector3_type origin) + { + SphericalTriangle retval; + retval.vertex0 = nbl::hlsl::normalize(vertex0 - origin); + retval.vertex1 = nbl::hlsl::normalize(vertex1 - origin); + retval.vertex2 = nbl::hlsl::normalize(vertex2 - origin); + retval.cos_sides = vector3_type(hlsl::dot(retval.vertex1, retval.vertex2), hlsl::dot(retval.vertex2, retval.vertex0), hlsl::dot(retval.vertex0, retval.vertex1)); + const vector3_type csc_sides2 = hlsl::promote(1.0) - retval.cos_sides * retval.cos_sides; + retval.csc_sides.x = hlsl::rsqrt(csc_sides2.x); + retval.csc_sides.y = hlsl::rsqrt(csc_sides2.y); + retval.csc_sides.z = hlsl::rsqrt(csc_sides2.z); + return retval; + } + + bool pyramidAngles() + { + return hlsl::any >(csc_sides >= (vector3_type)(numeric_limits::max)); + } + + scalar_type solidAngleOfTriangle(NBL_REF_ARG(vector3_type) cos_vertices, NBL_REF_ARG(vector3_type) sin_vertices, NBL_REF_ARG(scalar_type) 
cos_a, NBL_REF_ARG(scalar_type) cos_c, NBL_REF_ARG(scalar_type) csc_b, NBL_REF_ARG(scalar_type) csc_c) + { + if (pyramidAngles()) + return 0.f; + + // these variables might eventually get optimized out + cos_a = cos_sides[0]; + cos_c = cos_sides[2]; + csc_b = csc_sides[1]; + csc_c = csc_sides[2]; + + // Both vertices and angles at the vertices are denoted by the same upper case letters A, B, and C. The angles A, B, C of the triangle are equal to the angles between the planes that intersect the surface of the sphere or, equivalently, the angles between the tangent vectors of the great circle arcs where they meet at the vertices. Angles are in radians. The angles of proper spherical triangles are (by convention) less than PI + cos_vertices = hlsl::clamp((cos_sides - cos_sides.yzx * cos_sides.zxy) * csc_sides.yzx * csc_sides.zxy, hlsl::promote(-1.0), hlsl::promote(1.0)); // using Spherical Law of Cosines (TODO: do we need to clamp anymore? since the pyramid angles method introduction?) + sin_vertices = hlsl::sqrt(hlsl::promote(1.0) - cos_vertices * cos_vertices); + + math::sincos_accumulator angle_adder = math::sincos_accumulator::create(cos_vertices[0], sin_vertices[0]); + angle_adder.addAngle(cos_vertices[1], sin_vertices[1]); + angle_adder.addAngle(cos_vertices[2], sin_vertices[2]); + return angle_adder.getSumofArccos() - numbers::pi; + } + + scalar_type solidAngleOfTriangle() + { + vector3_type dummy0,dummy1; + scalar_type dummy2,dummy3,dummy4,dummy5; + return solidAngleOfTriangle(dummy0,dummy1,dummy2,dummy3,dummy4,dummy5); + } + + scalar_type projectedSolidAngleOfTriangle(const vector3_type receiverNormal, NBL_REF_ARG(vector3_type) cos_sides, NBL_REF_ARG(vector3_type) csc_sides, NBL_REF_ARG(vector3_type) cos_vertices) + { + if (pyramidAngles()) + return 0.f; + + vector3_type awayFromEdgePlane0 = hlsl::cross(vertex1, vertex2) * csc_sides[0]; + vector3_type awayFromEdgePlane1 = hlsl::cross(vertex2, vertex0) * csc_sides[1]; + vector3_type awayFromEdgePlane2 = 
hlsl::cross(vertex0, vertex1) * csc_sides[2]; + + // useless here but could be useful somewhere else + cos_vertices[0] = hlsl::dot(awayFromEdgePlane1, awayFromEdgePlane2); + cos_vertices[1] = hlsl::dot(awayFromEdgePlane2, awayFromEdgePlane0); + cos_vertices[2] = hlsl::dot(awayFromEdgePlane0, awayFromEdgePlane1); + // TODO: above dot products are in the wrong order, either work out which is which, or try all 6 permutations till it works + cos_vertices = hlsl::clamp((cos_sides - cos_sides.yzx * cos_sides.zxy) * csc_sides.yzx * csc_sides.zxy, hlsl::promote(-1.0), hlsl::promote(1.0)); + + matrix awayFromEdgePlane = matrix(awayFromEdgePlane0, awayFromEdgePlane1, awayFromEdgePlane2); + const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal)); + + const vector3_type pyramidAngles = acos(cos_sides); + return hlsl::dot(pyramidAngles, externalProducts) / (2.f * numbers::pi); + } + + vector3_type vertex0; + vector3_type vertex1; + vector3_type vertex2; + vector3_type cos_sides; + vector3_type csc_sides; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/shapes/triangle.hlsl b/include/nbl/builtin/hlsl/shapes/triangle.hlsl index 4677b0e155..b2f4170f70 100644 --- a/include/nbl/builtin/hlsl/shapes/triangle.hlsl +++ b/include/nbl/builtin/hlsl/shapes/triangle.hlsl @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h @@ -18,10 +18,10 @@ namespace shapes namespace util { - // Use this convetion e_i = v_{i+2}-v_{i+1}. vertex index is modulo by 3. - template - vector compInternalAngle(NBL_CONST_REF_ARG(vector) e0, NBL_CONST_REF_ARG(vector) e1, NBL_CONST_REF_ARG(vector) e2) - { +// Use this convetion e_i = v_{i+2}-v_{i+1}. vertex index is modulo by 3. 
+template +vector anglesFromTriangleEdges(const vector e0, vector e1, const vector e2) +{ // Calculate this triangle's weight for each of its three m_vertices // start by calculating the lengths of its sides const float_t a = hlsl::dot(e0, e0); @@ -36,11 +36,11 @@ namespace util const float_t angle2 = hlsl::numbers::pi - (angle0 + angle1); // use them to find the angle at each vertex return vector(angle0, angle1, angle2); - } +} } } } } -#endif +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 02495e2f2e..9190a4ec73 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -347,11 +347,6 @@ template [[vk::ext_instruction(spv::OpAny)]] enable_if_t&& is_same_v::scalar_type, bool>, bool> any(BooleanVector vec); -// If Condition is a vector, ResultType must be a vector with the same number of components. Using (p -> q) = (~p v q) -template && (! 
concepts::Vector || (concepts::Vector && (extent_v == extent_v)))) -[[vk::ext_instruction(spv::OpSelect)]] -ResultType select(Condition condition, ResultType object1, ResultType object2); - template) [[vk::ext_instruction(spv::OpIAddCarry)]] AddCarryOutput addCarry(T operand1, T operand2); diff --git a/include/nbl/builtin/hlsl/testing/relative_approx_compare.hlsl b/include/nbl/builtin/hlsl/testing/relative_approx_compare.hlsl new file mode 100644 index 0000000000..8d32780f93 --- /dev/null +++ b/include/nbl/builtin/hlsl/testing/relative_approx_compare.hlsl @@ -0,0 +1,94 @@ +#ifndef _NBL_BUILTIN_HLSL_TESTING_RELATIVE_APPROX_COMPARE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TESTING_RELATIVE_APPROX_COMPARE_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace testing +{ +namespace impl +{ + +template +struct RelativeApproxCompareHelper; + +template +NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeScalar) +struct RelativeApproxCompareHelper) > +{ + static bool __call(NBL_CONST_REF_ARG(FloatingPoint) lhs, NBL_CONST_REF_ARG(FloatingPoint) rhs, const float64_t maxAllowedDifference) + { + const bool bothAreNaN = nbl::hlsl::isnan(lhs) && nbl::hlsl::isnan(rhs); + const bool bothAreInf = nbl::hlsl::isinf(lhs) && nbl::hlsl::isinf(rhs); + const bool bothHaveSameSign = nbl::hlsl::ieee754::extractSign(lhs) == nbl::hlsl::ieee754::extractSign(rhs); + const bool lhsIsSubnormalOrZero = ieee754::isSubnormal(lhs) || ieee754::isZero(lhs); + const bool rhsIsSubnormalOrZero = ieee754::isSubnormal(rhs) || ieee754::isZero(rhs); + + if (bothAreNaN) + return true; + if (bothAreInf && bothHaveSameSign) + return true; + if (lhsIsSubnormalOrZero && rhsIsSubnormalOrZero) + return true; + if (!lhsIsSubnormalOrZero && rhsIsSubnormalOrZero) + return false; + if (lhsIsSubnormalOrZero && !rhsIsSubnormalOrZero) + return false; + + return hlsl::max(hlsl::abs(lhs / rhs), hlsl::abs(rhs / lhs)) <= 1.f + maxAllowedDifference; + } +}; + +template 
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial) +struct RelativeApproxCompareHelper) > +{ + static bool __call(NBL_CONST_REF_ARG(FloatingPointVector) lhs, NBL_CONST_REF_ARG(FloatingPointVector) rhs, const float64_t maxAllowedDifference) + { + using traits = nbl::hlsl::vector_traits; + for (uint32_t i = 0; i < traits::Dimension; ++i) + { + if (!RelativeApproxCompareHelper::__call(lhs[i], rhs[i], maxAllowedDifference)) + return false; + } + + return true; + } +}; + +template +NBL_PARTIAL_REQ_TOP(concepts::Matricial && concepts::FloatingPointLikeScalar::scalar_type>) +struct RelativeApproxCompareHelper && concepts::FloatingPointLikeScalar::scalar_type>) > +{ + static bool __call(NBL_CONST_REF_ARG(FloatingPointMatrix) lhs, NBL_CONST_REF_ARG(FloatingPointMatrix) rhs, const float64_t maxAllowedDifference) + { + using traits = nbl::hlsl::matrix_traits; + for (uint32_t i = 0; i < traits::RowCount; ++i) + { + if (!RelativeApproxCompareHelper::__call(lhs[i], rhs[i], maxAllowedDifference)) + return false; + } + + return true; + } +}; + +} + +template +bool relativeApproxCompare(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs, const float64_t maxAllowedDifference) +{ + return impl::RelativeApproxCompareHelper::__call(lhs, rhs, maxAllowedDifference); +} + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 652cabd7c7..95315f6e3c 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -18,20 +18,13 @@ struct vector_traits NBL_CONSTEXPR_STATIC_INLINE bool IsVector = false; }; -// i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007 -#define DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(DIMENSION)\ -template \ -struct vector_traits >\ -{\ - using scalar_type = T;\ - NBL_CONSTEXPR_STATIC_INLINE 
uint32_t Dimension = DIMENSION;\ - NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ -};\ - -DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) -DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) -DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) -DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) +template +struct vector_traits > +{ + using scalar_type = T; + NBL_CONSTEXPR_STATIC_INLINE uint32_t Dimension = N; + NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true; +}; } } diff --git a/include/nbl/config/BuildConfigOptions.h.in b/include/nbl/config/BuildConfigOptions.h.in index c67c942217..f544562a57 100644 --- a/include/nbl/config/BuildConfigOptions.h.in +++ b/include/nbl/config/BuildConfigOptions.h.in @@ -59,7 +59,8 @@ #cmakedefine _NBL_BUILD_DPL_ -// ! +#cmakedefine NBL_BUILD_DEBUG_DRAW + // TODO: This has to disapppear from the main header and go to the OptiX extension header + config #cmakedefine OPTIX_INCLUDE_DIR "@OPTIX_INCLUDE_DIR@" diff --git a/include/nbl/ext/DebugDraw/CDrawAABB.h b/include/nbl/ext/DebugDraw/CDrawAABB.h new file mode 100644 index 0000000000..126731f425 --- /dev/null +++ b/include/nbl/ext/DebugDraw/CDrawAABB.h @@ -0,0 +1,242 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_EXT_DEBUG_DRAW_DRAW_AABB_H_ +#define _NBL_EXT_DEBUG_DRAW_DRAW_AABB_H_ + +#include "nbl/video/declarations.h" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/shapes/aabb.hlsl" +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" +#include "nbl/ext/DebugDraw/builtin/hlsl/common.hlsl" + +namespace nbl::ext::debug_draw +{ + class DrawAABB final : public core::IReferenceCounted + { + public: + static constexpr inline uint32_t IndicesCount = 24u; + + enum DrawMode : uint16_t + { + ADM_DRAW_SINGLE = 0b01, + ADM_DRAW_BATCH = 0b10, + ADM_DRAW_BOTH = 0b11 + }; + + struct SCachedCreationParameters + { + using streaming_buffer_t = video::StreamingTransientDataBufferST>; + + static constexpr inline auto RequiredAllocateFlags = core::bitflag(video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + static constexpr inline auto RequiredUsageFlags = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + DrawMode drawMode = ADM_DRAW_BOTH; + + core::smart_refctd_ptr utilities; + + //! 
optional, default MDI buffer allocated if not provided + core::smart_refctd_ptr streamingBuffer = nullptr; + }; + + struct SCreationParameters : SCachedCreationParameters + { + video::IQueue* transfer = nullptr; // only used to make the 24 element index buffer and instanced pipeline on create + core::smart_refctd_ptr assetManager = nullptr; + + core::smart_refctd_ptr singlePipelineLayout = nullptr; + core::smart_refctd_ptr batchPipelineLayout = nullptr; + core::smart_refctd_ptr renderpass = nullptr; + + inline bool validate() const + { + const auto validation = std::to_array + ({ + std::make_pair(bool(assetManager), "Invalid `creationParams.assetManager` is nullptr!"), + std::make_pair(bool(utilities), "Invalid `creationParams.utilities` is nullptr!"), + std::make_pair(bool(transfer), "Invalid `creationParams.transfer` is nullptr!"), + std::make_pair(bool(renderpass), "Invalid `creationParams.renderpass` is nullptr!"), + std::make_pair(bool(utilities->getLogicalDevice()->getPhysicalDevice()->getQueueFamilyProperties()[transfer->getFamilyIndex()].queueFlags.hasFlags(video::IQueue::FAMILY_FLAGS::TRANSFER_BIT)), "Invalid `creationParams.transfer` is not capable of transfer operations!") + }); + + system::logger_opt_ptr logger = utilities->getLogger(); + for (const auto& [ok, error] : validation) + if (!ok) + { + logger.log(error, system::ILogger::ELL_ERROR); + return false; + } + + assert(bool(assetManager->getSystem())); + + return true; + } + }; + + struct DrawParameters + { + video::IGPUCommandBuffer* commandBuffer = nullptr; + hlsl::float32_t4x4 cameraMat; + float lineWidth = 1.f; + }; + + // creates an instance that can draw one AABB via push constant or multiple using streaming buffer + static core::smart_refctd_ptr create(SCreationParameters&& params); + + // creates pipeline layout from push constant range + static core::smart_refctd_ptr createPipelineLayoutFromPCRange(video::ILogicalDevice* device, const asset::SPushConstantRange& pcRange); + + // creates 
default pipeline layout for pipeline specified by draw mode (note: if mode==BOTH, returns layout for BATCH mode) + static core::smart_refctd_ptr createDefaultPipelineLayout(video::ILogicalDevice* device, DrawMode mode = ADM_DRAW_BATCH); + + //! mounts the extension's archive to given system - useful if you want to create your own shaders with common header included + static const core::smart_refctd_ptr mount(core::smart_refctd_ptr logger, system::ISystem* system, video::ILogicalDevice* device, const std::string_view archiveAlias = ""); + + inline const SCachedCreationParameters& getCreationParameters() const { return m_cachedCreationParams; } + + // records draw command for single AABB, user has to set pipeline outside + bool renderSingle(const DrawParameters& params, const hlsl::shapes::AABB<3, float>& aabb, const hlsl::float32_t4& color); + + // records draw command for rendering batch of AABB instances as InstanceData + // user has to set span of filled-in InstanceData; camera matrix used in push constant + inline bool render(const DrawParameters& params, video::ISemaphore::SWaitInfo waitInfo, std::span aabbInstances) + { + system::logger_opt_ptr logger = m_cachedCreationParams.utilities->getLogger(); + if (!(m_cachedCreationParams.drawMode & ADM_DRAW_BATCH)) + { + logger.log("DrawAABB has not been enabled for draw batches!", system::ILogger::ELL_ERROR); + return false; + } + + using offset_t = SCachedCreationParameters::streaming_buffer_t::size_type; + constexpr offset_t MaxAlignment = sizeof(InstanceData); + // allocator initialization needs us to round up to PoT + const auto MaxPOTAlignment = hlsl::roundUpToPoT(MaxAlignment); + auto* streaming = m_cachedCreationParams.streamingBuffer.get(); + if (streaming->getAddressAllocator().max_alignment() < MaxPOTAlignment) + { + logger.log("Draw AABB Streaming Buffer cannot guarantee the alignments we require!"); + return false; + } + + auto* const streamingPtr = reinterpret_cast(streaming->getBufferPointer()); + 
assert(streamingPtr); + + auto& commandBuffer = params.commandBuffer; + commandBuffer->bindGraphicsPipeline(m_batchPipeline.get()); + commandBuffer->setLineWidth(params.lineWidth); + asset::SBufferBinding indexBinding = { .offset = 0, .buffer = m_indicesBuffer }; + commandBuffer->bindIndexBuffer(indexBinding, asset::EIT_32BIT); + + auto srcIt = aabbInstances.begin(); + auto setInstancesRange = [&](InstanceData* data, uint32_t count) -> void { + for (uint32_t i = 0; i < count; i++) + { + auto inst = data + i; + *inst = *srcIt; + inst->transform = hlsl::mul(params.cameraMat, inst->transform); + srcIt++; + + if (srcIt == aabbInstances.end()) + break; + } + }; + + const uint32_t numInstances = aabbInstances.size(); + uint32_t remainingInstancesBytes = numInstances * sizeof(InstanceData); + while (srcIt != aabbInstances.end()) + { + uint32_t blockByteSize = core::alignUp(remainingInstancesBytes, MaxAlignment); + bool allocated = false; + + offset_t blockOffset = SCachedCreationParameters::streaming_buffer_t::invalid_value; + const uint32_t smallestAlloc = hlsl::max(core::alignUp(sizeof(InstanceData), MaxAlignment), streaming->getAddressAllocator().min_size()); + while (blockByteSize >= smallestAlloc) + { + std::chrono::steady_clock::time_point waitTill = std::chrono::steady_clock::now() + std::chrono::milliseconds(1u); + if (streaming->multi_allocate(waitTill, 1, &blockOffset, &blockByteSize, &MaxAlignment) == 0u) + { + allocated = true; + break; + } + + streaming->cull_frees(); + blockByteSize >>= 1; + } + + if (!allocated) + { + logger.log("Failed to allocate a chunk from streaming buffer for the next drawcall batch.", system::ILogger::ELL_ERROR); + return false; + } + + const uint32_t instanceCount = blockByteSize / sizeof(InstanceData); + auto* const streamingInstancesPtr = reinterpret_cast(streamingPtr + blockOffset); + setInstancesRange(streamingInstancesPtr, instanceCount); + + if (streaming->needsManualFlushOrInvalidate()) + { + const 
video::ILogicalDevice::MappedMemoryRange flushRange(streaming->getBuffer()->getBoundMemory().memory, blockOffset, blockByteSize); + m_cachedCreationParams.utilities->getLogicalDevice()->flushMappedMemoryRanges(1, &flushRange); + } + + remainingInstancesBytes -= instanceCount * sizeof(InstanceData); + + SInstancedPC pc; + pc.pInstanceBuffer = m_cachedCreationParams.streamingBuffer->getBuffer()->getDeviceAddress() + blockOffset; + + commandBuffer->pushConstants(m_batchPipeline->getLayout(), asset::IShader::E_SHADER_STAGE::ESS_VERTEX, offsetof(ext::debug_draw::PushConstants, ipc), sizeof(SInstancedPC), &pc); + commandBuffer->drawIndexed(IndicesCount, instanceCount, 0, 0, 0); + + streaming->multi_deallocate(1, &blockOffset, &blockByteSize, waitInfo); + } + + return true; + } + + static inline hlsl::float32_t3x4 getTransformFromAABB(const hlsl::shapes::AABB<3, float>& aabb) + { + const auto diagonal = aabb.getExtent(); + hlsl::float32_t3x4 transform; + transform[0][3] = aabb.minVx.x; + transform[1][3] = aabb.minVx.y; + transform[2][3] = aabb.minVx.z; + transform[0][0] = diagonal.x; + transform[1][1] = diagonal.y; + transform[2][2] = diagonal.z; + return transform; + } + + protected: + struct ConstructorParams + { + SCachedCreationParameters creationParams; + core::smart_refctd_ptr singlePipeline = nullptr; + core::smart_refctd_ptr batchPipeline = nullptr; + core::smart_refctd_ptr indicesBuffer = nullptr; + }; + + DrawAABB(ConstructorParams&& params) : + m_cachedCreationParams(std::move(params.creationParams)), + m_singlePipeline(std::move(params.singlePipeline)), + m_batchPipeline(std::move(params.batchPipeline)), + m_indicesBuffer(std::move(params.indicesBuffer)) + {} + ~DrawAABB() override {} + + private: + static core::smart_refctd_ptr createPipeline(SCreationParameters& params, const video::IGPUPipelineLayout* pipelineLayout, const DrawMode mode); + static bool createStreamingBuffer(SCreationParameters& params); + static core::smart_refctd_ptr 
createIndicesBuffer(SCreationParameters& params); + + core::smart_refctd_ptr m_indicesBuffer; + + SCachedCreationParameters m_cachedCreationParams; + + core::smart_refctd_ptr m_singlePipeline; + core::smart_refctd_ptr m_batchPipeline; +}; +} + +#endif diff --git a/include/nbl/ext/DebugDraw/builtin/hlsl/common.hlsl b/include/nbl/ext/DebugDraw/builtin/hlsl/common.hlsl new file mode 100644 index 0000000000..b665c9d43a --- /dev/null +++ b/include/nbl/ext/DebugDraw/builtin/hlsl/common.hlsl @@ -0,0 +1,56 @@ +#ifndef _NBL_DEBUG_DRAW_EXT_COMMON_HLSL +#define _NBL_DEBUG_DRAW_EXT_COMMON_HLSL + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#ifdef __HLSL_VERSION +#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/bda/__ptr.hlsl" +#endif + +namespace nbl +{ +namespace ext +{ +namespace debug_draw +{ + +struct InstanceData +{ + hlsl::float32_t4x4 transform; + hlsl::float32_t4 color; +}; + +struct SSinglePC +{ + InstanceData instance; +}; + +struct SInstancedPC +{ + uint64_t pInstanceBuffer; +}; + +struct PushConstants +{ + SSinglePC spc; + SInstancedPC ipc; +}; + +#ifdef __HLSL_VERSION +struct PSInput +{ + float32_t4 position : SV_Position; + nointerpolation float32_t4 color : TEXCOORD0; +}; + +float32_t3 getUnitAABBVertex() +{ + return (hlsl::promote(hlsl::glsl::gl_VertexIndex()) >> uint32_t3(0,2,1)) & 0x1u; +} +#endif + +} +} +} +#endif diff --git a/include/nbl/ext/DebugDraw/builtin/hlsl/draw_aabb.unified.hlsl b/include/nbl/ext/DebugDraw/builtin/hlsl/draw_aabb.unified.hlsl new file mode 100644 index 0000000000..0b51f7de53 --- /dev/null +++ b/include/nbl/ext/DebugDraw/builtin/hlsl/draw_aabb.unified.hlsl @@ -0,0 +1,39 @@ +#include "nbl/ext/DebugDraw/builtin/hlsl/common.hlsl" + +using namespace nbl::hlsl; +using namespace nbl::ext::debug_draw; + +[[vk::push_constant]] PushConstants pc; + +[shader("vertex")] +PSInput aabb_vertex_single() +{ + PSInput output; + float32_t3 vertex = 
getUnitAABBVertex(); + + output.position = math::linalg::promoted_mul(pc.spc.instance.transform, vertex); + output.color = pc.spc.instance.color; + + return output; +} + +[shader("vertex")] +PSInput aabb_vertex_instances() +{ + PSInput output; + const float32_t3 vertex = getUnitAABBVertex(); + InstanceData instance = vk::BufferPointer(pc.ipc.pInstanceBuffer + sizeof(InstanceData) * glsl::gl_InstanceIndex()).Get(); + + output.position = math::linalg::promoted_mul(instance.transform, vertex); + output.color = instance.color; + + return output; +} + +[shader("pixel")] +float32_t4 aabb_fragment(PSInput input) : SV_TARGET +{ + float32_t4 outColor = input.color; + + return outColor; +} diff --git a/include/nbl/system/CStdoutLogger.h b/include/nbl/system/CStdoutLogger.h index 24693edd61..a63b8cf567 100644 --- a/include/nbl/system/CStdoutLogger.h +++ b/include/nbl/system/CStdoutLogger.h @@ -15,7 +15,7 @@ class CStdoutLogger : public IThreadsafeLogger protected: virtual void threadsafeLog_impl(const std::string_view& fmt, E_LOG_LEVEL logLevel, va_list args) override { - printf(constructLogString(fmt, logLevel, args).data()); + printf("%s", constructLogString(fmt, logLevel, args).data()); fflush(stdout); } diff --git a/include/nbl/system/ISystem.h b/include/nbl/system/ISystem.h index 4e02221d7c..65f0351582 100644 --- a/include/nbl/system/ISystem.h +++ b/include/nbl/system/ISystem.h @@ -70,6 +70,7 @@ class NBL_API2 ISystem : public core::IReferenceCounted // virtual inline bool isDirectory(const system::path& p) const { + // TODO: fix bug, input "nbl/ext/DebugDraw/builtin/hlsl" -> returs true when no such dir present in mounted stuff due to how it uses parent paths in loop (goes up up till matches "nbl" builtin archive and thinks it resolved the requested dir) if (isPathReadOnly(p)) return p.extension()==""; // TODO: this is a temporary decision until we figure out how to check if a file is directory in android APK else diff --git a/include/nbl/system/to_string.h 
b/include/nbl/system/to_string.h index 92888704c0..c055434fa4 100644 --- a/include/nbl/system/to_string.h +++ b/include/nbl/system/to_string.h @@ -2,6 +2,8 @@ #define _NBL_SYSTEM_TO_STRING_INCLUDED_ #include +#include +#include namespace nbl { @@ -19,6 +21,24 @@ struct to_string_helper } }; +template<> +struct to_string_helper +{ + static std::string __call(const hlsl::emulated_uint64_t& value) + { + return std::to_string(static_cast(value)); + } +}; + +template<> +struct to_string_helper +{ + static std::string __call(const hlsl::emulated_int64_t& value) + { + return std::to_string(static_cast(value)); + } +}; + template struct to_string_helper> { @@ -39,6 +59,35 @@ struct to_string_helper> } }; +template +struct to_string_helper> +{ + static std::string __call(const hlsl::matrix& matrix) + { + std::stringstream output; + output << '\n'; + for (int i = 0; i < N; ++i) + { + output << "{ "; + for (int j = 0; j < M; ++j) + output << matrix[i][j] << ", "; + output << "}\n"; + } + return output.str(); + } +}; + +template +struct to_string_helper> +{ + using value_t = hlsl::morton::code; + static std::string __call(value_t value) + { + return to_string_helper::__call(value.value); + } +}; + + } template diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index bb6460754a..3290bd916a 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -328,8 +328,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject bool copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); //! 
state setup - bool bindComputePipeline(const IGPUComputePipeline* const pipeline); bool bindGraphicsPipeline(const IGPUGraphicsPipeline* const pipeline); + bool bindComputePipeline(const IGPUComputePipeline* const pipeline); + bool bindMeshPipeline(const IGPUMeshPipeline* const pipeline); bool bindRayTracingPipeline(const IGPURayTracingPipeline* const pipeline); bool bindDescriptorSets( const asset::E_PIPELINE_BIND_POINT pipelineBindPoint, const IGPUPipelineLayout* const layout, @@ -442,6 +443,14 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject } bool dispatchIndirect(const asset::SBufferBinding& binding); + bool drawMeshTasks(const uint32_t groupCountX, const uint32_t groupCountY = 1, const uint32_t groupCountZ = 1); + template requires std::is_integral_v + bool drawMeshTasks(const hlsl::vector groupCount) + { + return drawMeshTasks(groupCount.x, groupCount.y, groupCount.z); + } + bool drawMeshTasksIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride); + //! 
Begin/End RenderPasses struct SRenderpassBeginInfo { @@ -587,6 +596,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject inline const core::unordered_map& getBoundDescriptorSetsRecord() const { return m_boundDescriptorSetsRecord; } const IGPUGraphicsPipeline* getBoundGraphicsPipeline() const { return m_boundGraphicsPipeline; } const IGPUComputePipeline* getBoundComputePipeline() const { return m_boundComputePipeline; } + const IGPUMeshPipeline* getBoundMeshPipeline() const { return m_boundMeshPipeline; } const IGPURayTracingPipeline* getBoundRayTracingPipeline() const { return m_boundRayTracingPipeline; } protected: @@ -670,8 +680,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; virtual bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; - virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; virtual bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) = 0; + virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; + virtual bool bindMeshPipeline_impl(const IGPUMeshPipeline* const pipeline) = 0; virtual bool bindRayTracingPipeline_impl(const IGPURayTracingPipeline* const pipeline) = 0; virtual bool bindDescriptorSets_impl( const asset::E_PIPELINE_BIND_POINT pipelineBindPoint, const IGPUPipelineLayout* const layout, @@ -702,6 +713,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject virtual bool dispatch_impl(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) = 0; virtual bool dispatchIndirect_impl(const asset::SBufferBinding& binding) = 0; + virtual bool drawMeshTasks_impl(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) = 0; + virtual bool drawMeshTasksIndirect_impl(const asset::SBufferBinding& 
binding, const uint32_t drawCount, const uint32_t stride) = 0; + virtual bool beginRenderPass_impl(const SRenderpassBeginInfo& info, SUBPASS_CONTENTS contents) = 0; virtual bool nextSubpass_impl(const SUBPASS_CONTENTS contents) = 0; virtual bool endRenderPass_impl() = 0; @@ -750,9 +764,10 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject m_boundDescriptorSetsRecord.clear(); m_TLASTrackingOps.clear(); - m_boundGraphicsPipeline= nullptr; - m_boundComputePipeline= nullptr; - m_boundRayTracingPipeline= nullptr; + m_boundGraphicsPipeline = nullptr; + m_boundComputePipeline = nullptr; + m_boundMeshPipeline = nullptr; + m_boundRayTracingPipeline = nullptr; m_haveRtPipelineStackSize = false; m_commandList.head = nullptr; @@ -770,6 +785,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; + m_boundMeshPipeline = nullptr; m_boundRayTracingPipeline= nullptr; m_haveRtPipelineStackSize = false; releaseResourcesBackToPool_impl(); @@ -931,6 +947,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; + const IGPUMeshPipeline* m_boundMeshPipeline; const IGPURayTracingPipeline* m_boundRayTracingPipeline; IGPUCommandPool::CCommandSegmentListPool::SCommandSegmentList m_commandList = {}; diff --git a/include/nbl/video/IGPUCommandPool.h b/include/nbl/video/IGPUCommandPool.h index 0424ad83bd..c44152130f 100644 --- a/include/nbl/video/IGPUCommandPool.h +++ b/include/nbl/video/IGPUCommandPool.h @@ -8,8 +8,9 @@ #include "nbl/video/IEvent.h" #include "nbl/video/IGPUDescriptorSet.h" -#include "nbl/video/IGPUComputePipeline.h" #include "nbl/video/IGPUGraphicsPipeline.h" +#include "nbl/video/IGPUComputePipeline.h" +#include "nbl/video/IGPUMeshPipeline.h" #include "nbl/video/IGPURayTracingPipeline.h" #include "nbl/video/IGPUFramebuffer.h" #include "nbl/video/IQueryPool.h" @@ 
-125,7 +126,6 @@ class IGPUCommandPool : public IBackendObject class CBeginRenderPassCmd; class CPipelineBarrierCmd; class CBindDescriptorSetsCmd; - class CBindComputePipelineCmd; class CUpdateBufferCmd; class CResetQueryPoolCmd; class CWriteTimestampCmd; @@ -133,6 +133,9 @@ class IGPUCommandPool : public IBackendObject class CEndQueryCmd; class CCopyQueryPoolResultsCmd; class CBindGraphicsPipelineCmd; + class CBindComputePipelineCmd; + class CBindMeshPipelineCmd; + class CBindRayTracingPipelineCmd; class CPushConstantsCmd; class CBindVertexBuffersCmd; class CCopyBufferCmd; @@ -155,7 +158,6 @@ class IGPUCommandPool : public IBackendObject class CCopyAccelerationStructureToOrFromMemoryCmd; // for both vkCmdCopyAccelerationStructureToMemoryKHR and vkCmdCopyMemoryToAccelerationStructureKHR class CTraceRaysCmd; class CTraceRaysIndirectCmd; - class CBindRayTracingPipelineCmd; protected: IGPUCommandPool(core::smart_refctd_ptr&& dev, const core::bitflag _flags, const uint8_t _familyIx) @@ -529,15 +531,6 @@ class IGPUCommandPool::CBindDescriptorSetsCmd final : public IFixedSizeCommand m_sets[IGPUPipelineLayout::DESCRIPTOR_SET_COUNT]; }; -class IGPUCommandPool::CBindComputePipelineCmd final : public IFixedSizeCommand -{ - public: - CBindComputePipelineCmd(core::smart_refctd_ptr&& pipeline) : m_pipeline(std::move(pipeline)) {} - - private: - core::smart_refctd_ptr m_pipeline; -}; - class IGPUCommandPool::CUpdateBufferCmd final : public IFixedSizeCommand { public: @@ -604,6 +597,24 @@ class IGPUCommandPool::CBindGraphicsPipelineCmd final : public IFixedSizeCommand core::smart_refctd_ptr m_pipeline; }; +class IGPUCommandPool::CBindComputePipelineCmd final : public IFixedSizeCommand +{ +public: + CBindComputePipelineCmd(core::smart_refctd_ptr&& pipeline) : m_pipeline(std::move(pipeline)) {} + +private: + core::smart_refctd_ptr m_pipeline; +}; + +class IGPUCommandPool::CBindMeshPipelineCmd final : public IFixedSizeCommand +{ +public: + 
CBindMeshPipelineCmd(core::smart_refctd_ptr&& pipeline) : m_pipeline(std::move(pipeline)) {} + +private: + core::smart_refctd_ptr m_pipeline; +}; + class IGPUCommandPool::CPushConstantsCmd final : public IFixedSizeCommand { public: diff --git a/include/nbl/video/IGPUMeshPipeline.h b/include/nbl/video/IGPUMeshPipeline.h new file mode 100644 index 0000000000..794eb68ee1 --- /dev/null +++ b/include/nbl/video/IGPUMeshPipeline.h @@ -0,0 +1,167 @@ +#ifndef _NBL_I_GPU_MESH_PIPELINE_H_INCLUDED_ +#define _NBL_I_GPU_MESH_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IMeshPipeline.h" + +#include "nbl/video/IGPUPipelineLayout.h" +#include "nbl/video/IGPURenderpass.h" +#include "nbl/video/IGPUPipeline.h" + +//related spec + +//i feel like this MIGHT get stuffed into graphicspipeline but idk + +/* +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-PrimitiveId-06264 +** If the pipeline requires pre-rasterization shader state, it includes a mesh shader and the fragment shader code reads from an input variable that is decorated with PrimitiveId, then the mesh shader code must write to a matching output variable, decorated with PrimitiveId, in all execution paths + +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-07064 +* If renderPass is not VK_NULL_HANDLE, the pipeline is being created with pre-rasterization shader state, subpass viewMask is not 0, and multiviewMeshShader is not enabled, then pStages must not include a mesh shader + +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-None-02322 +* If the pipeline requires pre-rasterization shader state, and there are any mesh shader stages in the pipeline there must not be any shader stage in the pipeline with a Xfb execution mode +*** whats a xfb + 
+https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-shaderMeshEnqueue-10187 +* If the shaderMeshEnqueue feature is not enabled, shaders specified by pStages must not declare the ShaderEnqueueAMDX capability +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-flags-10188 +* If flags does not include VK_PIPELINE_CREATE_LIBRARY_BIT_KHR, shaders specified by pStages must not declare the ShaderEnqueueAMDX capability +*** my understanding is nabla strictly controls it's extensions, so this shouldnt be an issue + +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pDynamicStates-07065 +* If the pipeline requires pre-rasterization shader state, and includes a mesh shader, there must be no element of the +* pDynamicStates member of pDynamicState set to VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY, or VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE +*** this one seems the most relevant + +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pDynamicStates-07066 +* If the pipeline requires pre-rasterization shader state, and includes a mesh shader, there must be no element of the +* pDynamicStates member of pDynamicState set to VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, or VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT + +https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pDynamicStates-07067 +* If the pipeline requires pre-rasterization shader state, and includes a mesh shader, there must be no element of the pDynamicStates member of pDynamicState set to VK_DYNAMIC_STATE_VERTEX_INPUT_EXT + 
+https://registry.khronos.org/vulkan/specs/latest/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-07720 +* If renderPass is VK_NULL_HANDLE, the pipeline is being created with pre-rasterization shader state, and +* VkPipelineRenderingCreateInfo::viewMask is not 0, and multiviewMeshShader is not enabled, then pStages must not include a mesh shader + + +* theres 1 or 2 more about pipeline libraries, but im not going to worry about that +*/ + +namespace nbl::video +{ + + class IGPUMeshPipeline : public IGPUPipeline> + { + using pipeline_t = asset::IMeshPipeline; + + public: + struct SCreationParams final : public SPipelineCreationParams + { + public: +#define base_flag(F) static_cast(pipeline_t::FLAGS::F) + enum class FLAGS : uint64_t + { + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + VIEW_INDEX_FROM_DEVICE_INDEX = 1 << 3, + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + }; +#undef base_flag + + inline SSpecializationValidationResult valid() const + { + //this seems like the place to check if the mesh extension exists, but the raytracing pipeline doesnt do it here + if (!layout) + return {}; + SSpecializationValidationResult retval = { .count = 0,.dataSize = 0 }; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!renderpass || cached.subpassIx >= renderpass->getSubpassCount()) + return {}; + + // TODO: check rasterization samples, etc. 
+ //rp->getCreationParameters().subpasses[i] + + core::bitflag stagePresence = {}; + + auto processSpecInfo = [&](const SShaderSpecInfo& specInfo, hlsl::ShaderStage stage) + { + if (!specInfo.shader) return true; + if (!specInfo.accumulateSpecializationValidationResult(&retval)) return false; + stagePresence |= stage; + return true; + }; + if (!processSpecInfo(taskShader, hlsl::ShaderStage::ESS_TASK)) return {}; + if (!processSpecInfo(meshShader, hlsl::ShaderStage::ESS_MESH)) return {}; + if (!processSpecInfo(fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT)) return {}; + + if (!hasRequiredStages(stagePresence)) + return {}; + + //if (!vertexShader.shader) return {}; //i dont quite understand why this line was in IGPUGraphics. checking if the shader itself was made correctly? + + return retval; + } + + inline core::bitflag getRequiredSubgroupStages() const + { + + core::bitflag stages = {}; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(taskShader, hlsl::ESS_TASK); + processSpecInfo(meshShader, hlsl::ESS_MESH); + processSpecInfo(fragmentShader, hlsl::ESS_FRAGMENT); + return stages; + } + + inline core::bitflag& getFlags() { return flags; } + + inline core::bitflag getFlags() const { return flags; } + + const IGPUPipelineLayout* layout = nullptr; + SShaderSpecInfo taskShader; + SShaderSpecInfo meshShader; + SShaderSpecInfo fragmentShader; + SCachedCreationParams cached = {}; + renderpass_t* renderpass = nullptr; + + // TODO: Could guess the required flags from SPIR-V introspection of declared caps + core::bitflag flags = FLAGS::NONE; + + inline uint32_t getShaderCount() const + { + uint32_t count = 0; //count = 2 and only check task shader?? 
+ count += (taskShader.shader != nullptr); + count += (meshShader.shader != nullptr); + count += (fragmentShader.shader != nullptr); + return count; + } + }; + + inline core::bitflag getCreationFlags() const { return m_flags; } + + // Vulkan: const VkPipeline* + virtual const void* getNativeHandle() const = 0; + + protected: + // not explicit? + IGPUMeshPipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) + { + } + virtual ~IGPUMeshPipeline() override = default; + + const core::bitflag m_flags; + }; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 180342e2d4..0a8c79053c 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1020,7 +1020,11 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return createPipelineCache(initialData,notThreadsafe); } - bool createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output); + bool createComputePipelines( + IGPUPipelineCache* const pipelineCache, + const std::span params, + core::smart_refctd_ptr* const output + ); bool createGraphicsPipelines( IGPUPipelineCache* const pipelineCache, @@ -1028,9 +1032,17 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe core::smart_refctd_ptr* const output ); - bool createRayTracingPipelines(IGPUPipelineCache* const pipelineCache, - const std::span params, - core::smart_refctd_ptr* const output); + bool createRayTracingPipelines( + IGPUPipelineCache* const pipelineCache, + const std::span params, + core::smart_refctd_ptr* const output + ); + + bool createMeshPipelines( + IGPUPipelineCache* const pipelineCache, + const std::span params, + core::smart_refctd_ptr* const output + ); // queries inline 
core::smart_refctd_ptr createQueryPool(const IQueryPool::SCreationParams& params) @@ -1282,12 +1294,18 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe core::smart_refctd_ptr* const output, const SSpecializationValidationResult& validation ) = 0; - virtual void createGraphicsPipelines_impl( + virtual void createGraphicsPipelines_impl ( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, const SSpecializationValidationResult& validation ) = 0; + virtual void createMeshPipelines_impl( + IGPUPipelineCache* const pipelineCache, + const std::span params, + core::smart_refctd_ptr* const output, + const SSpecializationValidationResult& validation + ) = 0; virtual void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h index c4a6c25ca5..c4279a5cad 100644 --- a/include/nbl/video/asset_traits.h +++ b/include/nbl/video/asset_traits.h @@ -9,10 +9,6 @@ #include "nbl/video/IGPUBufferView.h" #include "nbl/asset/ICPUDescriptorSet.h" #include "nbl/video/IGPUDescriptorSet.h" -#include "nbl/asset/ICPUComputePipeline.h" -#include "nbl/video/IGPUComputePipeline.h" -#include "nbl/asset/ICPUGraphicsPipeline.h" -#include "nbl/video/IGPUGraphicsPipeline.h" #include "nbl/asset/ICPUSampler.h" #include "nbl/video/IGPUSampler.h" #include "nbl/asset/ICPUImageView.h" @@ -21,6 +17,12 @@ #include "nbl/video/IGPUAccelerationStructure.h" #include "nbl/asset/ICPUPolygonGeometry.h" #include "nbl/video/IGPUPolygonGeometry.h" +#include "nbl/asset/ICPUGraphicsPipeline.h" +#include "nbl/video/IGPUGraphicsPipeline.h" +#include "nbl/asset/ICPUComputePipeline.h" +#include "nbl/video/IGPUComputePipeline.h" +#include "nbl/asset/ICPUMeshPipeline.h" +#include "nbl/video/IGPUMeshPipeline.h" #include "nbl/asset/ICPURayTracingPipeline.h" #include "nbl/video/IGPURayTracingPipeline.h" @@ -136,6 +138,19 @@ 
struct asset_traits using lookup_t = const video_t*; }; +template<> +struct asset_traits { + //the asset type + using asset_t = asset::ICPUMeshPipeline; + // we reference a pipeline layout and a renderpass + constexpr static inline bool HasChildren = true; + // the video type + using video_t = IGPUGraphicsPipeline; + // lookup type + using lookup_t = const video_t*; +}; + + template<> struct asset_traits diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 76e046848c..3bc379c08f 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -266,7 +266,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanDescriptorSetLayout.cpp video/CVulkanPipelineLayout.cpp video/CVulkanPipelineCache.cpp - video/CVulkanComputePipeline.cpp video/CVulkanDescriptorPool.cpp video/CVulkanDescriptorSet.cpp video/CVulkanMemoryAllocation.cpp @@ -279,6 +278,8 @@ set(NBL_VIDEO_SOURCES video/CVulkanConnection.cpp video/CVulkanPhysicalDevice.cpp video/CVulkanGraphicsPipeline.cpp + video/CVulkanComputePipeline.cpp + video/CVulkanMeshPipeline.cpp video/CVulkanRayTracingPipeline.cpp video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp diff --git a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp index d4b9a3e394..b538f75eb3 100644 --- a/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp +++ b/src/nbl/asset/interchange/CGraphicsPipelineLoaderMTL.cpp @@ -894,7 +894,7 @@ auto CGraphicsPipelineLoaderMTL::readMaterials(system::IFile* _file, const syste case 'f': // Tf - Transmitivity currMaterial->params.transmissionFilter = readRGB(); sprintf(tmpbuf, "%s, %s: Detected Tf parameter, it won't be used in generated shader - fallback to alpha=0.5 instead", _file->getFileName().string().c_str(), currMaterial->name.c_str()); - logger.log(tmpbuf, system::ILogger::ELL_WARNING); + logger.log("%s", system::ILogger::ELL_WARNING, tmpbuf); break; case 'r': // Tr, transparency = 1.0-d currMaterial->params.opacity = (1.f - readFloat()); diff 
--git a/src/nbl/asset/interchange/CImageLoaderJPG.cpp b/src/nbl/asset/interchange/CImageLoaderJPG.cpp index 45677ff5cf..1db5e16ac2 100644 --- a/src/nbl/asset/interchange/CImageLoaderJPG.cpp +++ b/src/nbl/asset/interchange/CImageLoaderJPG.cpp @@ -93,7 +93,7 @@ namespace jpeg std::string errMsg("JPEG FATAL ERROR in "); auto ctx = reinterpret_cast(cinfo->client_data); errMsg += ctx->filename; - ctx->logger.log(errMsg + temp1, system::ILogger::ELL_ERROR); + ctx->logger.log("%s", system::ILogger::ELL_ERROR, errMsg + temp1); } /* Initialize source. This is called by jpeg_read_header() before any diff --git a/src/nbl/asset/utils/CHLSLCompiler.cpp b/src/nbl/asset/utils/CHLSLCompiler.cpp index 306d2f60de..d36ecfa1cb 100644 --- a/src/nbl/asset/utils/CHLSLCompiler.cpp +++ b/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -115,11 +115,11 @@ static bool fixup_spirv_target_ver(std::vector& arguments, system: const auto found = AllowedSuffices.find(suffix); if (found!=AllowedSuffices.end()) return true; - logger.log("Compile flag error: Required compile flag not found -fspv-target-env=. Force enabling -fspv-target-env= found but with unsupported value `%s`.", system::ILogger::ELL_ERROR, "TODO: write wchar to char convert usage"); + logger.log("Compile flag warning: Required compile flag not found -fspv-target-env=. Force enabling -fspv-target-env= found but with unsupported value `%s`.", system::ILogger::ELL_ERROR, "TODO: write wchar to char convert usage"); return false; } - logger.log("Compile flag error: Required compile flag not found -fspv-target-env=. Force enabling -fspv-target-env=vulkan1.3, as it is required by Nabla.", system::ILogger::ELL_WARNING); + logger.log("Compile flag warning: Required compile flag not found -fspv-target-env=. 
Force enabling -fspv-target-env=vulkan1.3, as it is required by Nabla.", system::ILogger::ELL_WARNING); arguments.push_back(L"-fspv-target-env=vulkan1.3"); return true; } @@ -148,7 +148,7 @@ static void try_upgrade_hlsl_version(std::vector& arguments, syste } else { - logger.log("Compile flag error: Required compile flag not found -HV. Force enabling -HV 202x, as it is required by Nabla.", system::ILogger::ELL_WARNING); + logger.log("Compile flag warning: Required compile flag not found -HV. Force enabling -HV 202x, as it is required by Nabla.", system::ILogger::ELL_WARNING); arguments.push_back(L"-HV"); arguments.push_back(L"202x"); } @@ -254,7 +254,7 @@ static void add_required_arguments_if_not_present(std::vector& arg { bool missing = set.find(required[j]) == set.end(); if (missing) { - logger.log("Compile flag error: Required compile flag not found %ls. This flag will be force enabled, as it is required by Nabla.", system::ILogger::ELL_WARNING, required[j]); + logger.log("Compile flag warning: Required compile flag not found %ls. 
This flag will be force enabled, as it is required by Nabla.", system::ILogger::ELL_WARNING, required[j]); arguments.push_back(required[j]); } } @@ -534,4 +534,4 @@ void CHLSLCompiler::insertIntoStart(std::string& code, std::ostringstream&& ins) code.insert(0u, ins.str()); } -#endif \ No newline at end of file +#endif diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 4ac78066a7..818fbc584b 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -1054,7 +1054,7 @@ void CSPIRVIntrospector::CStageIntrospectionData::debugPrint(system::ILogger* lo } } - logger->log(debug.str() + '\n'); + logger->log("%s", system::ILogger::ELL_DEBUG, debug.str() + '\n'); } } \ No newline at end of file diff --git a/src/nbl/asset/utils/CSmoothNormalGenerator.cpp b/src/nbl/asset/utils/CSmoothNormalGenerator.cpp index 8c03ad99b9..f8bc45a317 100644 --- a/src/nbl/asset/utils/CSmoothNormalGenerator.cpp +++ b/src/nbl/asset/utils/CSmoothNormalGenerator.cpp @@ -58,7 +58,7 @@ CSmoothNormalGenerator::VertexHashMap CSmoothNormalGenerator::setupData(const as const auto faceNormal = normalize(cross(v1 - v0, v2 - v0)); //set data for m_vertices - const auto angleWages = hlsl::shapes::util::compInternalAngle(v2 - v1, v0 - v2, v1 - v2); + const auto angleWages = hlsl::shapes::util::anglesFromTriangleEdges(v2 - v1, v0 - v2, v1 - v2); vertices.add({ i, 0, faceNormal * angleWages.x, v0}); vertices.add({ i + 1, 0, faceNormal * angleWages.y,v1}); diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 75cb681d36..816f49fd73 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -229,6 +229,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/geometry.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/intutil.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/polar.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED 
"hlsl/math/angle_adding.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quaternions.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quadratic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/cubic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl") @@ -253,12 +254,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/ellipse.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/line.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/beziers.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/triangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_triangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_rectangle.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/aabb.hlsl") #sampling +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/basic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/linear.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/bilinear.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quantized_sequence.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/concentric_mapping.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform_spheres.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/box_muller_transform.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_triangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_triangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED 
"hlsl/sampling/spherical_rectangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ndarray_addressing.hlsl") # @@ -355,7 +365,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/output_structs.hlsl") #blur LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/blur.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/box_sampler.hlsl") -#morton codes -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/morton.hlsl") +#rwmc +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/Resolve.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/CascadeAccumulator.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/SplattingParameters.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/ResolveParameters.hlsl") ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index e0bcd223f6..6271e912b9 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -54,6 +54,16 @@ if(NBL_BUILD_TEXT_RENDERING) add_subdirectory(TextRendering) endif() -propagate_changed_variables_to_parent_scope() +if(NBL_BUILD_DEBUG_DRAW) + add_subdirectory(DebugDraw) + set(NBL_EXT_DEBUG_DRAW_INCLUDE_DIRS + ${NBL_EXT_DEBUG_DRAW_INCLUDE_DIRS} + PARENT_SCOPE + ) + set(NBL_EXT_DEBUG_DRAW_LIB + ${NBL_EXT_DEBUG_DRAW_LIB} + PARENT_SCOPE + ) +endif() -NBL_ADJUST_FOLDERS(ext) \ No newline at end of file +propagate_changed_variables_to_parent_scope() \ No newline at end of file diff --git a/src/nbl/ext/DebugDraw/CDrawAABB.cpp b/src/nbl/ext/DebugDraw/CDrawAABB.cpp new file mode 100644 index 0000000000..ca82da688a --- /dev/null +++ b/src/nbl/ext/DebugDraw/CDrawAABB.cpp @@ -0,0 +1,370 @@ +// Copyright (C) 
2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/ext/DebugDraw/CDrawAABB.h" + +#ifdef NBL_EMBED_BUILTIN_RESOURCES +#include "nbl/ext/debug_draw/builtin/build/CArchive.h" +#endif + +#include "nbl/ext/DebugDraw/builtin/build/spirv/keys.hpp" + +using namespace nbl; +using namespace core; +using namespace video; +using namespace system; +using namespace asset; +using namespace hlsl; + +namespace nbl::ext::debug_draw +{ + +core::smart_refctd_ptr DrawAABB::create(SCreationParameters&& params) +{ + auto* const logger = params.utilities->getLogger(); + + if (!params.validate()) + { + logger->log("Failed creation parameters validation!", ILogger::ELL_ERROR); + return nullptr; + } + + ConstructorParams constructorParams; + + if (params.drawMode & ADM_DRAW_SINGLE) + { + auto pipelineLayout = params.singlePipelineLayout; + if (!pipelineLayout) + pipelineLayout = createDefaultPipelineLayout(params.utilities->getLogicalDevice(), ADM_DRAW_SINGLE); + constructorParams.singlePipeline = createPipeline(params, pipelineLayout.get(), ADM_DRAW_SINGLE); + if (!constructorParams.singlePipeline) + { + logger->log("Failed to create pipeline!", ILogger::ELL_ERROR); + return nullptr; + } + } + + if (params.drawMode & ADM_DRAW_BATCH) + { + auto pipelineLayout = params.batchPipelineLayout; + if (!pipelineLayout) + pipelineLayout = createDefaultPipelineLayout(params.utilities->getLogicalDevice(), ADM_DRAW_BATCH); + constructorParams.batchPipeline = createPipeline(params, pipelineLayout.get(), ADM_DRAW_BATCH); + if (!constructorParams.batchPipeline) + { + logger->log("Failed to create pipeline!", ILogger::ELL_ERROR); + return nullptr; + } + } + + if (!createStreamingBuffer(params)) + { + logger->log("Failed to create streaming buffer!", ILogger::ELL_ERROR); + return nullptr; + } + + constructorParams.indicesBuffer = createIndicesBuffer(params); + if 
(!constructorParams.indicesBuffer) + { + logger->log("Failed to create indices buffer!", ILogger::ELL_ERROR); + return nullptr; + } + + constructorParams.creationParams = std::move(params); + return core::smart_refctd_ptr(new DrawAABB(std::move(constructorParams))); +} + +// extension data mount alias +constexpr std::string_view NBL_EXT_MOUNT_ENTRY = "nbl/ext/DebugDraw"; + +const smart_refctd_ptr DrawAABB::mount(smart_refctd_ptr logger, ISystem* system, video::ILogicalDevice* device, const std::string_view archiveAlias) +{ + assert(system); + + if (!system) + return nullptr; + + // the key is deterministic, we are validating presence of required .spv + const auto composed = path(archiveAlias.data()) / nbl::ext::debug_draw::builtin::build::get_spirv_key<"draw_aabb">(device); + if (system->exists(composed, {})) + return nullptr; + + // extension should mount everything for you, regardless if content goes from virtual filesystem + // or disk directly - and you should never rely on application framework to expose extension data + #ifdef NBL_EMBED_BUILTIN_RESOURCES + auto archive = make_smart_refctd_ptr(smart_refctd_ptr(logger)); + #else + auto archive = make_smart_refctd_ptr(std::string_view(NBL_DEBUG_DRAW_HLSL_MOUNT_POINT), smart_refctd_ptr(logger), system); + #endif + + system->mount(smart_refctd_ptr(archive), archiveAlias.data()); + return smart_refctd_ptr(archive); +} + +smart_refctd_ptr DrawAABB::createPipeline(SCreationParameters& params, const IGPUPipelineLayout* pipelineLayout, DrawMode mode) +{ + system::logger_opt_ptr logger = params.utilities->getLogger(); + auto system = smart_refctd_ptr(params.assetManager->getSystem()); + auto* device = params.utilities->getLogicalDevice(); + mount(smart_refctd_ptr(params.utilities->getLogger()), system.get(), params.utilities->getLogicalDevice(), NBL_EXT_MOUNT_ENTRY); + + auto getShader = [&](const core::string& key)->smart_refctd_ptr { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = 
params.utilities->getLogger(); + lp.workingDirectory = NBL_EXT_MOUNT_ENTRY; + auto bundle = params.assetManager->getAsset(key.c_str(), lp); + + const auto contents = bundle.getContents(); + + if (contents.empty()) + { + logger.log("Failed to load shader %s from disk", ILogger::ELL_ERROR, key.c_str()); + return nullptr; + } + + if (bundle.getAssetType() != IAsset::ET_SHADER) + { + logger.log("Loaded asset has wrong type!", ILogger::ELL_ERROR); + return nullptr; + } + + return IAsset::castDown(contents[0]); + }; + + const auto key = nbl::ext::debug_draw::builtin::build::get_spirv_key<"draw_aabb">(device); + smart_refctd_ptr unifiedShader = getShader(key); + if (!unifiedShader) + { + params.utilities->getLogger()->log("Could not compile shaders!", ILogger::ELL_ERROR); + return nullptr; + } + + video::IGPUGraphicsPipeline::SCreationParams pipelineParams[1] = {}; + pipelineParams[0].layout = pipelineLayout; + pipelineParams[0].vertexShader = { .shader = unifiedShader.get(), .entryPoint = (mode & ADM_DRAW_SINGLE) ? 
"aabb_vertex_single" : "aabb_vertex_instances" }; + pipelineParams[0].fragmentShader = { .shader = unifiedShader.get(), .entryPoint = "aabb_fragment" }; + pipelineParams[0].cached = { + .primitiveAssembly = { + .primitiveType = asset::E_PRIMITIVE_TOPOLOGY::EPT_LINE_LIST, + } + }; + pipelineParams[0].renderpass = params.renderpass.get(); + + smart_refctd_ptr pipeline; + params.utilities->getLogicalDevice()->createGraphicsPipelines(nullptr, pipelineParams, &pipeline); + if (!pipeline) + { + params.utilities->getLogger()->log("Could not create streaming pipeline!", ILogger::ELL_ERROR); + return nullptr; + } + + return pipeline; +} + +bool DrawAABB::createStreamingBuffer(SCreationParameters& params) +{ + const uint32_t minStreamingBufferAllocationSize = 128u, maxStreamingBufferAllocationAlignment = 4096u, mdiBufferDefaultSize = /* 2MB */ 1024u * 1024u * 2u; + + auto getRequiredAccessFlags = [&](const bitflag& properties) + { + bitflag flags(IDeviceMemoryAllocation::EMCAF_NO_MAPPING_ACCESS); + + if (properties.hasFlags(IDeviceMemoryAllocation::EMPF_HOST_WRITABLE_BIT)) + flags |= IDeviceMemoryAllocation::EMCAF_WRITE; + + return flags; + }; + + if (!params.streamingBuffer) + { + IGPUBuffer::SCreationParams mdiCreationParams = {}; + mdiCreationParams.usage = SCachedCreationParameters::RequiredUsageFlags; + mdiCreationParams.size = mdiBufferDefaultSize; + + auto buffer = params.utilities->getLogicalDevice()->createBuffer(std::move(mdiCreationParams)); + buffer->setObjectDebugName("AABB Streaming Buffer"); + + auto memoryReqs = buffer->getMemoryReqs(); + memoryReqs.memoryTypeBits &= params.utilities->getLogicalDevice()->getPhysicalDevice()->getUpStreamingMemoryTypeBits(); + + auto allocation = params.utilities->getLogicalDevice()->allocate(memoryReqs, buffer.get(), SCachedCreationParameters::RequiredAllocateFlags); + { + const bool allocated = allocation.isValid(); + assert(allocated); + } + auto memory = allocation.memory; + + if (!memory->map({ 0ull, memoryReqs.size }, 
getRequiredAccessFlags(memory->getMemoryPropertyFlags()))) + params.utilities->getLogger()->log("Could not map device memory!", ILogger::ELL_ERROR); + + params.streamingBuffer = make_smart_refctd_ptr(SBufferRange{0ull, mdiCreationParams.size, std::move(buffer)}, maxStreamingBufferAllocationAlignment, minStreamingBufferAllocationSize); + } + + auto buffer = params.streamingBuffer->getBuffer(); + auto binding = buffer->getBoundMemory(); + + const auto validation = std::to_array + ({ + std::make_pair(buffer->getCreationParams().usage.hasFlags(SCachedCreationParameters::RequiredUsageFlags), "Streaming buffer must be created with IBuffer::EUF_STORAGE_BUFFER_BIT | IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT enabled!"), + std::make_pair(binding.memory->getAllocateFlags().hasFlags(SCachedCreationParameters::RequiredAllocateFlags), "Streaming buffer's memory must be allocated with IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT enabled!"), + std::make_pair(binding.memory->isCurrentlyMapped(), "Streaming buffer's memory must be mapped!"), // streaming buffer contructor already validates it, but cannot assume user won't unmap its own buffer for some reason (sorry if you have just hit it) + std::make_pair(binding.memory->getCurrentMappingAccess().hasFlags(getRequiredAccessFlags(binding.memory->getMemoryPropertyFlags())), "Streaming buffer's memory current mapping access flags don't meet requirements!") + }); + + for (const auto& [ok, error] : validation) + if (!ok) + { + params.utilities->getLogger()->log(error, ILogger::ELL_ERROR); + return false; + } + + return true; +} + +smart_refctd_ptr DrawAABB::createIndicesBuffer(SCreationParameters& params) +{ + std::array unitAABBIndices; + unitAABBIndices[0] = 0; + unitAABBIndices[1] = 1; + unitAABBIndices[2] = 0; + unitAABBIndices[3] = 2; + + unitAABBIndices[4] = 3; + unitAABBIndices[5] = 1; + unitAABBIndices[6] = 3; + unitAABBIndices[7] = 2; + + unitAABBIndices[8] = 4; + unitAABBIndices[9] = 5; + unitAABBIndices[10] = 4; + 
unitAABBIndices[11] = 6; + + unitAABBIndices[12] = 7; + unitAABBIndices[13] = 5; + unitAABBIndices[14] = 7; + unitAABBIndices[15] = 6; + + unitAABBIndices[16] = 0; + unitAABBIndices[17] = 4; + unitAABBIndices[18] = 1; + unitAABBIndices[19] = 5; + + unitAABBIndices[20] = 2; + unitAABBIndices[21] = 6; + unitAABBIndices[22] = 3; + unitAABBIndices[23] = 7; + + auto* device = params.utilities->getLogicalDevice(); + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = device->createCommandPool(params.transfer->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf, 1 })) + { + params.utilities->getLogger()->log("Failed to create Command Buffer for index buffer!\n"); + return nullptr; + } + } + + IGPUBuffer::SCreationParams bufparams; + bufparams.size = sizeof(uint32_t) * unitAABBIndices.size(); + bufparams.usage = IGPUBuffer::EUF_INDEX_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + smart_refctd_ptr indicesBuffer; + { + indicesBuffer = device->createBuffer(std::move(bufparams)); + if (!indicesBuffer) + { + params.utilities->getLogger()->log("Failed to create index buffer!\n"); + return nullptr; + } + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = indicesBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + + auto bufMem = device->allocate(reqs, indicesBuffer.get()); + if (!bufMem.isValid()) + { + params.utilities->getLogger()->log("Failed to allocate device memory compatible with index buffer!\n"); + return nullptr; + } + } + + { + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->beginDebugMarker("Fill indices buffer begin"); + + SBufferRange bufRange = { .offset = 0, .size = indicesBuffer->getSize(), .buffer = indicesBuffer }; + cmdbuf->updateBuffer(bufRange, unitAABBIndices.data()); + + cmdbuf->endDebugMarker(); + 
cmdbuf->end(); + } + + smart_refctd_ptr idxBufProgress; + constexpr auto FinishedValue = 25; + { + constexpr auto StartedValue = 0; + idxBufProgress = device->createSemaphore(StartedValue); + + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = cmdbuf.get()} }; + submitInfos[0].commandBuffers = cmdbufs; + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = idxBufProgress.get(),.value = FinishedValue,.stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS} }; + submitInfos[0].signalSemaphores = signals; + + params.transfer->submit(submitInfos); + } + + const ISemaphore::SWaitInfo waitInfos[] = { { + .semaphore = idxBufProgress.get(), + .value = FinishedValue + } }; + device->blockForSemaphores(waitInfos); + + return indicesBuffer; +} + +core::smart_refctd_ptr DrawAABB::createPipelineLayoutFromPCRange(video::ILogicalDevice* device, const asset::SPushConstantRange& pcRange) +{ + return device->createPipelineLayout({ &pcRange , 1 }, nullptr, nullptr, nullptr, nullptr); +} + +core::smart_refctd_ptr DrawAABB::createDefaultPipelineLayout(video::ILogicalDevice* device, DrawMode mode) +{ + const uint32_t offset = (mode & ADM_DRAW_BATCH) ? offsetof(ext::debug_draw::PushConstants, ipc) : offsetof(ext::debug_draw::PushConstants, spc); + const uint32_t pcSize = (mode & ADM_DRAW_BATCH) ? 
sizeof(SInstancedPC) : sizeof(SSinglePC); + SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_VERTEX, + .offset = offset, + .size = pcSize + }; + return createPipelineLayoutFromPCRange(device, pcRange); +} + +bool DrawAABB::renderSingle(const DrawParameters& params, const hlsl::shapes::AABB<3, float>& aabb, const hlsl::float32_t4& color) +{ + if (!(m_cachedCreationParams.drawMode & ADM_DRAW_SINGLE)) + { + m_cachedCreationParams.utilities->getLogger()->log("DrawAABB has not been enabled for draw single!", ILogger::ELL_ERROR); + return false; + } + + auto& commandBuffer = params.commandBuffer; + commandBuffer->bindGraphicsPipeline(m_singlePipeline.get()); + commandBuffer->setLineWidth(params.lineWidth); + asset::SBufferBinding indexBinding = { .offset = 0, .buffer = m_indicesBuffer }; + commandBuffer->bindIndexBuffer(indexBinding, asset::EIT_32BIT); + + SSinglePC pc; + hlsl::float32_t3x4 instanceTransform = getTransformFromAABB(aabb); + pc.instance.transform = math::linalg::promoted_mul(params.cameraMat, instanceTransform); + pc.instance.color = color; + + commandBuffer->pushConstants(m_singlePipeline->getLayout(), ESS_VERTEX, offsetof(ext::debug_draw::PushConstants, spc), sizeof(SSinglePC), &pc); + commandBuffer->drawIndexed(IndicesCount, 1, 0, 0, 0); + + return true; +} + +} diff --git a/src/nbl/ext/DebugDraw/CMakeLists.txt b/src/nbl/ext/DebugDraw/CMakeLists.txt new file mode 100644 index 0000000000..dfa4a7624f --- /dev/null +++ b/src/nbl/ext/DebugDraw/CMakeLists.txt @@ -0,0 +1,73 @@ +include(${NBL_ROOT_PATH}/cmake/common.cmake) + +set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include") + +set(NBL_EXT_DEBUG_DRAW_H + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/nbl/ext/DebugDraw/CDrawAABB.h +) + +set(NBL_EXT_DEBUG_DRAW_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/CDrawAABB.cpp" +) + +nbl_create_ext_library_project( + DEBUG_DRAW + "${NBL_EXT_DEBUG_DRAW_H}" + "${NBL_EXT_DEBUG_DRAW_SRC}" + "${NBL_EXT_DEBUG_DRAW_EXTERNAL_INCLUDE}" + "" + "" +) + 
+get_filename_component(_ARCHIVE_ABSOLUTE_ENTRY_PATH_ "${NBL_EXT_INTERNAL_INCLUDE_DIR}" ABSOLUTE) + +set(NBL_DEBUG_DRAW_HLSL_MOUNT_POINT "${_ARCHIVE_ABSOLUTE_ENTRY_PATH_}/nbl/ext/DebugDraw/builtin/hlsl") +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + ${NBL_DEBUG_DRAW_HLSL_MOUNT_POINT}/common.hlsl + ${NBL_DEBUG_DRAW_HLSL_MOUNT_POINT}/draw_aabb.unified.hlsl +) +target_sources(${LIB_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "${NBL_DEBUG_DRAW_HLSL_MOUNT_POINT}/draw_aabb.unified.hlsl", + "KEY": "draw_aabb", + } + +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +set(COMPILE_OPTIONS + -I "${NBL_ROOT_PATH}/include" # a workaround due to debug draw ext common header which is not part of Nabla builtin archive + -I "${CMAKE_CURRENT_SOURCE_DIR}" + -T lib_${SM} +) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${LIB_NAME}SPIRV + LINK_TO ${LIB_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_DEBUG_DRAW_HLSL_MOUNT_POINT + COMMON_OPTIONS ${COMPILE_OPTIONS} + OUTPUT_VAR KEYS + INCLUDE nbl/ext/DebugDraw/builtin/build/spirv/keys.hpp + NAMESPACE nbl::ext::debug_draw::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::ext::debug_draw::builtin::build + TARGET ${LIB_NAME}_builtinsBuild + LINK_TO ${LIB_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) + + +add_library(Nabla::ext::DebugDraw ALIAS ${LIB_NAME}) diff --git a/src/nbl/system/CColoredStdoutLoggerWin32.cpp b/src/nbl/system/CColoredStdoutLoggerWin32.cpp index e664ae84bc..f2690a81b4 100644 --- a/src/nbl/system/CColoredStdoutLoggerWin32.cpp +++ b/src/nbl/system/CColoredStdoutLoggerWin32.cpp @@ -15,7 +15,7 @@ CColoredStdoutLoggerWin32::CColoredStdoutLoggerWin32(core::bitflag void CColoredStdoutLoggerWin32::threadsafeLog_impl(const std::string_view& fmt, E_LOG_LEVEL logLevel, va_list args) { SetConsoleTextAttribute(m_native_console, 
getConsoleColor(logLevel)); - printf(constructLogString(fmt, logLevel, args).data()); + printf("%s", constructLogString(fmt, logLevel, args).data()); fflush(stdout); SetConsoleTextAttribute(m_native_console, 15); // restore to white } diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index a55c3a1e7b..59d056bfdf 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -406,15 +406,20 @@ bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const asset: return true; } +bool CVulkanCommandBuffer::bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) +{ + getFunctionTable().vkCmdBindPipeline(m_cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, static_cast(pipeline)->getInternalObject()); + return true; +} + bool CVulkanCommandBuffer::bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) { getFunctionTable().vkCmdBindPipeline(m_cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, static_cast(pipeline)->getInternalObject()); return true; } -bool CVulkanCommandBuffer::bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) -{ - getFunctionTable().vkCmdBindPipeline(m_cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, static_cast(pipeline)->getInternalObject()); +bool CVulkanCommandBuffer::bindMeshPipeline_impl(const IGPUMeshPipeline* const pipeline) { + getFunctionTable().vkCmdBindPipeline(m_cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, static_cast(pipeline)->getInternalObject()); return true; } @@ -635,10 +640,23 @@ bool CVulkanCommandBuffer::dispatch_impl(const uint32_t groupCountX, const uint3 bool CVulkanCommandBuffer::dispatchIndirect_impl(const asset::SBufferBinding& binding) { - getFunctionTable().vkCmdDispatchIndirect(m_cmdbuf,static_cast(binding.buffer.get())->getInternalObject(),binding.offset); + getFunctionTable().vkCmdDispatchIndirect(m_cmdbuf, static_cast(binding.buffer.get())->getInternalObject(), binding.offset); return true; } +bool 
CVulkanCommandBuffer::drawMeshTasks_impl(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) +{ + getFunctionTable().vkCmdDrawMeshTasksEXT(m_cmdbuf, groupCountX, groupCountY, groupCountZ); + return true; +} + +bool CVulkanCommandBuffer::drawMeshTasksIndirect_impl(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) +{ + getFunctionTable().vkCmdDrawMeshTasksIndirectEXT(m_cmdbuf, static_cast(binding.buffer.get())->getInternalObject(), binding.offset, drawCount, stride); + return true; +} + + bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info, const SUBPASS_CONTENTS contents) { diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 9383585b23..ba3925ffe2 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -181,8 +181,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst); bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst); - bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) override; + bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; + bool bindMeshPipeline_impl(const IGPUMeshPipeline* const pipeline) override; bool bindRayTracingPipeline_impl(const IGPURayTracingPipeline* const pipeline) override; bool bindDescriptorSets_impl(const asset::E_PIPELINE_BIND_POINT pipelineBindPoint, const IGPUPipelineLayout* const layout, const uint32_t firstSet, const uint32_t descriptorSetCount, const IGPUDescriptorSet* const* const pDescriptorSets, const uint32_t dynamicOffsetCount = 0u, const uint32_t* const dynamicOffsets = nullptr) override; bool pushConstants_impl(const 
IGPUPipelineLayout* const layout, const core::bitflag stageFlags, const uint32_t offset, const uint32_t size, const void* const pValues) override; @@ -209,6 +210,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer bool dispatch_impl(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) override; bool dispatchIndirect_impl(const asset::SBufferBinding& binding) override; + bool drawMeshTasks_impl(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) override; + bool drawMeshTasksIndirect_impl(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) override; + bool beginRenderPass_impl(const SRenderpassBeginInfo& info, SUBPASS_CONTENTS contents) override; bool nextSubpass_impl(const SUBPASS_CONTENTS contents) override; bool endRenderPass_impl() override; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 5390b4c3fa..cd24704df8 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1121,7 +1121,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( if (requireFullSubgroups) { - assert(stage==hlsl::ShaderStage::ESS_COMPUTE/*TODO: Or Mesh Or Task*/); + assert(stage==hlsl::ShaderStage::ESS_COMPUTE || stage == hlsl::ShaderStage::ESS_MESH || stage == hlsl::ShaderStage::ESS_TASK); retval.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT; } } @@ -1176,12 +1176,10 @@ void CVulkanLogicalDevice::createComputePipelines_impl( for (const auto& info : createInfos) { initPipelineCreateInfo(outCreateInfo,info); - const auto& spec = info.shader; - outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stage = getVkShaderStageCreateInfoFrom(info.shader, 
hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); outCreateInfo++; } auto vk_pipelines = reinterpret_cast(output); - std::stringstream debugNameBuilder; if (m_devf.vk.vkCreateComputePipelines(m_vkdev,vk_pipelineCache,vk_createInfos.size(),vk_createInfos.data(),nullptr,vk_pipelines)==VK_SUCCESS) { for (size_t i=0ull; i( info,vk_pipeline ); + std::stringstream debugNameBuilder; debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << hlsl::ShaderStage::ESS_COMPUTE << ")\n"; @@ -1202,26 +1201,136 @@ void CVulkanLogicalDevice::createComputePipelines_impl( std::fill_n(output,vk_createInfos.size(),nullptr); } -void CVulkanLogicalDevice::createGraphicsPipelines_impl( - IGPUPipelineCache* const pipelineCache, - const std::span createInfos, - core::smart_refctd_ptr* const output, - const SSpecializationValidationResult& validation -) -{ - auto getVkStencilOpStateFrom = [](const asset::SStencilOpParams& params)->VkStencilOpState - { - return { - .failOp = static_cast(params.failOp), - .passOp = static_cast(params.passOp), - .depthFailOp = static_cast(params.depthFailOp), - .compareOp = static_cast(params.compareOp) - }; +void PopulateViewport(VkPipelineViewportStateCreateInfo& outViewport, nbl::asset::SRasterizationParams const& raster){ + outViewport.viewportCount = raster.viewportCount; + // must be identical to viewport count unless VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT or VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT are used + outViewport.scissorCount = raster.viewportCount; +} + + +void PopulateRaster(VkPipelineRasterizationStateCreateInfo& outRaster, nbl::asset::SRasterizationParams const& raster){ + outRaster.depthClampEnable = raster.depthClampEnable; + outRaster.rasterizerDiscardEnable = raster.rasterizerDiscard; + outRaster.polygonMode = 
static_cast(raster.polygonMode); + outRaster.cullMode = static_cast(raster.faceCullingMode); + outRaster.frontFace = raster.frontFaceIsCCW ? VK_FRONT_FACE_COUNTER_CLOCKWISE:VK_FRONT_FACE_CLOCKWISE; + outRaster.depthBiasEnable = raster.depthBiasEnable; +} + +void PopulateMultisample(VkPipelineMultisampleStateCreateInfo& outMultisample, nbl::asset::SRasterizationParams const& raster){ + outMultisample.rasterizationSamples = static_cast(0x1<0) { + outMultisample.sampleShadingEnable = true; + outMultisample.minSampleShading = float(raster.minSampleShadingUnorm)/255.f; + } + else { + outMultisample.sampleShadingEnable = false; + outMultisample.minSampleShading = 0.f; + } + outMultisample.pSampleMask = raster.sampleMask; + outMultisample.alphaToCoverageEnable = raster.alphaToCoverageEnable; + outMultisample.alphaToOneEnable = raster.alphaToOneEnable; +} +VkStencilOpState getVkStencilOpStateFrom(const asset::SStencilOpParams& params){ + return { + .failOp = static_cast(params.failOp), + .passOp = static_cast(params.passOp), + .depthFailOp = static_cast(params.depthFailOp), + .compareOp = static_cast(params.compareOp) }; +} - const auto& features = getEnabledFeatures(); +void PopulateDepthStencil(VkPipelineDepthStencilStateCreateInfo& outDepthStencil, nbl::asset::SRasterizationParams const& raster){ + outDepthStencil.depthTestEnable = raster.depthTestEnable(); + outDepthStencil.depthWriteEnable = raster.depthWriteEnable; + outDepthStencil.depthCompareOp = static_cast(raster.depthCompareOp); + outDepthStencil.depthBoundsTestEnable = raster.depthBoundsTestEnable; + outDepthStencil.stencilTestEnable = raster.stencilTestEnable(); + outDepthStencil.front = getVkStencilOpStateFrom(raster.frontStencilOps); + outDepthStencil.back = getVkStencilOpStateFrom(raster.backStencilOps); +} + +void PopulateColorBlend( + VkPipelineColorBlendStateCreateInfo& outColorBlend, + VkPipelineColorBlendAttachmentState*& outColorBlendAttachmentState, + nbl::asset::SBlendParams const& blend, + 
nbl::asset::IRenderpass::SCreationParams::SSubpassDescription const& subpass +) { + //outColorBlend->flags no attachment order access yet + outColorBlend.logicOpEnable = blend.logicOp!=asset::ELO_NO_OP; + outColorBlend.logicOp = getVkLogicOpFromLogicOp(blend.logicOp); + outColorBlend.pAttachments = outColorBlendAttachmentState; + for (auto i=0; iblendEnable = params.blendEnabled(); + outColorBlendAttachmentState->srcColorBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.srcColorFactor)); + outColorBlendAttachmentState->dstColorBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.dstColorFactor)); + outColorBlendAttachmentState->colorBlendOp = getVkBlendOpFromBlendOp(static_cast(params.colorBlendOp)); + outColorBlendAttachmentState->srcAlphaBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.srcAlphaFactor)); + outColorBlendAttachmentState->dstAlphaBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.dstAlphaFactor)); + outColorBlendAttachmentState->alphaBlendOp = getVkBlendOpFromBlendOp(static_cast(params.alphaBlendOp)); + outColorBlendAttachmentState->colorWriteMask = getVkColorComponentFlagsFromColorWriteMask(params.colorWriteMask); + outColorBlendAttachmentState++; + //^that pointer iterator is how we ensure the attachments or consecutive + } + } + outColorBlend.attachmentCount = std::distance(outColorBlend.pAttachments,outColorBlendAttachmentState); +} + +template +void PopulateMeshGraphicsCommonData( + const std::span createInfos, + core::vector& vk_createInfos, + + core::vector& vk_viewportStates, + core::vector& vk_rasterizationStates, + core::vector& vk_multisampleStates, + core::vector& vk_depthStencilStates, + core::vector& vk_colorBlendStates, + core::vector& vk_colorBlendAttachmentStates, + + core::vector& vk_dynamicStates, + const VkPipelineDynamicStateCreateInfo& vk_dynamicStateCreateInfo +){ + //the main concern is lifetime, so don't want to construct, move, or copy anything in here - core::vector 
vk_dynamicStates = { + auto outColorBlendAttachmentState = vk_colorBlendAttachmentStates.data(); //the pointer iterator is used + + + for (uint32_t i = 0; i < createInfos.size(); i++){ //whats the maximum number of pipelines that can be created at once? uint32_t to be safe + auto& info = createInfos[i]; + const auto& blend = info.cached.blend; + const auto& raster = info.cached.rasterization; + const auto& subpass = info.renderpass->getCreationParameters().subpasses[info.cached.subpassIx]; + + initPipelineCreateInfo(&vk_createInfos[i], info); + + PopulateViewport(vk_viewportStates[i], raster); + PopulateRaster(vk_rasterizationStates[i], raster); + PopulateMultisample(vk_multisampleStates[i], raster); + PopulateDepthStencil(vk_depthStencilStates[i], raster); + PopulateColorBlend(vk_colorBlendStates[i], outColorBlendAttachmentState, blend, subpass); + //PopulateDynamicState(dynState, ?) + + + vk_createInfos[i].pViewportState = &vk_viewportStates[i]; + vk_createInfos[i].pRasterizationState = &vk_rasterizationStates[i]; + vk_createInfos[i].pMultisampleState = &vk_multisampleStates[i]; + vk_createInfos[i].pDepthStencilState = &vk_depthStencilStates[i]; + vk_createInfos[i].pColorBlendState = &vk_colorBlendStates[i]; + vk_createInfos[i].pDynamicState = &vk_dynamicStateCreateInfo; + vk_createInfos[i].renderPass = static_cast(info.renderpass)->getInternalObject(); + vk_createInfos[i].subpass = info.cached.subpassIx; + //handle + //index + //layout? 
+ // ^ handled in initPipelineCreateInfo + } +} + +core::vector getDefaultDynamicStates(SPhysicalDeviceFeatures const& features){ + core::vector ret = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_LINE_WIDTH, @@ -1231,17 +1340,162 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE }; - if (features.depthBounds) - vk_dynamicStates.push_back(VK_DYNAMIC_STATE_DEPTH_BOUNDS); + if (features.depthBounds){ + ret.push_back(VK_DYNAMIC_STATE_DEPTH_BOUNDS); + } // TODO: VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT, VK_DYNAMIC_STATE_DISCARD_RECTANGLE_ENABLE_EXT, VK_DYNAMIC_STATE_DISCARD_RECTANGLE_MODE_EXT - - const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { + + return ret; +} + +//maximum cleanliness,i tried it and im not a big fan +//struct CommonPipelineStruct { +// VkPipelineRasterizationStateCreateInfo vk_rasterizationStates{ VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,nullptr,0 }; +// VkPipelineMultisampleStateCreateInfo vk_multisampleStates{ VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,nullptr,0 }; +// VkPipelineDepthStencilStateCreateInfo vk_depthStencilStates{ VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,nullptr,0 }; +// VkPipelineColorBlendStateCreateInfo vk_colorBlendStates{ VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,nullptr,0 }; +// core::vector vk_colorBlendAttachmentStates{ IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments }; +//}; + + +void CVulkanLogicalDevice::createMeshPipelines_impl( + IGPUPipelineCache* const pipelineCache, + const std::span createInfos, + core::smart_refctd_ptr* const output, + const SSpecializationValidationResult& validation +) { + const auto& features = getEnabledFeatures(); + + const VkPipelineCache vk_pipelineCache = pipelineCache ? 
static_cast(pipelineCache)->getInternalObject() : VK_NULL_HANDLE; + + core::vector vk_createInfos(createInfos.size(), { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,nullptr }); + + core::vector vk_rasterizationStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_multisampleStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_depthStencilStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_colorBlendStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_colorBlendAttachmentStates(createInfos.size() * IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments); + + core::vector vk_dynamicStates = getDefaultDynamicStates(features); + + const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0u, .dynamicStateCount = static_cast(vk_dynamicStates.size()), .pDynamicStates = vk_dynamicStates.data() }; + core::vector vk_viewportStates(createInfos.size(), { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, // the extensions that interest us have a dynamic state variant anyway + .flags = 0, // must be 0 + .viewportCount = 0, + .pViewports = nullptr, + .scissorCount = 0, + .pScissors = nullptr, + }); + + PopulateMeshGraphicsCommonData( + createInfos, vk_createInfos, + + vk_viewportStates, + vk_rasterizationStates, + vk_multisampleStates, + vk_depthStencilStates, + vk_colorBlendStates, + vk_colorBlendAttachmentStates, + + vk_dynamicStates, vk_dynamicStateCreateInfo + ); + + //not used in mesh pipelines + for (auto& outCreateInfo : vk_createInfos) { + outCreateInfo.pVertexInputState = nullptr; + outCreateInfo.pInputAssemblyState = nullptr; + 
outCreateInfo.pTessellationState = nullptr; + } + auto outCreateInfo = vk_createInfos.data(); + + const auto maxShaderStages = createInfos.size() * IGPUMeshPipeline::MESH_SHADER_STAGE_COUNT; + core::vector vk_shaderStage(maxShaderStages, { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,nullptr }); + core::vector vk_shaderModule(maxShaderStages, { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,nullptr, 0 }); + core::vector entryPoints(maxShaderStages); + core::vector vk_requiredSubgroupSize(maxShaderStages, { + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO,nullptr}); + core::vector vk_specializationInfos(maxShaderStages, { 0,nullptr,0,nullptr }); + core::vector vk_specializationMapEntry(validation.count); + core::vector specializationData(validation.dataSize); + auto outShaderStage = vk_shaderStage.data(); + auto outEntryPoints = entryPoints.data(); + auto outShaderModule = vk_shaderModule.data(); + auto outRequiredSubgroupSize = vk_requiredSubgroupSize.data(); + auto outSpecInfo = vk_specializationInfos.data(); + auto outSpecMapEntry = vk_specializationMapEntry.data(); + auto outSpecData = specializationData.data(); + + //shader + for (const auto& info : createInfos) + { + outCreateInfo->pStages = outShaderStage; + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) + { + if (spec.shader) + { + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, + shaderStage, + false, + outShaderModule, + outEntryPoints, + outRequiredSubgroupSize, + outSpecInfo, + outSpecMapEntry, + outSpecData + ); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); + } + }; + processSpecShader(info.taskShader, hlsl::ShaderStage::ESS_TASK); + processSpecShader(info.meshShader, hlsl::ShaderStage::ESS_MESH); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + + outCreateInfo++; + } + + auto vk_pipelines = reinterpret_cast(output); + 
std::stringstream debugNameBuilder; + if (m_devf.vk.vkCreateGraphicsPipelines(m_vkdev, vk_pipelineCache, vk_createInfos.size(), vk_createInfos.data(), nullptr, vk_pipelines) == VK_SUCCESS) + { + for (size_t i = 0ull; i < createInfos.size(); ++i) + { + const auto& createInfo = createInfos[i]; + const VkPipeline vk_pipeline = vk_pipelines[i]; + // break the lifetime cause of the aliasing + std::uninitialized_default_construct_n(output + i, 1); + output[i] = core::make_smart_refctd_ptr(createInfos[i], vk_pipeline); + debugNameBuilder.str(""); + auto buildDebugName = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader != nullptr) + debugNameBuilder << spec.shader->getFilepathHint() << "(" << spec.entryPoint << "," << stage << ")\n"; + }; + buildDebugName(createInfo.taskShader, hlsl::ESS_TASK); + buildDebugName(createInfo.meshShader, hlsl::ESS_MESH); + buildDebugName(createInfo.fragmentShader, hlsl::ESS_FRAGMENT); + output[i]->setObjectDebugName(debugNameBuilder.str().c_str()); + } + } + else + std::fill_n(output, vk_createInfos.size(), nullptr); +} + +void CVulkanLogicalDevice::createGraphicsPipelines_impl( + IGPUPipelineCache* const pipelineCache, + const std::span createInfos, + core::smart_refctd_ptr* const output, + const SSpecializationValidationResult& validation +) +{ + const auto& features = getEnabledFeatures(); const VkPipelineCache vk_pipelineCache = pipelineCache ? static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; // Interesting things to put in pNext: @@ -1252,24 +1506,26 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( // - Discard Rectangle State // - Fragment Shading Rate State Creation Info // - Piepline Robustness + + //maximum cleanliness, I create a struct that holds this for mesh and graphics? 
core::vector vk_createInfos(createInfos.size(),{VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,nullptr}); - const auto maxShaderStages = createInfos.size()*IGPUGraphicsPipeline::GRAPHICS_SHADER_STAGE_COUNT; - core::vector vk_shaderStage(maxShaderStages,{VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,nullptr}); - core::vector vk_shaderModule(maxShaderStages,{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,nullptr, 0}); - core::vector entryPoints(maxShaderStages); - core::vector vk_requiredSubgroupSize(maxShaderStages,{ - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO,nullptr - }); - core::vector vk_specializationInfos(maxShaderStages,{0,nullptr,0,nullptr}); - core::vector vk_specializationMapEntry(validation.count); - core::vector specializationData(validation.dataSize); - core::vector vk_vertexInput(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_inputBinding(createInfos.size()*asset::SVertexInputParams::MAX_ATTR_BUF_BINDING_COUNT); - core::vector vk_inputAttribute(createInfos.size()*asset::SVertexInputParams::MAX_VERTEX_ATTRIB_COUNT); - core::vector vk_inputAssembly(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_tessellation(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_viewportStates(createInfos.size(),{ + core::vector vk_rasterizationStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_multisampleStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_depthStencilStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_colorBlendStates(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,nullptr,0 }); + core::vector 
vk_colorBlendAttachmentStates(createInfos.size() * IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments); + + core::vector vk_dynamicStates = getDefaultDynamicStates(features); + + const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0u, + .dynamicStateCount = static_cast(vk_dynamicStates.size()), + .pDynamicStates = vk_dynamicStates.data() + }; + core::vector vk_viewportStates(createInfos.size(), { .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, .pNext = nullptr, // the extensions that interest us have a dynamic state variant anyway .flags = 0, // must be 0 @@ -1278,67 +1534,53 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( .scissorCount = 0, .pScissors = nullptr, }); - core::vector vk_rasterizationStates(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_multisampleStates(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_depthStencilStates(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_colorBlendStates(createInfos.size(),{VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,nullptr,0}); - core::vector vk_colorBlendAttachmentStates(createInfos.size()*IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments); + + PopulateMeshGraphicsCommonData( + createInfos, vk_createInfos, + + vk_viewportStates, + vk_rasterizationStates, + vk_multisampleStates, + vk_depthStencilStates, + vk_colorBlendStates, + vk_colorBlendAttachmentStates, + + vk_dynamicStates, vk_dynamicStateCreateInfo + ); + + + core::vector vk_inputBinding(createInfos.size() * asset::SVertexInputParams::MAX_ATTR_BUF_BINDING_COUNT); + core::vector vk_inputAttribute(createInfos.size() * asset::SVertexInputParams::MAX_VERTEX_ATTRIB_COUNT); + core::vector 
vk_inputAssembly(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_tessellation(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO,nullptr,0 }); + core::vector vk_vertexInput(createInfos.size(), { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,nullptr,0 }); auto outCreateInfo = vk_createInfos.data(); - auto outShaderStage = vk_shaderStage.data(); - auto outEntryPoints = entryPoints.data(); - auto outShaderModule = vk_shaderModule.data(); - auto outRequiredSubgroupSize = vk_requiredSubgroupSize.data(); - auto outSpecInfo = vk_specializationInfos.data(); - auto outSpecMapEntry = vk_specializationMapEntry.data(); - auto outSpecData = specializationData.data(); auto outVertexInput = vk_vertexInput.data(); auto outInputBinding = vk_inputBinding.data(); auto outInputAttribute = vk_inputAttribute.data(); - auto outInputAssembly = vk_inputAssembly.data(); auto outTessellation = vk_tessellation.data(); - auto outViewport = vk_viewportStates.data(); - auto outRaster = vk_rasterizationStates.data(); - auto outMultisample = vk_multisampleStates.data(); - auto outDepthStencil = vk_depthStencilStates.data(); - auto outColorBlend = vk_colorBlendStates.data(); - auto outColorBlendAttachmentState = vk_colorBlendAttachmentStates.data(); + auto outInputAssembly = vk_inputAssembly.data(); + //ill acknowledge this additional looping is a little ugly + //input and tess for (const auto& info : createInfos) - { - initPipelineCreateInfo(outCreateInfo,info); - outCreateInfo->pStages = outShaderStage; - auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) - { - if (spec.shader) - { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, 
outShaderStage); - } - }; - processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); - processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); - processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); - processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); - processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - - // when dealing with mesh shaders, the vertex input and assembly state will be null + { { - { - const auto& vertexInputParams = info.cached.vertexInput; - outVertexInput->pVertexBindingDescriptions = outInputBinding; - for (auto b=0u; bpVertexBindingDescriptions = outInputBinding; + for (auto b = 0u; b < asset::SVertexInputParams::MAX_ATTR_BUF_BINDING_COUNT; b++) + if (vertexInputParams.enabledBindingFlags & (1 << b)) { outInputBinding->binding = b; outInputBinding->stride = vertexInputParams.bindings[b].stride; outInputBinding->inputRate = static_cast(vertexInputParams.bindings[b].inputRate); outInputBinding++; } - outVertexInput->vertexBindingDescriptionCount = std::distance(outVertexInput->pVertexBindingDescriptions,outInputBinding); - outVertexInput->pVertexAttributeDescriptions = outInputAttribute; - for (auto l=0u; lvertexBindingDescriptionCount = std::distance(outVertexInput->pVertexBindingDescriptions, outInputBinding); + outVertexInput->pVertexAttributeDescriptions = outInputAttribute; + for (auto l = 0u; l < asset::SVertexInputParams::MAX_VERTEX_ATTRIB_COUNT; l++) + if (vertexInputParams.enabledAttribFlags & (1 << l)) { outInputAttribute->location = l; outInputAttribute->binding = vertexInputParams.attributes[l].binding; @@ -1346,16 +1588,15 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( outInputAttribute->offset = vertexInputParams.attributes[l].relativeOffset; outInputAttribute++; } - outVertexInput->vertexAttributeDescriptionCount = 
std::distance(outVertexInput->pVertexAttributeDescriptions,outInputAttribute); - } - outCreateInfo->pVertexInputState = outVertexInput++; - { - const auto& primAssParams = info.cached.primitiveAssembly; - outInputAssembly->topology = static_cast(primAssParams.primitiveType); - outInputAssembly->primitiveRestartEnable = primAssParams.primitiveRestartEnable; - } - outCreateInfo->pInputAssemblyState = outInputAssembly++; + outVertexInput->vertexAttributeDescriptionCount = std::distance(outVertexInput->pVertexAttributeDescriptions, outInputAttribute); } + outCreateInfo->pVertexInputState = outVertexInput++; + { + const auto& primAssParams = info.cached.primitiveAssembly; + outInputAssembly->topology = static_cast(primAssParams.primitiveType); + outInputAssembly->primitiveRestartEnable = primAssParams.primitiveRestartEnable; + } + outCreateInfo->pInputAssemblyState = outInputAssembly++; if (info.tesselationControlShader.shader || info.tesselationEvaluationShader.shader) { @@ -1363,79 +1604,59 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( outCreateInfo->pTessellationState = outTessellation++; } - const auto& raster = info.cached.rasterization; - { - outViewport->viewportCount = raster.viewportCount; - // must be identical to viewport count unless VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT or VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT are used - outViewport->scissorCount = raster.viewportCount; - outCreateInfo->pViewportState = outViewport++; - } - { - outRaster->depthClampEnable = raster.depthClampEnable; - outRaster->rasterizerDiscardEnable = raster.rasterizerDiscard; - outRaster->polygonMode = static_cast(raster.polygonMode); - outRaster->cullMode = static_cast(raster.faceCullingMode); - outRaster->frontFace = raster.frontFaceIsCCW ? 
VK_FRONT_FACE_COUNTER_CLOCKWISE:VK_FRONT_FACE_CLOCKWISE; - outRaster->depthBiasEnable = raster.depthBiasEnable; - outCreateInfo->pRasterizationState = outRaster++; - } - { - outMultisample->rasterizationSamples = static_cast(0x1<0) - { - outMultisample->sampleShadingEnable = true; - outMultisample->minSampleShading = float(raster.minSampleShadingUnorm)/255.f; - } - else - { - outMultisample->sampleShadingEnable = false; - outMultisample->minSampleShading = 0.f; - } - outMultisample->pSampleMask = raster.sampleMask; - outMultisample->alphaToCoverageEnable = raster.alphaToCoverageEnable; - outMultisample->alphaToOneEnable = raster.alphaToOneEnable; - outCreateInfo->pMultisampleState = outMultisample++; - } - { - //outDepthStencil->flags no attachment order access yet - outDepthStencil->depthTestEnable = raster.depthTestEnable(); - outDepthStencil->depthWriteEnable = raster.depthWriteEnable; - outDepthStencil->depthCompareOp = static_cast(raster.depthCompareOp); - outDepthStencil->depthBoundsTestEnable = raster.depthBoundsTestEnable; - outDepthStencil->stencilTestEnable = raster.stencilTestEnable(); - outDepthStencil->front = getVkStencilOpStateFrom(raster.frontStencilOps); - outDepthStencil->back = getVkStencilOpStateFrom(raster.backStencilOps); - outCreateInfo->pDepthStencilState = outDepthStencil++; - } + outCreateInfo++; + } + + const auto maxShaderStages = createInfos.size()*IGPUGraphicsPipeline::GRAPHICS_SHADER_STAGE_COUNT; + core::vector vk_shaderStage(maxShaderStages,{VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,nullptr}); + core::vector vk_shaderModule(maxShaderStages,{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,nullptr, 0}); + core::vector entryPoints(maxShaderStages); + core::vector vk_requiredSubgroupSize(maxShaderStages,{ + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO,nullptr + }); + core::vector vk_specializationInfos(maxShaderStages,{0,nullptr,0,nullptr}); + core::vector vk_specializationMapEntry(validation.count); + 
core::vector specializationData(validation.dataSize); + + outCreateInfo = vk_createInfos.data(); + auto outShaderStage = vk_shaderStage.data(); + auto outEntryPoints = entryPoints.data(); + auto outShaderModule = vk_shaderModule.data(); + auto outRequiredSubgroupSize = vk_requiredSubgroupSize.data(); + auto outSpecInfo = vk_specializationInfos.data(); + auto outSpecMapEntry = vk_specializationMapEntry.data(); + auto outSpecData = specializationData.data(); + + //shader + for (const auto& info : createInfos) + { + outCreateInfo->pStages = outShaderStage; + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) { - const auto& blend = info.cached.blend; - const auto& subpass = info.renderpass->getCreationParameters().subpasses[info.cached.subpassIx]; - //outColorBlend->flags no attachment order access yet - outColorBlend->logicOpEnable = blend.logicOp!=asset::ELO_NO_OP; - outColorBlend->logicOp = getVkLogicOpFromLogicOp(blend.logicOp); - outColorBlend->pAttachments = outColorBlendAttachmentState; - for (auto i=0; iblendEnable = params.blendEnabled(); - outColorBlendAttachmentState->srcColorBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.srcColorFactor)); - outColorBlendAttachmentState->dstColorBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.dstColorFactor)); - outColorBlendAttachmentState->colorBlendOp = getVkBlendOpFromBlendOp(static_cast(params.colorBlendOp)); - outColorBlendAttachmentState->srcAlphaBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.srcAlphaFactor)); - outColorBlendAttachmentState->dstAlphaBlendFactor = getVkBlendFactorFromBlendFactor(static_cast(params.dstAlphaFactor)); - outColorBlendAttachmentState->alphaBlendOp = getVkBlendOpFromBlendOp(static_cast(params.alphaBlendOp)); - outColorBlendAttachmentState->colorWriteMask = getVkColorComponentFlagsFromColorWriteMask(params.colorWriteMask); - outColorBlendAttachmentState++; + *(outShaderStage++) = 
getVkShaderStageCreateInfoFrom(spec, + shaderStage, + false, + outShaderModule, + outEntryPoints, + outRequiredSubgroupSize, + outSpecInfo, + outSpecMapEntry, + outSpecData + ); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); } - outColorBlend->attachmentCount = std::distance(outColorBlend->pAttachments,outColorBlendAttachmentState); - outCreateInfo->pColorBlendState = outColorBlend++; - } - outCreateInfo->pDynamicState = &vk_dynamicStateCreateInfo; - outCreateInfo->renderPass = static_cast(info.renderpass)->getInternalObject(); - outCreateInfo->subpass = info.cached.subpassIx; + }; + processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + outCreateInfo++; } + auto vk_pipelines = reinterpret_cast(output); std::stringstream debugNameBuilder; if (m_devf.vk.vkCreateGraphicsPipelines(m_vkdev,vk_pipelineCache,vk_createInfos.size(),vk_createInfos.data(),nullptr,vk_pipelines)==VK_SUCCESS) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 4cc633ec55..0d1f75918d 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -20,7 +20,6 @@ #include "nbl/video/CVulkanSampler.h" #include "nbl/video/CVulkanPipelineLayout.h" #include "nbl/video/CVulkanPipelineCache.h" -#include "nbl/video/CVulkanComputePipeline.h" #include "nbl/video/CVulkanDescriptorPool.h" #include "nbl/video/CVulkanDescriptorSet.h" #include "nbl/video/CVulkanMemoryAllocation.h" @@ -29,7 +28,10 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/CVulkanDeferredOperation.h" #include "nbl/video/CVulkanAccelerationStructure.h" 
+ #include "nbl/video/CVulkanGraphicsPipeline.h" +#include "nbl/video/CVulkanComputePipeline.h" +#include "nbl/video/CVulkanMeshPipeline.h" #include "nbl/video/CVulkanRayTracingPipeline.h" namespace nbl::video @@ -289,10 +291,16 @@ class CVulkanLogicalDevice final : public ILogicalDevice ) override; void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, - const std::span params, + const std::span createInfos, core::smart_refctd_ptr* const output, const SSpecializationValidationResult& validation ) override; + void createMeshPipelines_impl( + IGPUPipelineCache* const pipelineCache, + const std::span createInfos, + core::smart_refctd_ptr* const output, + const SSpecializationValidationResult& validation + ) override; //final? void createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, diff --git a/src/nbl/video/CVulkanMeshPipeline.cpp b/src/nbl/video/CVulkanMeshPipeline.cpp new file mode 100644 index 0000000000..5801fb075c --- /dev/null +++ b/src/nbl/video/CVulkanMeshPipeline.cpp @@ -0,0 +1,27 @@ +#include "nbl/video/CVulkanMeshPipeline.h" + +#include "nbl/video/CVulkanLogicalDevice.h" + +namespace nbl::video +{ + + CVulkanMeshPipeline::~CVulkanMeshPipeline() + { + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroyPipeline(vulkanDevice->getInternalObject(), m_vkPipeline, nullptr); + } + void CVulkanMeshPipeline::setObjectDebugName(const char* label) const + { + IBackendObject::setObjectDebugName(label); + + if (vkSetDebugUtilsObjectNameEXT == 0) return; + + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + VkDebugUtilsObjectNameInfoEXT nameInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, nullptr }; + nameInfo.objectType = VK_OBJECT_TYPE_PIPELINE; + nameInfo.objectHandle = reinterpret_cast(getInternalObject()); + nameInfo.pObjectName = getObjectDebugName(); + 
vkSetDebugUtilsObjectNameEXT(vulkanDevice->getInternalObject(), &nameInfo); + } +} \ No newline at end of file diff --git a/src/nbl/video/CVulkanMeshPipeline.h b/src/nbl/video/CVulkanMeshPipeline.h new file mode 100644 index 0000000000..3bf68d33a0 --- /dev/null +++ b/src/nbl/video/CVulkanMeshPipeline.h @@ -0,0 +1,31 @@ +#ifndef _NBL_C_VULKAN_MESH_PIPELINE_H_INCLUDED_ +#define _NBL_C_VULKAN_MESH_PIPELINE_H_INCLUDED_ + + +#include "nbl/video/IGPUMeshPipeline.h" + +#include + +namespace nbl::video +{ + +class CVulkanMeshPipeline final : public IGPUMeshPipeline +{ + public: + CVulkanMeshPipeline(const SCreationParams& params, const VkPipeline vk_pipeline) : + IGPUMeshPipeline(params), m_vkPipeline(vk_pipeline) {} + + inline const void* getNativeHandle() const override {return &m_vkPipeline;} + + inline VkPipeline getInternalObject() const {return m_vkPipeline;} + + void setObjectDebugName(const char* label) const override; //exists in compute but not in graphics + private: + ~CVulkanMeshPipeline(); + + const VkPipeline m_vkPipeline; +}; + +} + +#endif diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index da86d7c9d9..1bc03b8c48 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -732,6 +732,12 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart VkPhysicalDeviceCooperativeMatrixFeaturesKHR cooperativeMatrixFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR }; VkPhysicalDeviceMaintenance5FeaturesKHR maintenance5Features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_5_FEATURES_KHR }; VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT graphicsPipelineLibraryFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT }; + VkPhysicalDeviceMeshShaderFeaturesEXT meshShaderFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT }; + + //do we hate macros? 
+#define AddExtensionToPNextIfSupported(name, feat) if (isExtensionSupported(name)) addToPNextChain(&feat); + + AddExtensionToPNextIfSupported(VK_EXT_MESH_SHADER_EXTENSION_NAME, meshShaderFeatures); if (isExtensionSupported(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME)) addToPNextChain(&conditionalRenderingFeatures); @@ -817,6 +823,45 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart features.geometryShader = deviceFeatures.features.geometryShader; features.tessellationShader = deviceFeatures.features.tessellationShader; + + //check if features are existant first + //potentially put a copy of VkPhysicalDeviceMeshShaderFeaturesEXT directly into features + //depends on the less obvious properties + if (isExtensionSupported(VK_EXT_MESH_SHADER_EXTENSION_NAME)) { + features.taskShader = meshShaderFeatures.taskShader; + features.meshShader = meshShaderFeatures.meshShader; + //TODO + //VkBool32 multiviewMeshShader; + //VkBool32 primitiveFragmentShadingRateMeshShader; + //VkBool32 meshShaderQueries; + + //VkPhysicalDeviceMeshShaderPropertiesEXT + //#define LIMIT_INIT_MESH(limitMemberName) properties.limits.limitMemberName = meshShaderProperties.limitMemberName + //LIMIT_INIT_MESH(maxTaskWorkGroupTotalCount); + //LIMIT_INIT_MESH(maxTaskWorkGroupInvocations); + //LIMIT_INIT_MESH(maxTaskPayloadSize); + //LIMIT_INIT_MESH(maxTaskSharedMemorySize); + //LIMIT_INIT_MESH(maxTaskPayloadAndSharedMemorySize); + //LIMIT_INIT_MESH(maxMeshWorkGroupInvocations); + //LIMIT_INIT_MESH(maxMeshSharedMemorySize); + //LIMIT_INIT_MESH(maxMeshPayloadAndSharedMemorySize); + //LIMIT_INIT_MESH(maxMeshOutputMemorySize); + //LIMIT_INIT_MESH(maxMeshOutputComponents); + //LIMIT_INIT_MESH(maxMeshOutputVertices); + //LIMIT_INIT_MESH(maxMeshOutputPrimitives); + //LIMIT_INIT_MESH(maxMeshOutputLayers); + //LIMIT_INIT_MESH(maxMeshMultiviewViewCount); + //LIMIT_INIT_MESH(maxMeshOutputPerVertexGranularity); + //LIMIT_INIT_MESH(maxMeshOutputPerPrimitiveGranularity); + + //for(uint8_t i = 0; i < 3; i++){ 
+ // LIMIT_INIT_MESH(maxTaskWorkGroupCount[i]); + // LIMIT_INIT_MESH(maxTaskWorkGroupSize[i]); + // LIMIT_INIT_MESH(maxMeshWorkGroupCount[i]); + // LIMIT_INIT_MESH(maxMeshWorkGroupSize[i]); + //} + //#undef LIMIT_INIT_MESH + } if (!deviceFeatures.features.sampleRateShading || !deviceFeatures.features.dualSrcBlend) RETURN_NULL_PHYSICAL_DEVICE; @@ -1502,6 +1547,9 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic VkPhysicalDeviceRayQueryFeaturesKHR rayQueryFeatures = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR,nullptr }; REQUIRE_EXTENSION_IF(enabledFeatures.rayQuery,VK_KHR_RAY_QUERY_EXTENSION_NAME,&rayQueryFeatures); // feature dependency taken care of + VkPhysicalDeviceMeshShaderFeaturesEXT meshShaderFeatures = {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT, nullptr}; + REQUIRE_EXTENSION_IF(enabledFeatures.meshShader, VK_EXT_MESH_SHADER_EXTENSION_NAME, &meshShaderFeatures); + VkPhysicalDeviceShaderSMBuiltinsFeaturesNV shaderSMBuiltinsFeaturesNV = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SM_BUILTINS_FEATURES_NV,nullptr }; enableExtensionIfAvailable(VK_NV_SHADER_SM_BUILTINS_EXTENSION_NAME,&shaderSMBuiltinsFeaturesNV); @@ -1818,6 +1866,12 @@ core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevic rayTracingPositionFetchFeatures.rayTracingPositionFetch = limits.rayTracingPositionFetch; + meshShaderFeatures.taskShader = enabledFeatures.taskShader; + meshShaderFeatures.meshShader = enabledFeatures.meshShader; + meshShaderFeatures.primitiveFragmentShadingRateMeshShader = VK_FALSE;//needs to be explicitly set? 
+ meshShaderFeatures.meshShaderQueries = VK_FALSE; + meshShaderFeatures.multiviewMeshShader = VK_FALSE; + //shaderSMBuiltinsFeaturesNV [LIMIT SO ENABLE EVERYTHING BY DEFAULT] representativeFragmentTestFeatures.representativeFragmentTest = enabledFeatures.representativeFragmentTest; diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 1f619666ab..d86a5a582d 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -963,6 +963,32 @@ template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +bool IGPUCommandBuffer::bindGraphicsPipeline(const IGPUGraphicsPipeline* const pipeline) +{ + // Because binding of the Gfx pipeline can happen outside of a Renderpass Scope, + // we cannot check renderpass-pipeline compatibility here. + // And checking before every drawcall would be performance suicide. + if (!checkStateBeforeRecording(queue_flags_t::GRAPHICS_BIT)) + return false; + + if (!pipeline || !this->isCompatibleDevicewise(pipeline)) + { + NBL_LOG_ERROR("incompatible pipeline device!"); + return false; + } + + if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(pipeline))) + { + NBL_LOG_ERROR("out of host memory!"); + return false; + } + + m_boundGraphicsPipeline = pipeline; + + m_noCommands = false; + return bindGraphicsPipeline_impl(pipeline); +} + bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT)) @@ -988,7 +1014,7 @@ bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pip return true; } -bool IGPUCommandBuffer::bindGraphicsPipeline(const IGPUGraphicsPipeline* const pipeline) +bool IGPUCommandBuffer::bindMeshPipeline(const IGPUMeshPipeline* const pipeline) { // Because binding of the Gfx pipeline can happen outside of a Renderpass Scope, // we cannot check 
renderpass-pipeline compatibility here. @@ -1002,18 +1028,19 @@ bool IGPUCommandBuffer::bindGraphicsPipeline(const IGPUGraphicsPipeline* const p return false; } - if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(pipeline))) + if (!m_cmdpool->m_commandListPool.emplace(m_commandList, core::smart_refctd_ptr(pipeline))) { NBL_LOG_ERROR("out of host memory!"); return false; } - m_boundGraphicsPipeline = pipeline; + m_boundMeshPipeline = pipeline; m_noCommands = false; - return bindGraphicsPipeline_impl(pipeline); + return bindMeshPipeline_impl(pipeline); } + bool IGPUCommandBuffer::bindRayTracingPipeline(const IGPURayTracingPipeline* const pipeline) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT)) @@ -1421,9 +1448,18 @@ bool IGPUCommandBuffer::copyQueryPoolResults( return copyQueryPoolResults_impl(queryPool, firstQuery, queryCount, dstBuffer, stride, flags); } - bool IGPUCommandBuffer::dispatch(const uint32_t groupCountX, const uint32_t groupCountY, const uint32_t groupCountZ) { + /* + * potentially do something like this here. 
+ const bool whollyInsideRenderpass = m_recordingFlags.hasFlags(USAGE::RENDER_PASS_CONTINUE_BIT); + auto allowedQueueCaps = queue_flags_t::GRAPHICS_BIT; + auto allowedRenderpassScope = inside; + if (!whollyInsideRenderpass) + allowedQueueCaps = queue_flags_t::COMPUTE_BIT; + allowedRenderpassScope = outside; + */ + if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -1446,9 +1482,8 @@ bool IGPUCommandBuffer::dispatch(const uint32_t groupCountX, const uint32_t grou bool IGPUCommandBuffer::dispatchIndirect(const asset::SBufferBinding& binding) { - if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT,RENDERPASS_SCOPE::OUTSIDE)) + if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT, RENDERPASS_SCOPE::OUTSIDE)) return false; - if (invalidBufferBinding(binding,4u/*TODO: is it really 4?*/,IGPUBuffer::EUF_INDIRECT_BUFFER_BIT)) return false; @@ -1462,6 +1497,60 @@ bool IGPUCommandBuffer::dispatchIndirect(const asset::SBufferBindinggetPhysicalDevice()->getLimits(); + if (groupCountX > limits.maxMeshWorkGroupCount[0] || groupCountY > limits.maxMeshWorkGroupCount[1] || groupCountZ > limits.maxMeshWorkGroupCount[2]) + { + NBL_LOG_ERROR("group counts (%d, %d, %d) exceeds maximum counts (%d, %d, %d)!", groupCountX, groupCountY, groupCountZ, limits.maxMeshWorkGroupCount[0], limits.maxMeshWorkGroupCount[1], limits.maxMeshWorkGroupCount[2]); + return false; + } + + m_noCommands = false; + return drawMeshTasks_impl(groupCountX, groupCountY, groupCountZ); +} + +bool IGPUCommandBuffer::drawMeshTasksIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) +{ + if (!checkStateBeforeRecording(queue_flags_t::GRAPHICS_BIT,RENDERPASS_SCOPE::INSIDE)) + return false; + if (invalidBufferBinding(binding,4u/*TODO: is it really 4?*/,IGPUBuffer::EUF_INDIRECT_BUFFER_BIT)){ + return false; + } + + if (drawCount) { + if (drawCount==1u) + stride = sizeof(DrawMeshTasksIndirectCommand_t); + if (stride&0x3u 
|| stride getOriginDevice()->getPhysicalDevice()->getLimits().maxDrawIndirectCount) { + NBL_LOG_ERROR("draw count (%d) exceeds maximum allowed amount (%d)!", drawCount, getOriginDevice()->getPhysicalDevice()->getLimits().maxDrawIndirectCount); + return false; + } + if (invalidBufferRange({ binding.offset,stride * (drawCount - 1u) + sizeof(IndirectCommand),binding.buffer }, alignof(uint32_t), IGPUBuffer::EUF_INDIRECT_BUFFER_BIT)) + return false; + } // i get the feeling the vk command shouldnt be called if drawCount is 0, but this is how drawindirect does it + + if (!m_cmdpool->m_commandListPool.emplace(m_commandList,core::smart_refctd_ptr(binding.buffer))) + { + NBL_LOG_ERROR("out of host memory!"); + return false; + } + + m_noCommands = false; + return drawMeshTasksIndirect_impl(binding, drawCount, stride); +} + bool IGPUCommandBuffer::beginRenderPass(SRenderpassBeginInfo info, const SUBPASS_CONTENTS contents) { diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 7c3f5dbb81..561574b83d 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -833,6 +833,172 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac return retval; } +bool MeshGraphicsCommonValidation( + const IGPURenderpass* renderpass, uint8_t subpassIndex, + SPhysicalDeviceLimits const& limits, SPhysicalDeviceFeatures const& features, + nbl::asset::SRasterizationParams const& rasterParams, nbl::asset::SBlendParams const& blendParams, + const system::logger_opt_ptr m_logger, + const IPhysicalDevice::SFormatImageUsages& formatUsages +) { + if (rasterParams.alphaToOneEnable && !features.alphaToOne) + { + NBL_LOG_ERROR("Feature `alpha to one` is not enabled"); + return false; + } + if (rasterParams.depthBoundsTestEnable && !features.depthBounds) + { + NBL_LOG_ERROR("Feature `depth bounds` is not enabled"); + return false; + } + const auto samples = 0x1u << rasterParams.samplesLog2; + + const auto& passParams = 
renderpass->getCreationParameters(); + const auto& subpass = passParams.subpasses[subpassIndex]; + if (subpass.viewMask) + { + /* + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06047 + if (!limits.multiviewTessellationShader && .test(tesS_contrOL)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06048 + if (!limits.multiviewGeomtryShader && .test(GEOMETRY)) + return false; + */ + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06578 + //NOTE: index of MSB must be less than maxMultiviewViewCount; wrong negation here, should be >= + if (hlsl::findMSB(subpass.viewMask) > limits.maxMultiviewViewCount) + { + NBL_LOG_ERROR("Invalid viewMask (params[%u])", subpassIndex); + return false; + } + } + if (subpass.depthStencilAttachment.render.used()) + { + const auto& attachment = passParams.depthStencilAttachments[subpass.depthStencilAttachment.render.attachmentIndex]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-multisampledRenderToSingleSampled-06853 + bool sampleCountNeedsToMatch = !features.mixedAttachmentSamples /*&& !features.multisampledRenderToSingleSampled*/; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-subpass-01411 + if (/*detect NV version && */(rasterParams.depthTestEnable() || rasterParams.stencilTestEnable() || rasterParams.depthBoundsTestEnable)) + sampleCountNeedsToMatch = true; + if (sampleCountNeedsToMatch && attachment.samples != samples) + { + NBL_LOG_ERROR("Depth stencil and rasterization samples need to match (params[%u])", subpassIndex); + return 
false; + } + } + for (auto i = 0; i < IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments; i++) + { + const auto& render = subpass.colorAttachments[i].render; + if (render.used()) + { + const auto& attachment = passParams.colorAttachments[render.attachmentIndex]; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06041 + if (blendParams.blendParams[i].blendEnabled() && !formatUsages[attachment.format].attachmentBlend) + { + NBL_LOG_ERROR("Invalid color attachment (params[%u].colorAttachments[%u])", subpassIndex, i); + return false; + } + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-multisampledRenderToSingleSampled-06853 + if (!features.mixedAttachmentSamples /*&& !features.multisampledRenderToSingleSampled*/ && attachment.samples != samples) + { + NBL_LOG_ERROR("Color attachment and rasterization samples need to match (params[%u].colorAttachments[%u])", subpassIndex, i); + return false; + } + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-subpass-01412 + if (/*detect NV version && */(attachment.samples > samples)) + { + NBL_LOG_ERROR("Invalid color attachment (params[%u].colorAttachments[%u])", subpassIndex, i); + return false; + } + } + } + + return true; +} + +//this is a COPY of graphics pipeline, with MINOR adjustments. 
+//no changes should be made DIRECTLY here +//UNLESS it's DIRECTLY for mesh/task +//there SHOULD be a function duplicates functionality between graphics and mesh pipeline that can be adjusted first +bool ILogicalDevice::createMeshPipelines( + IGPUPipelineCache* const pipelineCache, + const std::span params, + core::smart_refctd_ptr* const output +) { + std::fill_n(output, params.size(), nullptr); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache, params); + if (!specConstantValidation) { + NBL_LOG_ERROR("Invalid parameters were given"); + return false; + } + + const auto& features = getEnabledFeatures(); + const auto& limits = getPhysicalDeviceLimits(); + + core::vector newParams(params.begin(), params.end()); + const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + {return sum + param.getShaderCount();} + ); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); + + for (auto ix = 0u; ix < params.size(); ix++) + { + const auto& ci = params[ix]; + + if (params[ix].taskShader.shader != nullptr) { + if (!features.taskShader) { + NBL_LOG_ERROR("Feature `mesh shader` is not enabled"); + return false; + } + } + + //check extensions here + //it SEEMS like createGraphicsPipeline does, but it does it in a weird way I don't understand? + //geo and tess are just flat disabled?? 
+ if (!features.meshShader) { + NBL_LOG_ERROR("Feature `mesh shader` is not enabled"); + return false; + } + + auto renderpass = ci.renderpass; + if (!renderpass->wasCreatedBy(this)) { + NBL_LOG_ERROR("Invalid renderpass was given (params[%u])", ix); + return false; + } + + + MeshGraphicsCommonValidation(renderpass, ci.cached.subpassIx, limits, features, ci.cached.rasterization, ci.cached.blend, m_logger, getPhysicalDevice()->getImageFormatUsagesOptimalTiling()); + + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(ci.taskShader, hlsl::ShaderStage::ESS_TASK); + trimTask.insertEntryPoint(ci.meshShader, hlsl::ShaderStage::ESS_MESH); + trimTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + + newParams[ix].taskShader = trimTask.trim(ci.taskShader, trimmedShaders); + newParams[ix].meshShader = trimTask.trim(ci.meshShader, trimmedShaders); + newParams[ix].fragmentShader = trimTask.trim(ci.fragmentShader, trimmedShaders); + } + createMeshPipelines_impl(pipelineCache, newParams, output, specConstantValidation); + + for (auto i = 0u; i < params.size(); i++) + { + if (!output[i]) + { + NBL_LOG_ERROR("MeshPipeline was not created (params[%u])", i); + return false; + } + else + { + m_logger.log("shader[%d] mesh debug name - %s\n", nbl::system::ILogger::ELL_DEBUG, i, params[i].meshShader.shader->getDebugName()); + // TODO: set pipeline debug name thats a concatenation of all active stages' shader file path hints + } + } + return true; +} + bool ILogicalDevice::createGraphicsPipelines( IGPUPipelineCache* const pipelineCache, const std::span params, @@ -888,88 +1054,13 @@ bool ILogicalDevice::createGraphicsPipelines( return false; } - const auto& rasterParams = ci.cached.rasterization; - if (rasterParams.alphaToOneEnable && !features.alphaToOne) - { - NBL_LOG_ERROR("Feature `alpha to one` is not enabled"); - return false; - } - if (rasterParams.depthBoundsTestEnable && !features.depthBounds) - { - NBL_LOG_ERROR("Feature 
`depth bounds` is not enabled"); - return false; - } - - const auto samples = 0x1u << rasterParams.samplesLog2; - // TODO: loads more validation on extra parameters here! // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-lineRasterizationMode-02766 // TODO: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-subpass-01505 // baiscally the AMD version must have the rasterization samples equal to the maximum of all attachment samples counts - const auto& passParams = renderpass->getCreationParameters(); - const auto& subpass = passParams.subpasses[ci.cached.subpassIx]; - if (subpass.viewMask) - { - /* - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06047 - if (!limits.multiviewTessellationShader && .test(tesS_contrOL)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06048 - if (!limits.multiviewGeomtryShader && .test(GEOMETRY)) - return false; - */ - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06578 - //NOTE: index of MSB must be less than maxMultiviewViewCount; wrong negation here, should be >= - if (hlsl::findMSB(subpass.viewMask) > limits.maxMultiviewViewCount) - { - NBL_LOG_ERROR("Invalid viewMask (params[%u])", ix); - return false; - } - } - if (subpass.depthStencilAttachment.render.used()) - { - const auto& attachment = passParams.depthStencilAttachments[subpass.depthStencilAttachment.render.attachmentIndex]; - - // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-multisampledRenderToSingleSampled-06853 - bool sampleCountNeedsToMatch = !features.mixedAttachmentSamples /*&& !features.multisampledRenderToSingleSampled*/; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-subpass-01411 - if (/*detect NV version && */(rasterParams.depthTestEnable() || rasterParams.stencilTestEnable() || rasterParams.depthBoundsTestEnable)) - sampleCountNeedsToMatch = true; - if (sampleCountNeedsToMatch && attachment.samples != samples) - { - NBL_LOG_ERROR("Invalid depth stencil attachment (params[%u])", ix); - return false; - } - } - for (auto i = 0; i < IGPURenderpass::SCreationParams::SSubpassDescription::MaxColorAttachments; i++) - { - const auto& render = subpass.colorAttachments[i].render; - if (render.used()) - { - const auto& attachment = passParams.colorAttachments[render.attachmentIndex]; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-renderPass-06041 - if (ci.cached.blend.blendParams[i].blendEnabled() && !getPhysicalDevice()->getImageFormatUsagesOptimalTiling()[attachment.format].attachmentBlend) - { - NBL_LOG_ERROR("Invalid color attachment (params[%u].colorAttachments[%u])", ix, i); - return false; - } - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-multisampledRenderToSingleSampled-06853 - if (!features.mixedAttachmentSamples /*&& !features.multisampledRenderToSingleSampled*/ && attachment.samples != samples) - { - NBL_LOG_ERROR("Invalid color attachment (params[%u].colorAttachments[%u])", ix, i); - return false; - } - // 
https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-subpass-01412 - if (/*detect NV version && */(attachment.samples > samples)) - { - NBL_LOG_ERROR("Invalid color attachment (params[%u].colorAttachments[%u])", ix, i); - return false; - } - } - } + MeshGraphicsCommonValidation(renderpass, ci.cached.subpassIx, limits, features, ci.cached.rasterization, ci.cached.blend, m_logger, getPhysicalDevice()->getImageFormatUsagesOptimalTiling()); SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); trimTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); diff --git a/src/nbl/video/device_capabilities/device_features.json b/src/nbl/video/device_capabilities/device_features.json index 5e4775e9b4..3cdaee820d 100644 --- a/src/nbl/video/device_capabilities/device_features.json +++ b/src/nbl/video/device_capabilities/device_features.json @@ -50,6 +50,16 @@ "type": "bool", "name": "tessellationShader", "value": false + }, + { + "type": "bool", + "name": "meshShader", + "value": false + }, + { + "type": "bool", + "name": "taskShader", + "value": false } ] }, diff --git a/src/nbl/video/device_capabilities/device_limits.json b/src/nbl/video/device_capabilities/device_limits.json index e8bc3a3af4..b1f8852f00 100644 --- a/src/nbl/video/device_capabilities/device_limits.json +++ b/src/nbl/video/device_capabilities/device_limits.json @@ -339,6 +339,123 @@ } ] }, + { + "comment": ["VkPhysicalDeviceMeshShaderPropertiesEXT - task"], + "entries":[ + { + "type": "uint32_t", + "name": "maxTaskWorkGroupTotalCount", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxTaskWorkGroupCount[3]", + "value": "{MinMaxWorkgroupCount,MinMaxWorkgroupCount,MinMaxWorkgroupCount}" + }, + { + "type": "uint32_t", + "name": "maxTaskWorkGroupInvocations", + "value": "MinMaxWorkgroupInvocations" + }, + { + "type": "uint32_t", + "name": "maxTaskWorkGroupSize[3]", + "value": 
"{MinMaxWorkgroupInvocations,MinMaxWorkgroupInvocations,64u}" + }, + { + "type": "uint32_t", + "name": "maxTaskPayloadSize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxTaskSharedMemorySize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxTaskPayloadAndSharedMemorySize", + "value": 0 + } + ] + }, + { + "comment": ["VkPhysicalDeviceMeshShaderPropertiesEXT - mesh"], + "entries":[ + { + "type": "uint32_t", + "name": "maxMeshWorkGroupCount[3]", + "value": "{MinMaxWorkgroupCount,MinMaxWorkgroupCount,MinMaxWorkgroupCount}" + }, + { + "type": "uint32_t", + "name": "maxMeshWorkGroupInvocations", + "value": "MinMaxWorkgroupInvocations" + }, + { + "type": "uint32_t", + "name": "maxMeshWorkGroupSize[3]", + "value": "{MinMaxWorkgroupInvocations,MinMaxWorkgroupInvocations,64u}" + }, + { + "type": "uint32_t", + "name": "maxMeshSharedMemorySize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshPayloadAndSharedMemorySize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputMemorySize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshPayloadAndOutputMemorySize", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputComponents", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputVertices", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputPrimitives", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputLayers", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshMultiviewViewCount", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputPerVertexGranularity", + "value": 0 + }, + { + "type": "uint32_t", + "name": "maxMeshOutputPerPrimitiveGranularity", + "value": 0 + } + + + ] + }, { "comment": [], "entries": [ diff --git a/tools/nsc/main.cpp b/tools/nsc/main.cpp index c4ce43b326..edc56de84c 100644 --- a/tools/nsc/main.cpp +++ b/tools/nsc/main.cpp @@ -153,6 +153,7 @@ class ShaderCompiler final : public 
system::IApplicationFramework }); }; + auto preprocessOnly = findOutputFlag("-P") != m_arguments.end(); auto output_flag_pos_fc = findOutputFlag("-Fc"); auto output_flag_pos_fo = findOutputFlag("-Fo"); if (output_flag_pos_fc != m_arguments.end() && output_flag_pos_fo != m_arguments.end()) { @@ -195,7 +196,8 @@ class ShaderCompiler final : public system::IApplicationFramework return false; } - m_logger->log("Compiled shader code will be saved to " + output_filepath, ILogger::ELL_INFO); + std::string outputType = preprocessOnly ? "Preprocessed" : "Compiled"; + m_logger->log(outputType + " shader code will be saved to " + output_filepath, ILogger::ELL_INFO); } #ifndef NBL_EMBED_BUILTIN_RESOURCES @@ -227,13 +229,27 @@ class ShaderCompiler final : public system::IApplicationFramework } auto start = std::chrono::high_resolution_clock::now(); - auto compilation_result = compile_shader(shader.get(), shaderStage, file_to_compile); + smart_refctd_ptr compilation_result; + std::string preprocessing_result; + std::string_view result_view; + if (preprocessOnly) + { + preprocessing_result = preprocess_shader(shader.get(), shaderStage, file_to_compile); + result_view = preprocessing_result; + } + else + { + compilation_result = compile_shader(shader.get(), shaderStage, file_to_compile); + result_view = { (const char*)compilation_result->getContent()->getPointer(), compilation_result->getContent()->getSize() }; + } auto end = std::chrono::high_resolution_clock::now(); - // writie compiled shader to file as bytes - if (compilation_result) + // write compiled/preprocessed shader to file as bytes + std::string operationType = preprocessOnly ? "preprocessing" : "compilation"; + const bool success = preprocessOnly ? 
preprocessing_result != std::string{} : bool(compilation_result); + if (success) { - m_logger->log("Shader compilation successful.", ILogger::ELL_INFO); + m_logger->log("Shader " + operationType + " successful.", ILogger::ELL_INFO); const auto took = std::to_string(std::chrono::duration_cast(end - start).count()); m_logger->log("Took %s ms.", ILogger::ELL_PERFORMANCE, took.c_str()); { @@ -258,7 +274,7 @@ class ShaderCompiler final : public system::IApplicationFramework return false; } - output_file.write((const char*)compilation_result->getContent()->getPointer(), compilation_result->getContent()->getSize()); + output_file.write(result_view.data(), result_view.size()); if (output_file.fail()) { @@ -279,7 +295,7 @@ class ShaderCompiler final : public system::IApplicationFramework } else { - m_logger->log("Shader compilation failed.", ILogger::ELL_ERROR); + m_logger->log("Shader " + operationType + " failed.", ILogger::ELL_ERROR); return false; } } @@ -291,6 +307,28 @@ class ShaderCompiler final : public system::IApplicationFramework private: + std::string preprocess_shader(const IShader* shader, hlsl::ShaderStage shaderStage, std::string_view sourceIdentifier) { + smart_refctd_ptr hlslcompiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + + CHLSLCompiler::SPreprocessorOptions options = {}; + options.sourceIdentifier = sourceIdentifier; + options.logger = m_logger.get(); + + auto includeFinder = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + auto includeLoader = includeFinder->getDefaultFileSystemLoader(); + + // because before real compilation we do preprocess the input it doesn't really matter we proxy include search direcotries further with dxcOptions since at the end all includes are resolved to single file + for (const auto& it : m_include_search_paths) + includeFinder->addSearchPath(it, includeLoader); + + options.includeFinder = includeFinder.get(); + + const char* code_ptr = (const char*)shader->getContent()->getPointer(); + std::string_view 
code({ code_ptr, strlen(code_ptr)}); + + return hlslcompiler->preprocessShader(std::string(code), shaderStage, options, nullptr); + } + core::smart_refctd_ptr compile_shader(const IShader* shader, hlsl::ShaderStage shaderStage, std::string_view sourceIdentifier) { smart_refctd_ptr hlslcompiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system));