From f2ea51d0b3e3388c0f9bae03602ec3b1f658c124 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Sun, 23 Mar 2025 19:29:49 -0300
Subject: [PATCH 01/57] Morton code tests

---
 CMakeLists.txt                       |  3 +-
 XX_Mortons/CMakeLists.txt            | 24 ++++++++++
 XX_Mortons/app_resources/shader.hlsl |  7 +++
 XX_Mortons/config.json.template      | 28 +++++++++++
 XX_Mortons/main.cpp                  | 69 ++++++++++++++++++++++++++++
 XX_Mortons/pipeline.groovy           | 50 ++++++++++++++++++++
 6 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 XX_Mortons/CMakeLists.txt
 create mode 100644 XX_Mortons/app_resources/shader.hlsl
 create mode 100644 XX_Mortons/config.json.template
 create mode 100644 XX_Mortons/main.cpp
 create mode 100644 XX_Mortons/pipeline.groovy

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb03f95a4..7fcddfc18 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,8 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL)
 	add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL)
 
-  add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+  	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
+	add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()
diff --git a/XX_Mortons/CMakeLists.txt b/XX_Mortons/CMakeLists.txt
new file mode 100644
index 000000000..a434ff32a
--- /dev/null
+++ b/XX_Mortons/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl
new file mode 100644
index 000000000..a24a78191
--- /dev/null
+++ b/XX_Mortons/app_resources/shader.hlsl
@@ -0,0 +1,7 @@
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
+[numthreads(512, 1, 1)]
+void main(uint32_t3 ID : SV_DispatchThreadID)
+{
+	printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array<uint32_t, 2>::Masks[0], nbl::hlsl::morton::impl::decode_masks_array<uint32_t, 2>::Masks[1]);
+}
\ No newline at end of file
diff --git a/XX_Mortons/config.json.template b/XX_Mortons/config.json.template
new file mode 100644
index 000000000..717d05d53
--- /dev/null
+++ b/XX_Mortons/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp
new file mode 100644
index 000000000..881c84417
--- /dev/null
+++ b/XX_Mortons/main.cpp
@@ -0,0 +1,69 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
+#include "nbl/application_templates/MonoDeviceApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include <bitset>
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+
+// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms
+class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+		using device_base_t = application_templates::MonoDeviceApplication;
+		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+		inline core::smart_refctd_ptr<video::IGPUShader> createShader(
+			const char* includeMainName)
+		{
+			std::string prelude = "#include \"";
+			auto CPUShader = core::make_smart_refctd_ptr<ICPUShader>((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
+			assert(CPUShader);
+			return m_device->createShader(CPUShader.get());
+		}
+	public:
+		MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+		// we stuff all our work here because its a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			createShader("app_resources/shader.hlsl");
+
+			const auto masksArray = hlsl::morton::impl::decode_masks_array<uint32_t, 3>::Masks;
+			for (auto i = 0u; i < 3; i++)
+			{
+				std::cout << std::bitset<32>(masksArray[i]) << std::endl;
+			}
+
+			return true;
+		}
+
+		// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
+		void workLoopBody() override {}
+
+		// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
+		bool keepRunning() override {return false;}
+
+	private:
+		smart_refctd_ptr<nbl::video::CVulkanConnection> m_api;
+};
+
+
+NBL_MAIN_FUNC(MortonTestApp)
\ No newline at end of file
diff --git a/XX_Mortons/pipeline.groovy b/XX_Mortons/pipeline.groovy
new file mode 100644
index 000000000..1a7b043a4
--- /dev/null
+++ b/XX_Mortons/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file

From 8f4e4529ca6f31ace6498cf9ac4284c14dbdf652 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 25 Mar 2025 10:44:31 -0300
Subject: [PATCH 02/57] Morton codes creating properly

---
 XX_Mortons/app_resources/common.hlsl |  10 ++
 XX_Mortons/app_resources/shader.hlsl |  15 +-
 XX_Mortons/main.cpp                  | 241 ++++++++++++++++++++++++++-
 3 files changed, 259 insertions(+), 7 deletions(-)
 create mode 100644 XX_Mortons/app_resources/common.hlsl

diff --git a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl
new file mode 100644
index 000000000..3a9fca3fa
--- /dev/null
+++ b/XX_Mortons/app_resources/common.hlsl
@@ -0,0 +1,10 @@
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
+NBL_CONSTEXPR uint32_t bufferSize = 256;
+using scalar_t = int32_t;
+using unsigned_scalar_t = nbl::hlsl::make_unsigned_t<scalar_t>;
+
+struct PushConstantData
+{
+	uint64_t deviceBufferAddress;
+};
\ No newline at end of file
diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl
index a24a78191..d1f7c967e 100644
--- a/XX_Mortons/app_resources/shader.hlsl
+++ b/XX_Mortons/app_resources/shader.hlsl
@@ -1,7 +1,16 @@
-#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
 
-[numthreads(512, 1, 1)]
+[[vk::push_constant]] PushConstantData pushConstants;
+
+using namespace nbl::hlsl;
+
+[numthreads(bufferSize, 1, 1)]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
-	printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array<uint32_t, 2>::Masks[0], nbl::hlsl::morton::impl::decode_masks_array<uint32_t, 2>::Masks[1]);
+	LegacyBdaAccessor<unsigned_scalar_t> accessor = LegacyBdaAccessor<unsigned_scalar_t>::create(pushConstants.deviceBufferAddress);
+	
+	morton::code<int32_t, 2> foo = morton::code<int32_t, 2>::create(vector<int32_t, 2>(-32768, -1));
+
+	accessor.set(0, foo.value);
 }
\ No newline at end of file
diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp
index 881c84417..860b581d2 100644
--- a/XX_Mortons/main.cpp
+++ b/XX_Mortons/main.cpp
@@ -7,7 +7,7 @@
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
-#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "app_resources/common.hlsl"
 #include <bitset>
 
 using namespace nbl;
@@ -16,7 +16,6 @@ using namespace system;
 using namespace asset;
 using namespace video;
 
-
 // this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms
 class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
 {
@@ -44,14 +43,221 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
 
-			createShader("app_resources/shader.hlsl");
+			auto shader = createShader("app_resources/shader.hlsl");
+
+			// Create massive upload/download buffers
+			constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23;
+			constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23;
+
+			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+			if (!m_utils)
+				return logFail("Failed to create Utilities!");
+			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+			m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
+			m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
+
+			// Create device-local buffer
+			{
+				IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
+
+				IQueue* const queue = getComputeQueue();
+				uint32_t queueFamilyIndex = queue->getFamilyIndex();
+
+				deviceLocalBufferParams.queueFamilyIndexCount = 1;
+				deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
+				deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize;
+				deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+				m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
+				auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
+				mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+				auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+
+				m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress();
+			}
+
+			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) };
+
+			{
+				auto layout = m_device->createPipelineLayout({ &pcRange,1 });
+				IGPUComputePipeline::SCreationParams params = {};
+				params.layout = layout.get();
+				params.shader.shader = shader.get();
+				params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+				params.shader.requireFullSubgroups = true;
+				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
+					return logFail("Failed to create compute pipeline!\n");
+			}
+
+			const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+			// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+			// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+			// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+			// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+			// We'll align to max of coherent atom size even if the memory is coherent,
+			// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
+			m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float));
+
+			// Semaphor used here to know the FFT is done before download
+			m_timeline = m_device->createSemaphore(semaphorValue);
+
+			IQueue* const queue = getComputeQueue();
+
+			const uint32_t inputSize = sizeof(unsigned_scalar_t) * bufferSize;
+
+			// Just need a single suballocation in this example
+			const uint32_t AllocationCount = 1;
+
+			// It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value
+			// this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args.
+			auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+			// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
+			// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
+			std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+			// note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
+			m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
+
+			// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
+			{
+				auto* const inputPtr = reinterpret_cast<unsigned_scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
+				for (auto j = 0; j < bufferSize; j++)
+				{
+					unsigned_scalar_t x = j > 0 ? 0.f : 2.f;
+					unsigned_scalar_t y = 0;
+
+					/*
+					unsigned_scalar_t x = 1.f;
+					unsigned_scalar_t y = 0.f;
+					*/
+
+					inputPtr[2 * j] = x;
+					inputPtr[2 * j + 1] = y;
+				}
+				// Always remember to flush!
+				if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+				{
+					const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
+					const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
+					m_device->flushMappedMemoryRanges(1, &range);
+				}
+			}
+
+			// finally allocate our output range
+			const uint32_t outputSize = inputSize;
 
+			auto outputOffset = m_downStreamingBuffer->invalid_value;
+			m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+			{
+				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) {
+					return logFail("Failed to create Command Buffers!\n");
+				}
+				cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger));
+				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+				cmdbuf->bindComputePipeline(m_pipeline.get());
+				// This is the new fun part, pushing constants
+				const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress };
+				IGPUCommandBuffer::SBufferCopy copyInfo = {};
+				copyInfo.srcOffset = 0;
+				copyInfo.dstOffset = 0;
+				copyInfo.size = m_deviceLocalBuffer->getSize();
+				cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo);
+				cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+				// Remember we do a single workgroup per 1D array in these parts
+				cmdbuf->dispatch(1, 1, 1);
+
+				// Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer 
+				IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {};
+
+				decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {};
+				pipelineBarrierInfo.bufBarriers = { &barrier, 1u };
+
+				barrier.range.buffer = m_deviceLocalBuffer;
+
+				barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+				barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+				barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+				barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+
+				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+				cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
+				cmdbuf->end();
+			}
+
+			semaphorValue++;
+			{
+				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+				{
+					.cmdbuf = cmdbuf.get()
+				};
+				const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+				{
+					.semaphore = m_timeline.get(),
+					.value = semaphorValue,
+					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+				};
+
+				const IQueue::SSubmitInfo submitInfo = {
+					.waitSemaphores = {},
+					.commandBuffers = {&cmdbufInfo,1},
+					.signalSemaphores = {&signalInfo,1}
+				};
+
+				m_api->startCapture();
+				queue->submit({ &submitInfo,1 });
+				m_api->endCapture();
+			}
+
+			// We let all latches know what semaphore and counter value has to be passed for the functors to execute
+			const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue };
+
+			// As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
+			// You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation.
+			m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
+
+			// Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
+			// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
+			// Its nice because it will also remember to invalidate our memory mapping if its not coherent.
+			auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+				IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
+				// Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
+				[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+				{
+					// The unused variable is used for letting the consumer know the subsection of the output we've managed to download
+					// But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
+					assert(dstOffset == 0 && size == outputSize);
+
+					std::cout << "Begin array GPU\n";
+					unsigned_scalar_t* const data = reinterpret_cast<unsigned_scalar_t*>(const_cast<void*>(bufSrc));
+					std::cout << std::bitset<32>(data[0]) << "\n";
+					/*
+					for (auto i = 0u; i < bufferSize; i++) {
+						std::cout << std::bitset<32>(data[i]) << "\n";
+					}
+					*/
+					std::cout << "\nEnd array GPU\n";
+				},
+				// Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it
+				// hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands.
+				// It could also be latched in the upstreaming deallocate, because its the same fence.
+				std::move(cmdbuf), m_downStreamingBuffer
+			);
+			// We put a function we want to execute 
+			m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
+
+			// ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------
 			const auto masksArray = hlsl::morton::impl::decode_masks_array<uint32_t, 3>::Masks;
 			for (auto i = 0u; i < 3; i++)
 			{
 				std::cout << std::bitset<32>(masksArray[i]) << std::endl;
 			}
 
+			const auto someCode = hlsl::morton::code<uint32_t, 4>::create(hlsl::vector<uint32_t, 4>(1, 1, 1, 1));
+
 			return true;
 		}
 
@@ -61,8 +267,35 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 		// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
 		bool keepRunning() override {return false;}
 
+		// Cleanup
+		bool onAppTerminated() override
+		{
+			// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
+			// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
+			while (m_downStreamingBuffer->cull_frees()) {}
+			return device_base_t::onAppTerminated();
+		}
+
 	private:
-		smart_refctd_ptr<nbl::video::CVulkanConnection> m_api;
+		smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+		smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+
+		nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+		smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
+
+		// These are Buffer Device Addresses
+		uint64_t m_upStreamingBufferAddress;
+		uint64_t m_downStreamingBufferAddress;
+		uint64_t m_deviceLocalBufferAddress;
+
+		// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!)
+		uint32_t m_alignment;
+
+		// This example really lets the advantages of a timeline semaphore shine through!
+		smart_refctd_ptr<ISemaphore> m_timeline;
+		uint64_t semaphorValue = 0;
 };
 
 

From 0aedfd929a505657ef761c84be15cfaf8d4ddb7b Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Fri, 28 Mar 2025 20:16:45 -0300
Subject: [PATCH 03/57] All tests passing, HLSL compiles fine!

---
 XX_Mortons/main.cpp | 235 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 177 insertions(+), 58 deletions(-)

diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp
index 860b581d2..b20662904 100644
--- a/XX_Mortons/main.cpp
+++ b/XX_Mortons/main.cpp
@@ -10,6 +10,9 @@
 #include "app_resources/common.hlsl"
 #include <bitset>
 
+// Right now the test only checks that HLSL compiles the file
+constexpr bool TestHLSL = true;
+
 using namespace nbl;
 using namespace core;
 using namespace system;
@@ -22,6 +25,12 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 		using device_base_t = application_templates::MonoDeviceApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
+		using morton_t = nbl::hlsl::morton::code<int32_t, 3>;
+		using vector_t = nbl::hlsl::vector<int32_t, 3>;
+		using unsigned_morton_t = nbl::hlsl::morton::code<uint32_t, 3>;
+		using unsigned_vector_t = nbl::hlsl::vector<uint32_t, 3>;
+		using bool_vector_t = nbl::hlsl::vector<bool, 3>;
+
 		inline core::smart_refctd_ptr<video::IGPUShader> createShader(
 			const char* includeMainName)
 		{
@@ -43,18 +52,173 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
 
+			// ----------------------------------------------- CPP TESTS ----------------------------------------------------------------------
+			
+			// Coordinate extraction and whole vector decode tests
+			{
+				morton_t morton(vector_t(-1011, 765, 248));
+				unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011));
+
+				assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && morton.getCoordinate(2) == 248);
+				assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u);
+
+				assert(static_cast<vector_t>(morton) == vector_t(-1011, 765, 248) && static_cast<unsigned_vector_t>(unsignedMorton) == unsigned_vector_t(154, 789, 1011));
+			}
+
+			// ***********************************************************************************************************************************
+			// ************************************************* Arithmetic operator tests *******************************************************
+			// ***********************************************************************************************************************************
+			
+			//  ----------------------------------------------------------------------------------------------------
+			//  --------------------------------------- ADDITION ---------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// ---------------------------------------- Signed -----------------------------------------------------
+			
+			// No overflow
+			assert(static_cast<vector_t>(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448));
+			
+			// Type 1 overflow: Addition of representable coordinates goes out of range
+			assert(static_cast<vector_t>(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504));
+
+			// Type 2 overflow: Addition of irrepresentable range gives correct result
+			assert(static_cast<vector_t>(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224));
+
+			// ---------------------------------------- Unsigned -----------------------------------------------------
+
+			// No overflow
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763));
+
+			// Type 1 overflow: Addition of representable coordinates goes out of range
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519));
+
+			// Type 2 overflow: Addition of irrepresentable range gives correct result
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310));
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- SUBTRACTION -------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// ---------------------------------------- Signed -----------------------------------------------------
+
+			// No overflow
+			assert(static_cast<vector_t>(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465));
+
+			// Type 1 overflow: Subtraction of representable coordinates goes out of range
+			assert(static_cast<vector_t>(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504));
+
+			// Type 2 overflow: Subtraction of irrepresentable range gives correct result
+			assert(static_cast<vector_t>(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224));
+
+			// ---------------------------------------- Unsigned -----------------------------------------------------
+
+			// No overflow
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244));
+
+			// Type 1 overflow: Subtraction of representable coordinates goes out of range
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(430, 958, 567));
+
+			// Type 2 overflow: Subtraction of irrepresentable range gives correct result
+			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485));
+
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- UNARY NEGATION ----------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// Only makes sense for signed
+			assert(static_cast<vector_t>(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475));
+
+			// ***********************************************************************************************************************************
+			// ************************************************* Comparison operator tests *******************************************************
+			// ***********************************************************************************************************************************
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- OPERATOR< ---------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// Signed
+			
+			// Same sign, negative
+			assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false));
+			// Same sign, positive
+			assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true));
+			// Differing signs
+			assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false));
+
+			// Unsigned
+			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false));
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- OPERATOR<= --------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// Signed
+
+			// Same sign, negative
+			assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false));
+			// Same sign, positive
+			assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true));
+			// Differing signs
+			assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false));
+
+			// Unsigned
+			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false));
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- OPERATOR> ---------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// Signed
+
+			// Same sign, negative
+			assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true));
+			// Same sign, positive
+			assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false));
+			// Differing signs
+			assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true));
+
+			// Unsigned
+			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true));
+
+			//  ----------------------------------------------------------------------------------------------------
+			//  -------------------------------------- OPERATOR>= --------------------------------------------------
+			//  ----------------------------------------------------------------------------------------------------
+
+			// Signed
+
+			// Same sign, negative
+			assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true));
+			// Same sign, positive
+			assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false));
+			// Differing signs
+			assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true));
+
+			// Unsigned
+			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true));
+
+
+			if(!TestHLSL)
+				return true;
+
+
+
+
+
+
+
+
+
+			// ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ----------------------------------------------
 			auto shader = createShader("app_resources/shader.hlsl");
 
 			// Create massive upload/download buffers
 			constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23;
-			constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23;
 
-			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize);
 			if (!m_utils)
 				return logFail("Failed to create Utilities!");
-			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
 			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
-			m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
 			m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
 
 			// Create device-local buffer
@@ -109,40 +273,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			// Just need a single suballocation in this example
 			const uint32_t AllocationCount = 1;
 
-			// It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value
-			// this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args.
-			auto inputOffset = m_upStreamingBuffer->invalid_value;
-
 			// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
 			// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
 			std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
-			// note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
-			m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
-
-			// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
-			{
-				auto* const inputPtr = reinterpret_cast<unsigned_scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
-				for (auto j = 0; j < bufferSize; j++)
-				{
-					unsigned_scalar_t x = j > 0 ? 0.f : 2.f;
-					unsigned_scalar_t y = 0;
-
-					/*
-					unsigned_scalar_t x = 1.f;
-					unsigned_scalar_t y = 0.f;
-					*/
-
-					inputPtr[2 * j] = x;
-					inputPtr[2 * j + 1] = y;
-				}
-				// Always remember to flush!
-				if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
-				{
-					const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
-					const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
-					m_device->flushMappedMemoryRanges(1, &range);
-				}
-			}
 
 			// finally allocate our output range
 			const uint32_t outputSize = inputSize;
@@ -161,11 +294,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 				cmdbuf->bindComputePipeline(m_pipeline.get());
 				// This is the new fun part, pushing constants
 				const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress };
-				IGPUCommandBuffer::SBufferCopy copyInfo = {};
-				copyInfo.srcOffset = 0;
-				copyInfo.dstOffset = 0;
-				copyInfo.size = m_deviceLocalBuffer->getSize();
-				cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo);
 				cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
 				// Remember we do a single workgroup per 1D array in these parts
 				cmdbuf->dispatch(1, 1, 1);
@@ -184,6 +312,11 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 				barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
 
 				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+
+				IGPUCommandBuffer::SBufferCopy copyInfo = {};
+				copyInfo.srcOffset = 0;
+				copyInfo.dstOffset = 0;
+				copyInfo.size = m_deviceLocalBuffer->getSize();
 				cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
 				cmdbuf->end();
 			}
@@ -215,10 +348,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			// We let all latches know what semaphore and counter value has to be passed for the functors to execute
 			const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue };
 
-			// As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
-			// You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation.
-			m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
-
 			// Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
 			// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
 			// Its nice because it will also remember to invalidate our memory mapping if its not coherent.
@@ -249,15 +378,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			// We put a function we want to execute 
 			m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
 
-			// ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------
-			const auto masksArray = hlsl::morton::impl::decode_masks_array<uint32_t, 3>::Masks;
-			for (auto i = 0u; i < 3; i++)
-			{
-				std::cout << std::bitset<32>(masksArray[i]) << std::endl;
-			}
-
-			const auto someCode = hlsl::morton::code<uint32_t, 4>::create(hlsl::vector<uint32_t, 4>(1, 1, 1, 1));
-
 			return true;
 		}
 
@@ -272,7 +392,10 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 		{
 			// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
 			// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
-			while (m_downStreamingBuffer->cull_frees()) {}
+			if (TestHLSL)
+			{
+				while (m_downStreamingBuffer->cull_frees()) {}
+			}
 			return device_base_t::onAppTerminated();
 		}
 
@@ -281,19 +404,15 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 
 		smart_refctd_ptr<nbl::video::IUtilities> m_utils;
 
-		nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
 		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
 		smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
 
 		// These are Buffer Device Addresses
-		uint64_t m_upStreamingBufferAddress;
 		uint64_t m_downStreamingBufferAddress;
 		uint64_t m_deviceLocalBufferAddress;
 
-		// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!)
 		uint32_t m_alignment;
 
-		// This example really lets the advantages of a timeline semaphore shine through!
 		smart_refctd_ptr<ISemaphore> m_timeline;
 		uint64_t semaphorValue = 0;
 };

From ea42d5bf287cbff376809be65f64c71567e0134f Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 1 Apr 2025 15:44:55 -0300
Subject: [PATCH 04/57] Rename example

---
 {XX_Mortons => 12_Mortons}/CMakeLists.txt     |  0
 12_Mortons/app_resources/common.hlsl          | 13 ++++++++++++
 .../app_resources/shader.hlsl                 |  8 ++++---
 .../config.json.template                      |  0
 {XX_Mortons => 12_Mortons}/main.cpp           | 21 ++++++++-----------
 {XX_Mortons => 12_Mortons}/pipeline.groovy    |  0
 CMakeLists.txt                                |  2 +-
 XX_Mortons/app_resources/common.hlsl          | 10 ---------
 8 files changed, 28 insertions(+), 26 deletions(-)
 rename {XX_Mortons => 12_Mortons}/CMakeLists.txt (100%)
 create mode 100644 12_Mortons/app_resources/common.hlsl
 rename {XX_Mortons => 12_Mortons}/app_resources/shader.hlsl (79%)
 rename {XX_Mortons => 12_Mortons}/config.json.template (100%)
 rename {XX_Mortons => 12_Mortons}/main.cpp (97%)
 rename {XX_Mortons => 12_Mortons}/pipeline.groovy (100%)
 delete mode 100644 XX_Mortons/app_resources/common.hlsl

diff --git a/XX_Mortons/CMakeLists.txt b/12_Mortons/CMakeLists.txt
similarity index 100%
rename from XX_Mortons/CMakeLists.txt
rename to 12_Mortons/CMakeLists.txt
diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl
new file mode 100644
index 000000000..bd5184f80
--- /dev/null
+++ b/12_Mortons/app_resources/common.hlsl
@@ -0,0 +1,13 @@
+//#include "nbl/builtin/hlsl/morton.hlsl"
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+NBL_CONSTEXPR uint32_t bufferSize = 256;
+
+// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations
+//using morton_t2 = nbl::hlsl::morton::code<true, 8, 2>; // Fits in an int16_t
+using vector_t2 = nbl::hlsl::vector<int16_t, 3>;
+
+struct PushConstantData
+{
+	uint64_t deviceBufferAddress;
+};
\ No newline at end of file
diff --git a/XX_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl
similarity index 79%
rename from XX_Mortons/app_resources/shader.hlsl
rename to 12_Mortons/app_resources/shader.hlsl
index d1f7c967e..e7f570eee 100644
--- a/XX_Mortons/app_resources/shader.hlsl
+++ b/12_Mortons/app_resources/shader.hlsl
@@ -3,14 +3,16 @@
 
 [[vk::push_constant]] PushConstantData pushConstants;
 
-using namespace nbl::hlsl;
-
 [numthreads(bufferSize, 1, 1)]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
+	/*
 	LegacyBdaAccessor<unsigned_scalar_t> accessor = LegacyBdaAccessor<unsigned_scalar_t>::create(pushConstants.deviceBufferAddress);
 	
 	morton::code<int32_t, 2> foo = morton::code<int32_t, 2>::create(vector<int32_t, 2>(-32768, -1));
 
-	accessor.set(0, foo.value);
+	//accessor.set(0, foo.value);
+	*/
+	uint32_t bar = _static_cast<uint32_t>(0xCAFEDEADDEADBEEF);
+	accessor.set(0, bar);
 }
\ No newline at end of file
diff --git a/XX_Mortons/config.json.template b/12_Mortons/config.json.template
similarity index 100%
rename from XX_Mortons/config.json.template
rename to 12_Mortons/config.json.template
diff --git a/XX_Mortons/main.cpp b/12_Mortons/main.cpp
similarity index 97%
rename from XX_Mortons/main.cpp
rename to 12_Mortons/main.cpp
index b20662904..d1fddba7a 100644
--- a/XX_Mortons/main.cpp
+++ b/12_Mortons/main.cpp
@@ -25,12 +25,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 		using device_base_t = application_templates::MonoDeviceApplication;
 		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
 
-		using morton_t = nbl::hlsl::morton::code<int32_t, 3>;
-		using vector_t = nbl::hlsl::vector<int32_t, 3>;
-		using unsigned_morton_t = nbl::hlsl::morton::code<uint32_t, 3>;
-		using unsigned_vector_t = nbl::hlsl::vector<uint32_t, 3>;
-		using bool_vector_t = nbl::hlsl::vector<bool, 3>;
-
 		inline core::smart_refctd_ptr<video::IGPUShader> createShader(
 			const char* includeMainName)
 		{
@@ -52,6 +46,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
 
+			/*
+
 			// ----------------------------------------------- CPP TESTS ----------------------------------------------------------------------
 			
 			// Coordinate extraction and whole vector decode tests
@@ -201,7 +197,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			if(!TestHLSL)
 				return true;
 
-
+			*/
 
 
 
@@ -213,7 +209,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			auto shader = createShader("app_resources/shader.hlsl");
 
 			// Create massive upload/download buffers
-			constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23;
+			constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23;
 
 			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize);
 			if (!m_utils)
@@ -230,7 +226,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 
 				deviceLocalBufferParams.queueFamilyIndexCount = 1;
 				deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
-				deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize;
+				deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize;
 				deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
 
 				m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
@@ -268,7 +264,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 
 			IQueue* const queue = getComputeQueue();
 
-			const uint32_t inputSize = sizeof(unsigned_scalar_t) * bufferSize;
+			const uint32_t inputSize = sizeof(uint32_t) * bufferSize;
 
 			// Just need a single suballocation in this example
 			const uint32_t AllocationCount = 1;
@@ -361,8 +357,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 					assert(dstOffset == 0 && size == outputSize);
 
 					std::cout << "Begin array GPU\n";
-					unsigned_scalar_t* const data = reinterpret_cast<unsigned_scalar_t*>(const_cast<void*>(bufSrc));
-					std::cout << std::bitset<32>(data[0]) << "\n";
+					uint32_t* const data = reinterpret_cast<uint32_t*>(const_cast<void*>(bufSrc));
+					//std::cout << std::bitset<32>(data[0]) << "\n";
+					std::cout << data[0] << "\n";
 					/*
 					for (auto i = 0u; i < bufferSize; i++) {
 						std::cout << std::bitset<32>(data[i]) << "\n";
diff --git a/XX_Mortons/pipeline.groovy b/12_Mortons/pipeline.groovy
similarity index 100%
rename from XX_Mortons/pipeline.groovy
rename to 12_Mortons/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7fcddfc18..5d0c148cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL)
 	# showcase use of FFT for post-FX Bloom  effect
 	add_subdirectory(11_FFT EXCLUDE_FROM_ALL)
+	add_subdirectory(12_Mortons EXCLUDE_FROM_ALL)
 
 
 	# Waiting for a refactor
@@ -96,7 +97,6 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL)
 
   	add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL)
-	add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL)
 
 	NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}")
 endif()
diff --git a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl
deleted file mode 100644
index 3a9fca3fa..000000000
--- a/XX_Mortons/app_resources/common.hlsl
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "nbl/builtin/hlsl/math/morton.hlsl"
-
-NBL_CONSTEXPR uint32_t bufferSize = 256;
-using scalar_t = int32_t;
-using unsigned_scalar_t = nbl::hlsl::make_unsigned_t<scalar_t>;
-
-struct PushConstantData
-{
-	uint64_t deviceBufferAddress;
-};
\ No newline at end of file

From 2ba08a4a39bf15b3c689666012b263794b8371f2 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 1 Apr 2025 17:43:20 -0300
Subject: [PATCH 05/57] Add tests for AddCarry and SUbBorrow intrinsics

---
 22_CppCompat/CIntrinsicsTester.h       |  13 +
 22_CppCompat/app_resources/common.hlsl | 859 +++++++++++++------------
 2 files changed, 451 insertions(+), 421 deletions(-)

diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h
index 77aa2c1ca..5fe7bc08e 100644
--- a/22_CppCompat/CIntrinsicsTester.h
+++ b/22_CppCompat/CIntrinsicsTester.h
@@ -85,6 +85,10 @@ class CIntrinsicsTester final : public ITester
             testInput.smoothStepEdge0 = realDistributionNeg(mt);
             testInput.smoothStepEdge1 = realDistributionPos(mt);
             testInput.smoothStepX = realDistribution(mt);
+            testInput.addCarryA = std::numeric_limits<uint32_t>::max() - uintDistribution(mt);
+            testInput.addCarryB = uintDistribution(mt);
+            testInput.subBorrowA = uintDistribution(mt);
+            testInput.subBorrowB = uintDistribution(mt);
 
             testInput.bitCountVec = int32_t3(intDistribution(mt), intDistribution(mt), intDistribution(mt));
             testInput.clampValVec = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt));
@@ -119,6 +123,10 @@ class CIntrinsicsTester final : public ITester
             testInput.refractI = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt));
             testInput.refractN = glm::normalize(float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt)));
             testInput.refractEta = realDistribution(mt);
+            testInput.addCarryAVec = uint32_t3(std::numeric_limits<uint32_t>::max() - uintDistribution(mt), std::numeric_limits<uint32_t>::max() - uintDistribution(mt), std::numeric_limits<uint32_t>::max() - uintDistribution(mt));
+            testInput.addCarryBVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt));
+            testInput.subBorrowAVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt));
+            testInput.subBorrowBVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt));
 
             // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
             IntrinsicsTestValues expected;
@@ -188,6 +196,11 @@ class CIntrinsicsTester final : public ITester
             auto inverseGlm = glm::inverse(reinterpret_cast<typename float32_t3x3::Base const&>(testInput.inverse));
             expected.inverse = reinterpret_cast<float32_t3x3&>(inverseGlm);
 
+            expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry);
+            expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow);
+            expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry);
+            expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow);
+
             performCpuTests(testInput, expected);
             performGpuTests(testInput, expected);
         }
diff --git a/22_CppCompat/app_resources/common.hlsl b/22_CppCompat/app_resources/common.hlsl
index e2303a2fc..dc3ff5fcd 100644
--- a/22_CppCompat/app_resources/common.hlsl
+++ b/22_CppCompat/app_resources/common.hlsl
@@ -1,74 +1,74 @@
-//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
-//// This file is part of the "Nabla Engine".
-//// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_
-#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_
-
-// because DXC doesn't properly support `_Static_assert`
-// TODO: add a message, and move to macros.h or cpp_compat
-#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; }
-
-#include <boost/preprocessor.hpp>
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/type_traits.hlsl>
-
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
-
-#include <nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl>
-#include <nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl>
-#include <nbl/builtin/hlsl/colorspace/EOTF.hlsl>
-#include <nbl/builtin/hlsl/colorspace/OETF.hlsl>
-
-#include <nbl/builtin/hlsl/random/xoroshiro.hlsl>
-
-#include <nbl/builtin/hlsl/mpl.hlsl>
-#include <nbl/builtin/hlsl/bit.hlsl>
-
-#include <nbl/builtin/hlsl/limits.hlsl>
-
-
-#include <nbl/builtin/hlsl/barycentric/utils.hlsl>
-#include <nbl/builtin/hlsl/member_test_macros.hlsl>
-#include <nbl/builtin/hlsl/device_capabilities_traits.hlsl>
-
-#include <nbl/builtin/hlsl/tgmath.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
-
-// tgmath.hlsl and intrinsics.hlsl tests
-
-using namespace nbl::hlsl;
-struct TgmathIntputTestValues
-{
-	float floor;
-	float isnan;
-	float isinf;
-	float powX;
-	float powY;
-	float exp;
-	float exp2;
-	float log;
-	float log2;
-	float absF;
-	int absI;
-	float sqrt;
-	float sin;
-	float cos;
-	float acos;
-	float modf;
-	float round;
-	float roundEven;
-	float trunc;
-	float ceil;
-	float fmaX;
-	float fmaY;
-	float fmaZ;
-	float ldexpArg;
-	int ldexpExp;
-	float modfStruct;
-	float frexpStruct;
+//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_
+
+// because DXC doesn't properly support `_Static_assert`
+// TODO: add a message, and move to macros.h or cpp_compat
+#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; }
+
+#include <boost/preprocessor.hpp>
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
+
+#include <nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl>
+#include <nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl>
+#include <nbl/builtin/hlsl/colorspace/EOTF.hlsl>
+#include <nbl/builtin/hlsl/colorspace/OETF.hlsl>
+
+#include <nbl/builtin/hlsl/random/xoroshiro.hlsl>
+
+#include <nbl/builtin/hlsl/mpl.hlsl>
+#include <nbl/builtin/hlsl/bit.hlsl>
+
+#include <nbl/builtin/hlsl/limits.hlsl>
+
+
+#include <nbl/builtin/hlsl/barycentric/utils.hlsl>
+#include <nbl/builtin/hlsl/member_test_macros.hlsl>
+#include <nbl/builtin/hlsl/device_capabilities_traits.hlsl>
+
+#include <nbl/builtin/hlsl/tgmath.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
+
+// tgmath.hlsl and intrinsics.hlsl tests
+
+using namespace nbl::hlsl;
+struct TgmathIntputTestValues
+{
+	float floor;
+	float isnan;
+	float isinf;
+	float powX;
+	float powY;
+	float exp;
+	float exp2;
+	float log;
+	float log2;
+	float absF;
+	int absI;
+	float sqrt;
+	float sin;
+	float cos;
+	float acos;
+	float modf;
+	float round;
+	float roundEven;
+	float trunc;
+	float ceil;
+	float fmaX;
+	float fmaY;
+	float fmaZ;
+	float ldexpArg;
+	int ldexpExp;
+	float modfStruct;
+	float frexpStruct;
 	float tan;
 	float asin;
 	float atan;
@@ -78,38 +78,38 @@ struct TgmathIntputTestValues
 	float asinh;
 	float acosh;
 	float atanh;
-	float atan2X;
-	float atan2Y;
-	float erf;
-	float erfInv;
-
-	float32_t3 floorVec;
-	float32_t3 isnanVec;
-	float32_t3 isinfVec;
-	float32_t3 powXVec;
-	float32_t3 powYVec;
-	float32_t3 expVec;
-	float32_t3 exp2Vec;
-	float32_t3 logVec;
-	float32_t3 log2Vec;
-	float32_t3 absFVec;
-	int32_t3 absIVec;
-	float32_t3 sqrtVec;
-	float32_t3 sinVec;
-	float32_t3 cosVec;
-	float32_t3 acosVec;
-	float32_t3 modfVec;
-	float32_t3 roundVec;
-	float32_t3 roundEvenVec;
-	float32_t3 truncVec;
-	float32_t3 ceilVec;
-	float32_t3 fmaXVec;
-	float32_t3 fmaYVec;
-	float32_t3 fmaZVec;
-	float32_t3 ldexpArgVec;
-	int32_t3 ldexpExpVec;
-	float32_t3 modfStructVec;
-	float32_t3 frexpStructVec;
+	float atan2X;
+	float atan2Y;
+	float erf;
+	float erfInv;
+
+	float32_t3 floorVec;
+	float32_t3 isnanVec;
+	float32_t3 isinfVec;
+	float32_t3 powXVec;
+	float32_t3 powYVec;
+	float32_t3 expVec;
+	float32_t3 exp2Vec;
+	float32_t3 logVec;
+	float32_t3 log2Vec;
+	float32_t3 absFVec;
+	int32_t3 absIVec;
+	float32_t3 sqrtVec;
+	float32_t3 sinVec;
+	float32_t3 cosVec;
+	float32_t3 acosVec;
+	float32_t3 modfVec;
+	float32_t3 roundVec;
+	float32_t3 roundEvenVec;
+	float32_t3 truncVec;
+	float32_t3 ceilVec;
+	float32_t3 fmaXVec;
+	float32_t3 fmaYVec;
+	float32_t3 fmaZVec;
+	float32_t3 ldexpArgVec;
+	int32_t3 ldexpExpVec;
+	float32_t3 modfStructVec;
+	float32_t3 frexpStructVec;
 	float32_t3 tanVec;
 	float32_t3 asinVec;
 	float32_t3 atanVec;
@@ -119,35 +119,35 @@ struct TgmathIntputTestValues
 	float32_t3 asinhVec;
 	float32_t3 acoshVec;
 	float32_t3 atanhVec;
-	float32_t3 atan2XVec;
-	float32_t3 atan2YVec;
-	float32_t3 erfVec;
-	float32_t3 erfInvVec;
-};
-
-struct TgmathTestValues
-{
-	float floor;
-	int isnan;
-	int isinf;
-	float pow;
-	float exp;
-	float exp2;
-	float log;
-	float log2;
-	float absF;
-	int absI;
-	float sqrt;
-	float sin;
-	float cos;
-	float acos;
-	float modf;
-	float round;
-	float roundEven;
-	float trunc;
-	float ceil;
-	float fma;
-	float ldexp;
+	float32_t3 atan2XVec;
+	float32_t3 atan2YVec;
+	float32_t3 erfVec;
+	float32_t3 erfInvVec;
+};
+
+struct TgmathTestValues
+{
+	float floor;
+	int isnan;
+	int isinf;
+	float pow;
+	float exp;
+	float exp2;
+	float log;
+	float log2;
+	float absF;
+	int absI;
+	float sqrt;
+	float sin;
+	float cos;
+	float acos;
+	float modf;
+	float round;
+	float roundEven;
+	float trunc;
+	float ceil;
+	float fma;
+	float ldexp;
 	float tan;
 	float asin;
 	float atan;
@@ -157,40 +157,40 @@ struct TgmathTestValues
 	float asinh;
 	float acosh;
 	float atanh;
-	float atan2;
-	float erf;
-	float erfInv;
-
-	float32_t3 floorVec;
-
-	// we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below 
-	// and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035
-#ifndef __HLSL_VERSION
-	nbl::hlsl::vector<int, 3> isnanVec;
-	nbl::hlsl::vector<int, 3> isinfVec;
-#else
-	vector<int, 3> isnanVec;
-	vector<int, 3> isinfVec;
-#endif
-	
-	float32_t3 powVec;
-	float32_t3 expVec;
-	float32_t3 exp2Vec;
-	float32_t3 logVec;
-	float32_t3 log2Vec;
-	float32_t3 absFVec;
-	int32_t3 absIVec;
-	float32_t3 sqrtVec;
-	float32_t3 cosVec;
-	float32_t3 sinVec;
-	float32_t3 acosVec;
-	float32_t3 modfVec;
-	float32_t3 roundVec;
-	float32_t3 roundEvenVec;
-	float32_t3 truncVec;
-	float32_t3 ceilVec;
-	float32_t3 fmaVec;
-	float32_t3 ldexpVec;
+	float atan2;
+	float erf;
+	float erfInv;
+
+	float32_t3 floorVec;
+
+	// we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below 
+	// and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035
+#ifndef __HLSL_VERSION
+	nbl::hlsl::vector<int, 3> isnanVec;
+	nbl::hlsl::vector<int, 3> isinfVec;
+#else
+	vector<int, 3> isnanVec;
+	vector<int, 3> isinfVec;
+#endif
+	
+	float32_t3 powVec;
+	float32_t3 expVec;
+	float32_t3 exp2Vec;
+	float32_t3 logVec;
+	float32_t3 log2Vec;
+	float32_t3 absFVec;
+	int32_t3 absIVec;
+	float32_t3 sqrtVec;
+	float32_t3 cosVec;
+	float32_t3 sinVec;
+	float32_t3 acosVec;
+	float32_t3 modfVec;
+	float32_t3 roundVec;
+	float32_t3 roundEvenVec;
+	float32_t3 truncVec;
+	float32_t3 ceilVec;
+	float32_t3 fmaVec;
+	float32_t3 ldexpVec;
 	float32_t3 tanVec;
 	float32_t3 asinVec;
 	float32_t3 atanVec;
@@ -200,258 +200,275 @@ struct TgmathTestValues
 	float32_t3 asinhVec;
 	float32_t3 acoshVec;
 	float32_t3 atanhVec;
-	float32_t3 atan2Vec;
-	float32_t3 erfVec;
-	float32_t3 erfInvVec;
-
-	ModfOutput<float> modfStruct;
-	ModfOutput<float32_t3> modfStructVec;
-	FrexpOutput<float> frexpStruct;
-	FrexpOutput<float32_t3> frexpStructVec;
-
-	void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input)
-	{
-		floor = nbl::hlsl::floor(input.floor);
-		isnan = nbl::hlsl::isnan(input.isnan);
-		isinf = nbl::hlsl::isinf(input.isinf);
-		pow = nbl::hlsl::pow(input.powX, input.powY);
-		exp = nbl::hlsl::exp(input.exp);
-		exp2 = nbl::hlsl::exp2(input.exp2);
-		log = nbl::hlsl::log(input.log);
-		log2 = nbl::hlsl::log2(input.log2);
-		absF = nbl::hlsl::abs(input.absF);
-		absI = nbl::hlsl::abs(input.absI);
-		sqrt = nbl::hlsl::sqrt(input.sqrt);
-		sin = nbl::hlsl::sin(input.sin);
-		cos = nbl::hlsl::cos(input.cos);
-		tan = nbl::hlsl::tan(input.tan);
-		asin = nbl::hlsl::asin(input.asin);
-		atan = nbl::hlsl::atan(input.atan);
-		sinh = nbl::hlsl::sinh(input.sinh);
-		cosh = nbl::hlsl::cosh(input.cosh);
-		tanh = nbl::hlsl::tanh(input.tanh);
-		asinh = nbl::hlsl::asinh(input.asinh);
-		acosh = nbl::hlsl::acosh(input.acosh);
-		atanh = nbl::hlsl::atanh(input.atanh);
-		atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X);
-		erf = nbl::hlsl::erf(input.erf);
-		erfInv = nbl::hlsl::erfInv(input.erfInv);
-		acos = nbl::hlsl::acos(input.acos);
-		modf = nbl::hlsl::modf(input.modf);
-		round = nbl::hlsl::round(input.round);
-		roundEven = nbl::hlsl::roundEven(input.roundEven);
-		trunc = nbl::hlsl::trunc(input.trunc);
-		ceil = nbl::hlsl::ceil(input.ceil);
-		fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ);
-		ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp);
-
-		floorVec = nbl::hlsl::floor(input.floorVec);
-		isnanVec = nbl::hlsl::isnan(input.isnanVec);
-		isinfVec = nbl::hlsl::isinf(input.isinfVec);
-		powVec = nbl::hlsl::pow(input.powXVec, input.powYVec);
-		expVec = nbl::hlsl::exp(input.expVec);
-		exp2Vec = nbl::hlsl::exp2(input.exp2Vec);
-		logVec = nbl::hlsl::log(input.logVec);
-		log2Vec = nbl::hlsl::log2(input.log2Vec);
-		absFVec = nbl::hlsl::abs(input.absFVec);
-		absIVec = nbl::hlsl::abs(input.absIVec);
-		sqrtVec = nbl::hlsl::sqrt(input.sqrtVec);
-		sinVec = nbl::hlsl::sin(input.sinVec);
-		cosVec = nbl::hlsl::cos(input.cosVec);
-		tanVec = nbl::hlsl::tan(input.tanVec);
-		asinVec = nbl::hlsl::asin(input.asinVec);
-		atanVec = nbl::hlsl::atan(input.atanVec);
-		sinhVec = nbl::hlsl::sinh(input.sinhVec);
-		coshVec = nbl::hlsl::cosh(input.coshVec);
-		tanhVec = nbl::hlsl::tanh(input.tanhVec);
-		asinhVec = nbl::hlsl::asinh(input.asinhVec);
-		acoshVec = nbl::hlsl::acosh(input.acoshVec);
-		atanhVec = nbl::hlsl::atanh(input.atanhVec);
-		atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec);
-		acosVec = nbl::hlsl::acos(input.acosVec);
-		modfVec = nbl::hlsl::modf(input.modfVec);
-		roundVec = nbl::hlsl::round(input.roundVec);
-		roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec);
-		truncVec = nbl::hlsl::trunc(input.truncVec);
-		ceilVec = nbl::hlsl::ceil(input.ceilVec);
-		fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec);
-		ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec);
-		erfVec = nbl::hlsl::erf(input.erfVec);
-		erfInvVec = nbl::hlsl::erfInv(input.erfInvVec);
-
-		modfStruct = nbl::hlsl::modfStruct(input.modfStruct);
-		modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec);
-		frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct);
-		frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec);
-	}
-};
-
-struct IntrinsicsIntputTestValues
-{
-	int bitCount;
-	float32_t3 crossLhs;
-	float32_t3 crossRhs;
-	float clampVal;
-	float clampMin;
-	float clampMax;
-	float32_t3 length;
-	float32_t3 normalize;
-	float32_t3 dotLhs;
-	float32_t3 dotRhs;
-	float32_t3x3 determinant;
-	uint32_t findMSB;
-	uint32_t findLSB;
-	float32_t3x3 inverse;
-	float32_t3x3 transpose;
-	float32_t3x3 mulLhs;
-	float32_t3x3 mulRhs;
-	float minA;
-	float minB;
-	float maxA;
-	float maxB;
-	float rsqrt;
-	uint32_t bitReverse;
-	float frac;
-	float mixX;
-	float mixY;
-	float mixA;
-	float sign;
-	float radians;
-	float degrees;
-	float stepEdge;
-	float stepX;
-	float smoothStepEdge0;
-	float smoothStepEdge1;
-	float smoothStepX;
-
-	int32_t3 bitCountVec;
-	float32_t3 clampValVec;
-	float32_t3 clampMinVec;
-	float32_t3 clampMaxVec;
-	uint32_t3 findMSBVec;
-	uint32_t3 findLSBVec;
-	float32_t3 minAVec;
-	float32_t3 minBVec;
-	float32_t3 maxAVec;
-	float32_t3 maxBVec;
-	float32_t3 rsqrtVec;
-	uint32_t3 bitReverseVec;
-	float32_t3 fracVec;
-	float32_t3 mixXVec;
-	float32_t3 mixYVec;
-	float32_t3 mixAVec;
-	float32_t3 signVec;
-	float32_t3 radiansVec;
-	float32_t3 degreesVec;
-	float32_t3 stepEdgeVec;
-	float32_t3 stepXVec;
-	float32_t3 smoothStepEdge0Vec;
-	float32_t3 smoothStepEdge1Vec;
-	float32_t3 smoothStepXVec;
-	float32_t3 faceForwardN;
-	float32_t3 faceForwardI;
-	float32_t3 faceForwardNref;
-	float32_t3 reflectI;
-	float32_t3 reflectN;
-	float32_t3 refractI;
-	float32_t3 refractN;
-	float refractEta;
-};
-
-struct IntrinsicsTestValues
-{
-	int bitCount;
-	float clamp;
-	float length;
-	float dot;
-	float determinant;
-	int findMSB;
-	int findLSB;
-	float min;
-	float max;
-	float rsqrt;
-	float frac;
-	uint32_t bitReverse;
-	float mix;
-	float sign;
-	float radians;
-	float degrees;
-	float step;
-	float smoothStep;
-
-	float32_t3 normalize;
-	float32_t3 cross;
-	int32_t3 bitCountVec;
-	float32_t3 clampVec;
-	uint32_t3 findMSBVec;
-	uint32_t3 findLSBVec;
-	float32_t3 minVec;
-	float32_t3 maxVec;
-	float32_t3 rsqrtVec;
-	uint32_t3 bitReverseVec;
-	float32_t3 fracVec;
-	float32_t3 mixVec;
-	float32_t3 signVec;
-	float32_t3 radiansVec;
-	float32_t3 degreesVec;
-	float32_t3 stepVec;
-	float32_t3 smoothStepVec;
-	float32_t3 faceForward;
-	float32_t3 reflect;
-	float32_t3 refract;
-
-	float32_t3x3 mul;
-	float32_t3x3 transpose;
-	float32_t3x3 inverse;
-
-	void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input)
-	{
-		bitCount = nbl::hlsl::bitCount(input.bitCount);
-		cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs);
-		clamp = nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax);
-		length = nbl::hlsl::length(input.length);
-		normalize = nbl::hlsl::normalize(input.normalize);
-		dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs);
-		determinant = nbl::hlsl::determinant(input.determinant);
-		findMSB = nbl::hlsl::findMSB(input.findMSB);
-		findLSB = nbl::hlsl::findLSB(input.findLSB);
-		inverse = nbl::hlsl::inverse(input.inverse);
-		transpose = nbl::hlsl::transpose(input.transpose);
-		mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs);
-		// TODO: fix min and max
-		min = nbl::hlsl::min(input.minA, input.minB);
-		max = nbl::hlsl::max(input.maxA, input.maxB);
-		rsqrt = nbl::hlsl::rsqrt(input.rsqrt);
-		bitReverse = nbl::hlsl::bitReverse(input.bitReverse);
-		frac = nbl::hlsl::fract(input.frac);
-		mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA);
-		sign = nbl::hlsl::sign(input.sign);
-		radians = nbl::hlsl::radians(input.radians);
-		degrees = nbl::hlsl::degrees(input.degrees);
-		step = nbl::hlsl::step(input.stepEdge, input.stepX);
-		smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX);
-
-		bitCountVec = nbl::hlsl::bitCount(input.bitCountVec);
-		clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec);
-		findMSBVec = nbl::hlsl::findMSB(input.findMSBVec);
-		findLSBVec = nbl::hlsl::findLSB(input.findLSBVec);
-		// TODO: fix min and max
-		minVec = nbl::hlsl::min(input.minAVec, input.minBVec);
-		maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec);
-		rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec);
-		bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec);
-		fracVec = nbl::hlsl::fract(input.fracVec);
-		mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec);
-		
-		signVec = nbl::hlsl::sign(input.signVec);
-		radiansVec = nbl::hlsl::radians(input.radiansVec);
-		degreesVec = nbl::hlsl::degrees(input.degreesVec);
-		stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec);
-		smoothStepVec = nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec);
-		faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref);
-		reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN);
-		refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta);
-	}
-};
-
-#endif
+	float32_t3 atan2Vec;
+	float32_t3 erfVec;
+	float32_t3 erfInvVec;
+
+	ModfOutput<float> modfStruct;
+	ModfOutput<float32_t3> modfStructVec;
+	FrexpOutput<float> frexpStruct;
+	FrexpOutput<float32_t3> frexpStructVec;
+
+	void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input)
+	{
+		floor = nbl::hlsl::floor(input.floor);
+		isnan = nbl::hlsl::isnan(input.isnan);
+		isinf = nbl::hlsl::isinf(input.isinf);
+		pow = nbl::hlsl::pow(input.powX, input.powY);
+		exp = nbl::hlsl::exp(input.exp);
+		exp2 = nbl::hlsl::exp2(input.exp2);
+		log = nbl::hlsl::log(input.log);
+		log2 = nbl::hlsl::log2(input.log2);
+		absF = nbl::hlsl::abs(input.absF);
+		absI = nbl::hlsl::abs(input.absI);
+		sqrt = nbl::hlsl::sqrt(input.sqrt);
+		sin = nbl::hlsl::sin(input.sin);
+		cos = nbl::hlsl::cos(input.cos);
+		tan = nbl::hlsl::tan(input.tan);
+		asin = nbl::hlsl::asin(input.asin);
+		atan = nbl::hlsl::atan(input.atan);
+		sinh = nbl::hlsl::sinh(input.sinh);
+		cosh = nbl::hlsl::cosh(input.cosh);
+		tanh = nbl::hlsl::tanh(input.tanh);
+		asinh = nbl::hlsl::asinh(input.asinh);
+		acosh = nbl::hlsl::acosh(input.acosh);
+		atanh = nbl::hlsl::atanh(input.atanh);
+		atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X);
+		erf = nbl::hlsl::erf(input.erf);
+		erfInv = nbl::hlsl::erfInv(input.erfInv);
+		acos = nbl::hlsl::acos(input.acos);
+		modf = nbl::hlsl::modf(input.modf);
+		round = nbl::hlsl::round(input.round);
+		roundEven = nbl::hlsl::roundEven(input.roundEven);
+		trunc = nbl::hlsl::trunc(input.trunc);
+		ceil = nbl::hlsl::ceil(input.ceil);
+		fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ);
+		ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp);
+
+		floorVec = nbl::hlsl::floor(input.floorVec);
+		isnanVec = nbl::hlsl::isnan(input.isnanVec);
+		isinfVec = nbl::hlsl::isinf(input.isinfVec);
+		powVec = nbl::hlsl::pow(input.powXVec, input.powYVec);
+		expVec = nbl::hlsl::exp(input.expVec);
+		exp2Vec = nbl::hlsl::exp2(input.exp2Vec);
+		logVec = nbl::hlsl::log(input.logVec);
+		log2Vec = nbl::hlsl::log2(input.log2Vec);
+		absFVec = nbl::hlsl::abs(input.absFVec);
+		absIVec = nbl::hlsl::abs(input.absIVec);
+		sqrtVec = nbl::hlsl::sqrt(input.sqrtVec);
+		sinVec = nbl::hlsl::sin(input.sinVec);
+		cosVec = nbl::hlsl::cos(input.cosVec);
+		tanVec = nbl::hlsl::tan(input.tanVec);
+		asinVec = nbl::hlsl::asin(input.asinVec);
+		atanVec = nbl::hlsl::atan(input.atanVec);
+		sinhVec = nbl::hlsl::sinh(input.sinhVec);
+		coshVec = nbl::hlsl::cosh(input.coshVec);
+		tanhVec = nbl::hlsl::tanh(input.tanhVec);
+		asinhVec = nbl::hlsl::asinh(input.asinhVec);
+		acoshVec = nbl::hlsl::acosh(input.acoshVec);
+		atanhVec = nbl::hlsl::atanh(input.atanhVec);
+		atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec);
+		acosVec = nbl::hlsl::acos(input.acosVec);
+		modfVec = nbl::hlsl::modf(input.modfVec);
+		roundVec = nbl::hlsl::round(input.roundVec);
+		roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec);
+		truncVec = nbl::hlsl::trunc(input.truncVec);
+		ceilVec = nbl::hlsl::ceil(input.ceilVec);
+		fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec);
+		ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec);
+		erfVec = nbl::hlsl::erf(input.erfVec);
+		erfInvVec = nbl::hlsl::erfInv(input.erfInvVec);
+
+		modfStruct = nbl::hlsl::modfStruct(input.modfStruct);
+		modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec);
+		frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct);
+		frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec);
+	}
+};
+
+struct IntrinsicsIntputTestValues
+{
+	int bitCount;
+	float32_t3 crossLhs;
+	float32_t3 crossRhs;
+	float clampVal;
+	float clampMin;
+	float clampMax;
+	float32_t3 length;
+	float32_t3 normalize;
+	float32_t3 dotLhs;
+	float32_t3 dotRhs;
+	float32_t3x3 determinant;
+	uint32_t findMSB;
+	uint32_t findLSB;
+	float32_t3x3 inverse;
+	float32_t3x3 transpose;
+	float32_t3x3 mulLhs;
+	float32_t3x3 mulRhs;
+	float minA;
+	float minB;
+	float maxA;
+	float maxB;
+	float rsqrt;
+	uint32_t bitReverse;
+	float frac;
+	float mixX;
+	float mixY;
+	float mixA;
+	float sign;
+	float radians;
+	float degrees;
+	float stepEdge;
+	float stepX;
+	float smoothStepEdge0;
+	float smoothStepEdge1;
+	float smoothStepX;
+	uint32_t addCarryA;
+	uint32_t addCarryB;
+	uint32_t subBorrowA;
+	uint32_t subBorrowB;
+
+	int32_t3 bitCountVec;
+	float32_t3 clampValVec;
+	float32_t3 clampMinVec;
+	float32_t3 clampMaxVec;
+	uint32_t3 findMSBVec;
+	uint32_t3 findLSBVec;
+	float32_t3 minAVec;
+	float32_t3 minBVec;
+	float32_t3 maxAVec;
+	float32_t3 maxBVec;
+	float32_t3 rsqrtVec;
+	uint32_t3 bitReverseVec;
+	float32_t3 fracVec;
+	float32_t3 mixXVec;
+	float32_t3 mixYVec;
+	float32_t3 mixAVec;
+	float32_t3 signVec;
+	float32_t3 radiansVec;
+	float32_t3 degreesVec;
+	float32_t3 stepEdgeVec;
+	float32_t3 stepXVec;
+	float32_t3 smoothStepEdge0Vec;
+	float32_t3 smoothStepEdge1Vec;
+	float32_t3 smoothStepXVec;
+	float32_t3 faceForwardN;
+	float32_t3 faceForwardI;
+	float32_t3 faceForwardNref;
+	float32_t3 reflectI;
+	float32_t3 reflectN;
+	float32_t3 refractI;
+	float32_t3 refractN;
+	float refractEta;
+	uint32_t3 addCarryAVec;
+	uint32_t3 addCarryBVec;
+	uint32_t3 subBorrowAVec;
+	uint32_t3 subBorrowBVec;
+};
+
+struct IntrinsicsTestValues
+{
+	int bitCount;
+	float clamp;
+	float length;
+	float dot;
+	float determinant;
+	int findMSB;
+	int findLSB;
+	float min;
+	float max;
+	float rsqrt;
+	float frac;
+	uint32_t bitReverse;
+	float mix;
+	float sign;
+	float radians;
+	float degrees;
+	float step;
+	float smoothStep;
+
+	float32_t3 normalize;
+	float32_t3 cross;
+	int32_t3 bitCountVec;
+	float32_t3 clampVec;
+	uint32_t3 findMSBVec;
+	uint32_t3 findLSBVec;
+	float32_t3 minVec;
+	float32_t3 maxVec;
+	float32_t3 rsqrtVec;
+	uint32_t3 bitReverseVec;
+	float32_t3 fracVec;
+	float32_t3 mixVec;
+	float32_t3 signVec;
+	float32_t3 radiansVec;
+	float32_t3 degreesVec;
+	float32_t3 stepVec;
+	float32_t3 smoothStepVec;
+	float32_t3 faceForward;
+	float32_t3 reflect;
+	float32_t3 refract;
+
+	float32_t3x3 mul;
+	float32_t3x3 transpose;
+	float32_t3x3 inverse;
+
+	spirv::AddCarryOutput<uint32_t> addCarry;
+	spirv::SubBorrowOutput<uint32_t> subBorrow;
+	spirv::AddCarryOutput<uint32_t3> addCarryVec;
+	spirv::SubBorrowOutput<uint32_t3> subBorrowVec;
+
+	void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input)
+	{
+		bitCount = nbl::hlsl::bitCount(input.bitCount);
+		cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs);
+		clamp = nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax);
+		length = nbl::hlsl::length(input.length);
+		normalize = nbl::hlsl::normalize(input.normalize);
+		dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs);
+		determinant = nbl::hlsl::determinant(input.determinant);
+		findMSB = nbl::hlsl::findMSB(input.findMSB);
+		findLSB = nbl::hlsl::findLSB(input.findLSB);
+		inverse = nbl::hlsl::inverse(input.inverse);
+		transpose = nbl::hlsl::transpose(input.transpose);
+		mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs);
+		// TODO: fix min and max
+		min = nbl::hlsl::min(input.minA, input.minB);
+		max = nbl::hlsl::max(input.maxA, input.maxB);
+		rsqrt = nbl::hlsl::rsqrt(input.rsqrt);
+		bitReverse = nbl::hlsl::bitReverse(input.bitReverse);
+		frac = nbl::hlsl::fract(input.frac);
+		mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA);
+		sign = nbl::hlsl::sign(input.sign);
+		radians = nbl::hlsl::radians(input.radians);
+		degrees = nbl::hlsl::degrees(input.degrees);
+		step = nbl::hlsl::step(input.stepEdge, input.stepX);
+		smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX);
+
+		bitCountVec = nbl::hlsl::bitCount(input.bitCountVec);
+		clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec);
+		findMSBVec = nbl::hlsl::findMSB(input.findMSBVec);
+		findLSBVec = nbl::hlsl::findLSB(input.findLSBVec);
+		// TODO: fix min and max
+		minVec = nbl::hlsl::min(input.minAVec, input.minBVec);
+		maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec);
+		rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec);
+		bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec);
+		fracVec = nbl::hlsl::fract(input.fracVec);
+		mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec);
+		
+		signVec = nbl::hlsl::sign(input.signVec);
+		radiansVec = nbl::hlsl::radians(input.radiansVec);
+		degreesVec = nbl::hlsl::degrees(input.degreesVec);
+		stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec);
+		smoothStepVec = nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec);
+		faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref);
+		reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN);
+		refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta);
+		addCarry = nbl::hlsl::addCarry(input.addCarryA, input.addCarryB);
+		subBorrow = nbl::hlsl::subBorrow(input.subBorrowA, input.subBorrowB);
+		addCarryVec = nbl::hlsl::addCarry(input.addCarryAVec, input.addCarryBVec);
+		subBorrowVec = nbl::hlsl::subBorrow(input.subBorrowAVec, input.subBorrowBVec);
+	}
+};
+
+#endif

From f00bbf6fa914ec230df8a000deee75aee69cdce9 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Mon, 7 Apr 2025 19:48:46 -0300
Subject: [PATCH 06/57] Disable intrinsic tests for uSUbBorrow for the time
 being, start copying 22_CppCOmpat to run tests

---
 12_Mortons/Tester.h                  | 417 +++++++++++++++++++++++++++
 12_Mortons/app_resources/common.hlsl |  38 ++-
 12_Mortons/app_resources/shader.hlsl |  18 --
 12_Mortons/main.cpp                  | 198 +------------
 22_CppCompat/CIntrinsicsTester.h     |  22 +-
 5 files changed, 474 insertions(+), 219 deletions(-)
 create mode 100644 12_Mortons/Tester.h
 delete mode 100644 12_Mortons/app_resources/shader.hlsl

diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h
new file mode 100644
index 000000000..5c4773111
--- /dev/null
+++ b/12_Mortons/Tester.h
@@ -0,0 +1,417 @@
+#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_
+
+#include <nabla.h>
+#include "app_resources/common.hlsl"
+#include "nbl/application_templates/MonoDeviceApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+using namespace nbl;
+
+class Tester
+{
+public:
+    virtual ~Tester()
+    {
+        m_outputBufferAllocation.memory->unmap();
+    };
+
+    struct PipelineSetupData
+    {
+        std::string testShaderPath;
+
+        core::smart_refctd_ptr<video::ILogicalDevice> device;
+        core::smart_refctd_ptr<video::CVulkanConnection> api;
+        core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
+        core::smart_refctd_ptr<system::ILogger> logger;
+        video::IPhysicalDevice* physicalDevice;
+        uint32_t computeFamilyIndex;
+    };
+
+    template<typename InputStruct, typename OutputStruct>
+    void setupPipeline(const PipelineSetupData& pipleineSetupData)
+    {
+        // setting up pipeline in the constructor
+        m_device = core::smart_refctd_ptr(pipleineSetupData.device);
+        m_physicalDevice = pipleineSetupData.physicalDevice;
+        m_api = core::smart_refctd_ptr(pipleineSetupData.api);
+        m_assetMgr = core::smart_refctd_ptr(pipleineSetupData.assetMgr);
+        m_logger = core::smart_refctd_ptr(pipleineSetupData.logger);
+        m_queueFamily = pipleineSetupData.computeFamilyIndex;
+        m_semaphoreCounter = 0;
+        m_semaphore = m_device->createSemaphore(0);
+        m_cmdpool = m_device->createCommandPool(m_queueFamily, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf))
+            logFail("Failed to create Command Buffers!\n");
+
+        // Load shaders, set up pipeline
+        core::smart_refctd_ptr<video::IGPUShader> shader;
+        {
+            asset::IAssetLoader::SAssetLoadParams lp = {};
+            lp.logger = m_logger.get();
+            lp.workingDirectory = ""; // virtual root
+            auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp);
+            const auto assets = assetBundle.getContents();
+            if (assets.empty())
+            {
+                logFail("Could not load shader!");
+                assert(0);
+            }
+
+            // It would be super weird if loading a shader from a file produced more than 1 asset
+            assert(assets.size() == 1);
+            core::smart_refctd_ptr<asset::ICPUShader> source = asset::IAsset::castDown<asset::ICPUShader>(assets[0]);
+
+            auto* compilerSet = m_assetMgr->getCompilerSet();
+
+            asset::IShaderCompiler::SCompilerOptions options = {};
+            options.stage = source->getStage();
+            options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+            options.spirvOptimizer = nullptr;
+            options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
+            options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+            options.preprocessorOptions.logger = m_logger.get();
+            options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
+
+            auto spirv = compilerSet->compileToSPIRV(source.get(), options);
+
+            video::ILogicalDevice::SShaderCreationParameters params{};
+            params.cpushader = spirv.get();
+            shader = m_device->createShader(params);
+        }
+
+        if (!shader)
+            logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
+
+        video::IGPUDescriptorSetLayout::SBinding bindings[2] = {
+            {
+                .binding = 0,
+                .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+                .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+                .stageFlags = ShaderStage::ESS_COMPUTE,
+                .count = 1
+            },
+            {
+                .binding = 1,
+                .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+                .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+                .stageFlags = ShaderStage::ESS_COMPUTE,
+                .count = 1
+            }
+        };
+
+        core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout(bindings);
+        if (!dsLayout)
+            logFail("Failed to create a Descriptor Layout!\n");
+
+        m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout));
+        if (!m_pplnLayout)
+            logFail("Failed to create a Pipeline Layout!\n");
+
+        {
+            video::IGPUComputePipeline::SCreationParams params = {};
+            params.layout = m_pplnLayout.get();
+            params.shader.entryPoint = "main";
+            params.shader.shader = shader.get();
+            if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
+                logFail("Failed to create pipelines (compile & link shaders)!\n");
+        }
+
+        // Allocate memory of the input buffer
+        {
+            constexpr size_t BufferSize = sizeof(InputStruct);
+
+            video::IGPUBuffer::SCreationParams params = {};
+            params.size = BufferSize;
+            params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+            core::smart_refctd_ptr<video::IGPUBuffer> inputBuff = m_device->createBuffer(std::move(params));
+            if (!inputBuff)
+                logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
+
+            inputBuff->setObjectDebugName("emulated_float64_t output buffer");
+
+            video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs();
+            reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+            m_inputBufferAllocation = m_device->allocate(reqs, inputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE);
+            if (!m_inputBufferAllocation.isValid())
+                logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+            assert(inputBuff->getBoundMemory().memory == m_inputBufferAllocation.memory.get());
+            core::smart_refctd_ptr<video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 });
+
+            m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout));
+            {
+                video::IGPUDescriptorSet::SDescriptorInfo info[1];
+                info[0].desc = core::smart_refctd_ptr(inputBuff);
+                info[0].info.buffer = { .offset = 0,.size = BufferSize };
+                video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
+                    {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info}
+                };
+                m_device->updateDescriptorSets(writes, {});
+            }
+        }
+
+        // Allocate memory of the output buffer
+        {
+            constexpr size_t BufferSize = sizeof(OutputStruct);
+
+            video::IGPUBuffer::SCreationParams params = {};
+            params.size = BufferSize;
+            params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+            core::smart_refctd_ptr<video::IGPUBuffer> outputBuff = m_device->createBuffer(std::move(params));
+            if (!outputBuff)
+                logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
+
+            outputBuff->setObjectDebugName("emulated_float64_t output buffer");
+
+            video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
+            reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+            m_outputBufferAllocation = m_device->allocate(reqs, outputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE);
+            if (!m_outputBufferAllocation.isValid())
+                logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+            assert(outputBuff->getBoundMemory().memory == m_outputBufferAllocation.memory.get());
+            core::smart_refctd_ptr<video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 });
+
+            {
+                video::IGPUDescriptorSet::SDescriptorInfo info[1];
+                info[0].desc = core::smart_refctd_ptr(outputBuff);
+                info[0].info.buffer = { .offset = 0,.size = BufferSize };
+                video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
+                    {.dstSet = m_ds.get(),.binding = 1,.arrayElement = 0,.count = 1,.info = info}
+                };
+                m_device->updateDescriptorSets(writes, {});
+            }
+        }
+
+        if (!m_outputBufferAllocation.memory->map({ 0ull,m_outputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ))
+            logFail("Failed to map the Device Memory!\n");
+
+        // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
+        const video::ILogicalDevice::MappedMemoryRange memoryRange(m_outputBufferAllocation.memory.get(), 0ull, m_outputBufferAllocation.memory->getAllocationSize());
+        if (!m_outputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+            m_device->invalidateMappedMemoryRanges(1, &memoryRange);
+
+        assert(memoryRange.valid() && memoryRange.length >= sizeof(OutputStruct));
+
+        m_queue = m_device->getQueue(m_queueFamily, 0);
+    }
+
+    enum class TestType
+    {
+        CPU,
+        GPU
+    };
+
+    template<typename T>
+    void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType)
+    {
+        static constexpr float MaxAllowedError = 0.1f;
+        if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError)
+            return;
+
+        std::stringstream ss;
+        switch (testType)
+        {
+        case TestType::CPU:
+            ss << "CPU TEST ERROR:\n";
+        case TestType::GPU:
+            ss << "GPU TEST ERROR:\n";
+        }
+
+        ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n';
+
+        m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
+    }
+
+    template<typename T>
+    void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector<T, 3>& expectedVal, const nbl::hlsl::vector<T, 3>& testVal, const TestType testType)
+    {
+        static constexpr float MaxAllowedError = 0.1f;
+        if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError &&
+            std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError &&
+            std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError)
+            return;
+
+        std::stringstream ss;
+        switch (testType)
+        {
+        case TestType::CPU:
+            ss << "CPU TEST ERROR:\n";
+        case TestType::GPU:
+            ss << "GPU TEST ERROR:\n";
+        }
+
+        ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " <<
+            testVal.x << ' ' << testVal.y << ' ' << testVal.z <<
+            " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n';
+
+        m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
+    }
+
+    template<typename T>
+    void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix<T, 3, 3>& expectedVal, const nbl::hlsl::matrix<T, 3, 3>& testVal, const TestType testType)
+    {
+        for (int i = 0; i < 3; ++i)
+        {
+            auto expectedValRow = expectedVal[i];
+            auto testValRow = testVal[i];
+            verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType);
+        }
+    }
+
+    void performTests()
+    {
+        m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE);
+        for (int i = 0; i < Iterations; ++i)
+        {
+            // Set input thest values that will be used in both CPU and GPU tests
+            InputTestValues testInput;
+
+            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
+            TestValues expected;
+
+            performCpuTests(testInput, expected);
+            performGpuTests(testInput, expected);
+        }
+        m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
+    }
+
+protected:
+    uint32_t m_queueFamily;
+    core::smart_refctd_ptr<video::ILogicalDevice> m_device;
+    core::smart_refctd_ptr<video::CVulkanConnection> m_api;
+    video::IPhysicalDevice* m_physicalDevice;
+    core::smart_refctd_ptr<asset::IAssetManager> m_assetMgr;
+    core::smart_refctd_ptr<system::ILogger> m_logger;
+    video::IDeviceMemoryAllocator::SAllocation m_inputBufferAllocation = {};
+    video::IDeviceMemoryAllocator::SAllocation m_outputBufferAllocation = {};
+    core::smart_refctd_ptr<video::IGPUCommandBuffer> m_cmdbuf = nullptr;
+    core::smart_refctd_ptr<video::IGPUCommandPool> m_cmdpool = nullptr;
+    core::smart_refctd_ptr<video::IGPUDescriptorSet> m_ds = nullptr;
+    core::smart_refctd_ptr<video::IGPUPipelineLayout> m_pplnLayout = nullptr;
+    core::smart_refctd_ptr<video::IGPUComputePipeline> m_pipeline;
+    core::smart_refctd_ptr<video::ISemaphore> m_semaphore;
+    video::IQueue* m_queue;
+    uint64_t m_semaphoreCounter;
+
+    template<typename InputStruct, typename OutputStruct>
+    OutputStruct dispatch(const InputStruct& input)
+    {
+        // Update input buffer
+        if (!m_inputBufferAllocation.memory->map({ 0ull,m_inputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ))
+            logFail("Failed to map the Device Memory!\n");
+
+        const video::ILogicalDevice::MappedMemoryRange memoryRange(m_inputBufferAllocation.memory.get(), 0ull, m_inputBufferAllocation.memory->getAllocationSize());
+        if (!m_inputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+            m_device->invalidateMappedMemoryRanges(1, &memoryRange);
+
+        std::memcpy(static_cast<InputStruct*>(m_inputBufferAllocation.memory->getMappedPointer()), &input, sizeof(InputStruct));
+
+        m_inputBufferAllocation.memory->unmap();
+
+        // record command buffer
+        m_cmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE);
+        m_cmdbuf->begin(video::IGPUCommandBuffer::USAGE::NONE);
+        m_cmdbuf->beginDebugMarker("test", core::vector4df_SIMD(0, 1, 0, 1));
+        m_cmdbuf->bindComputePipeline(m_pipeline.get());
+        m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
+        m_cmdbuf->dispatch(1, 1, 1);
+        m_cmdbuf->endDebugMarker();
+        m_cmdbuf->end();
+
+        video::IQueue::SSubmitInfo submitInfos[1] = {};
+        const video::IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} };
+        submitInfos[0].commandBuffers = cmdbufs;
+        const video::IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} };
+        submitInfos[0].signalSemaphores = signals;
+
+        m_api->startCapture();
+        m_queue->submit(submitInfos);
+        m_api->endCapture();
+
+        m_device->waitIdle();
+        OutputStruct output;
+        std::memcpy(&output, static_cast<OutputStruct*>(m_outputBufferAllocation.memory->getMappedPointer()), sizeof(OutputStruct));
+        m_device->waitIdle();
+
+        return output;
+    }
+
+private:
+    template<typename... Args>
+    inline void logFail(const char* msg, Args&&... args)
+    {
+        m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward<Args>(args)...);
+        exit(-1);
+    }
+
+    inline static constexpr int Iterations = 100u;
+
+    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues cpuTestValues;
+        cpuTestValues.fillTestValues(commonTestInputValues);
+        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
+
+    }
+
+    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues gpuTestValues;
+        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
+        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
+    }
+
+    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
+    {
+        verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType);
+        verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType);
+        verifyTestValue("length", expectedTestValues.length, testValues.length, testType);
+        verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType);
+        verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType);
+        verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType);
+        verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType);
+        verifyTestValue("min", expectedTestValues.min, testValues.min, testType);
+        verifyTestValue("max", expectedTestValues.max, testValues.max, testType);
+        verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType);
+        verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType);
+        verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType);
+        verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType);
+        verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType);
+        verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType);
+        verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType);
+        verifyTestValue("step", expectedTestValues.step, testValues.step, testType);
+        verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType);
+
+        verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
+        verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType);
+        verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType);
+        verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType);
+        verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType);
+        verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType);
+        verifyTestVector3dValue("minVec", expectedTestValues.minVec, testValues.minVec, testType);
+        verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType);
+        verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType);
+        verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType);
+        verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType);
+        verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType);
+
+        verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType);
+        verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType);
+        verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType);
+        verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType);
+        verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType);
+        verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType);
+        verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType);
+        verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType);
+
+        verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType);
+        verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType);
+        verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType);
+    }
+};
+
+#endif
\ No newline at end of file
diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl
index bd5184f80..9632bd372 100644
--- a/12_Mortons/app_resources/common.hlsl
+++ b/12_Mortons/app_resources/common.hlsl
@@ -1,13 +1,33 @@
-//#include "nbl/builtin/hlsl/morton.hlsl"
-#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
 
-NBL_CONSTEXPR uint32_t bufferSize = 256;
+#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_
 
-// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations
-//using morton_t2 = nbl::hlsl::morton::code<true, 8, 2>; // Fits in an int16_t
-using vector_t2 = nbl::hlsl::vector<int16_t, 3>;
+// because DXC doesn't properly support `_Static_assert`
+// TODO: add a message, and move to macros.h or cpp_compat
+#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; }
 
-struct PushConstantData
+#include <boost/preprocessor.hpp>
+
+#include <nbl/builtin/hlsl/morton.hlsl>
+
+// tgmath.hlsl and intrinsics.hlsl tests
+
+using namespace nbl::hlsl;
+struct InputTestValues
+{
+	
+};
+
+struct TestValues
 {
-	uint64_t deviceBufferAddress;
-};
\ No newline at end of file
+
+	void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
+	{
+
+	}
+};
+
+#endif
diff --git a/12_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl
deleted file mode 100644
index e7f570eee..000000000
--- a/12_Mortons/app_resources/shader.hlsl
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "app_resources/common.hlsl"
-#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
-
-[[vk::push_constant]] PushConstantData pushConstants;
-
-[numthreads(bufferSize, 1, 1)]
-void main(uint32_t3 ID : SV_DispatchThreadID)
-{
-	/*
-	LegacyBdaAccessor<unsigned_scalar_t> accessor = LegacyBdaAccessor<unsigned_scalar_t>::create(pushConstants.deviceBufferAddress);
-	
-	morton::code<int32_t, 2> foo = morton::code<int32_t, 2>::create(vector<int32_t, 2>(-32768, -1));
-
-	//accessor.set(0, foo.value);
-	*/
-	uint32_t bar = _static_cast<uint32_t>(0xCAFEDEADDEADBEEF);
-	accessor.set(0, bar);
-}
\ No newline at end of file
diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp
index d1fddba7a..8118ec939 100644
--- a/12_Mortons/main.cpp
+++ b/12_Mortons/main.cpp
@@ -45,7 +45,17 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 				return false;
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
-
+			{
+				using namespace nbl::hlsl;
+
+				auto bar = morton::code<false, 21, 3, emulated_uint64_t>::create(hlsl::vector<uint32_t, 3>(893728, 7843, 98032));
+				auto foo = _static_cast<hlsl::vector<uint32_t, 3>>(bar);
+				std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl;
+				
+				//auto bar = morton::code<false, 21, 3, emulated_uint64_t>::create(hlsl::vector<uint32_t, 3>(893728, 7843, 98032));
+				//std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl;
+				//std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl;
+			}
 			/*
 
 			// ----------------------------------------------- CPP TESTS ----------------------------------------------------------------------
@@ -193,188 +203,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 			// Unsigned
 			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true));
 
-
-			if(!TestHLSL)
-				return true;
-
 			*/
 
-
-
-
-
-
-
-			// ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ----------------------------------------------
-			auto shader = createShader("app_resources/shader.hlsl");
-
-			// Create massive upload/download buffers
-			constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23;
-
-			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize);
-			if (!m_utils)
-				return logFail("Failed to create Utilities!");
-			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
-			m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
-
-			// Create device-local buffer
-			{
-				IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
-
-				IQueue* const queue = getComputeQueue();
-				uint32_t queueFamilyIndex = queue->getFamilyIndex();
-
-				deviceLocalBufferParams.queueFamilyIndexCount = 1;
-				deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
-				deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize;
-				deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
-
-				m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
-				auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
-				mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-				auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
-
-				m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress();
-			}
-
-			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) };
-
-			{
-				auto layout = m_device->createPipelineLayout({ &pcRange,1 });
-				IGPUComputePipeline::SCreationParams params = {};
-				params.layout = layout.get();
-				params.shader.shader = shader.get();
-				params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
-				params.shader.requireFullSubgroups = true;
-				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
-					return logFail("Failed to create compute pipeline!\n");
-			}
-
-			const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
-			// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
-			// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
-			// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
-			// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
-			// We'll align to max of coherent atom size even if the memory is coherent,
-			// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
-			m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float));
-
-			// Semaphor used here to know the FFT is done before download
-			m_timeline = m_device->createSemaphore(semaphorValue);
-
-			IQueue* const queue = getComputeQueue();
-
-			const uint32_t inputSize = sizeof(uint32_t) * bufferSize;
-
-			// Just need a single suballocation in this example
-			const uint32_t AllocationCount = 1;
-
-			// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
-			// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
-			std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
-
-			// finally allocate our output range
-			const uint32_t outputSize = inputSize;
-
-			auto outputOffset = m_downStreamingBuffer->invalid_value;
-			m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
-
-			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
-			{
-				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
-				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) {
-					return logFail("Failed to create Command Buffers!\n");
-				}
-				cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger));
-				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-				cmdbuf->bindComputePipeline(m_pipeline.get());
-				// This is the new fun part, pushing constants
-				const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress };
-				cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-				// Remember we do a single workgroup per 1D array in these parts
-				cmdbuf->dispatch(1, 1, 1);
-
-				// Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer 
-				IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {};
-
-				decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {};
-				pipelineBarrierInfo.bufBarriers = { &barrier, 1u };
-
-				barrier.range.buffer = m_deviceLocalBuffer;
-
-				barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
-				barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
-				barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-				barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
-
-				cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
-
-				IGPUCommandBuffer::SBufferCopy copyInfo = {};
-				copyInfo.srcOffset = 0;
-				copyInfo.dstOffset = 0;
-				copyInfo.size = m_deviceLocalBuffer->getSize();
-				cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
-				cmdbuf->end();
-			}
-
-			semaphorValue++;
-			{
-				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
-				{
-					.cmdbuf = cmdbuf.get()
-				};
-				const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
-				{
-					.semaphore = m_timeline.get(),
-					.value = semaphorValue,
-					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-				};
-
-				const IQueue::SSubmitInfo submitInfo = {
-					.waitSemaphores = {},
-					.commandBuffers = {&cmdbufInfo,1},
-					.signalSemaphores = {&signalInfo,1}
-				};
-
-				m_api->startCapture();
-				queue->submit({ &submitInfo,1 });
-				m_api->endCapture();
-			}
-
-			// We let all latches know what semaphore and counter value has to be passed for the functors to execute
-			const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue };
-
-			// Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
-			// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
-			// Its nice because it will also remember to invalidate our memory mapping if its not coherent.
-			auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
-				IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
-				// Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
-				[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
-				{
-					// The unused variable is used for letting the consumer know the subsection of the output we've managed to download
-					// But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
-					assert(dstOffset == 0 && size == outputSize);
-
-					std::cout << "Begin array GPU\n";
-					uint32_t* const data = reinterpret_cast<uint32_t*>(const_cast<void*>(bufSrc));
-					//std::cout << std::bitset<32>(data[0]) << "\n";
-					std::cout << data[0] << "\n";
-					/*
-					for (auto i = 0u; i < bufferSize; i++) {
-						std::cout << std::bitset<32>(data[i]) << "\n";
-					}
-					*/
-					std::cout << "\nEnd array GPU\n";
-				},
-				// Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it
-				// hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands.
-				// It could also be latched in the upstreaming deallocate, because its the same fence.
-				std::move(cmdbuf), m_downStreamingBuffer
-			);
-			// We put a function we want to execute 
-			m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
-
 			return true;
 		}
 
@@ -387,12 +217,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication,
 		// Cleanup
 		bool onAppTerminated() override
 		{
-			// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
-			// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
-			if (TestHLSL)
-			{
-				while (m_downStreamingBuffer->cull_frees()) {}
-			}
 			return device_base_t::onAppTerminated();
 		}
 
diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h
index 5fe7bc08e..09219a9e7 100644
--- a/22_CppCompat/CIntrinsicsTester.h
+++ b/22_CppCompat/CIntrinsicsTester.h
@@ -147,6 +147,9 @@ class CIntrinsicsTester final : public ITester
             expected.step = glm::step(testInput.stepEdge, testInput.stepX);
             expected.smoothStep = glm::smoothstep(testInput.smoothStepEdge0, testInput.smoothStepEdge1, testInput.smoothStepX);
 
+            expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry);
+            expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow);
+
             expected.frac = testInput.frac - std::floor(testInput.frac);
             expected.bitReverse = glm::bitfieldReverse(testInput.bitReverse);
 
@@ -189,6 +192,9 @@ class CIntrinsicsTester final : public ITester
             expected.reflect = glm::reflect(testInput.reflectI, testInput.reflectN);
             expected.refract = glm::refract(testInput.refractI, testInput.refractN, testInput.refractEta);
 
+            expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry);
+            expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow);
+
             auto mulGlm = nbl::hlsl::mul(testInput.mulLhs, testInput.mulRhs);
             expected.mul = reinterpret_cast<float32_t3x3&>(mulGlm);
             auto transposeGlm = glm::transpose(reinterpret_cast<typename float32_t3x3::Base const&>(testInput.transpose));
@@ -196,11 +202,6 @@ class CIntrinsicsTester final : public ITester
             auto inverseGlm = glm::inverse(reinterpret_cast<typename float32_t3x3::Base const&>(testInput.inverse));
             expected.inverse = reinterpret_cast<float32_t3x3&>(inverseGlm);
 
-            expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry);
-            expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow);
-            expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry);
-            expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow);
-
             performCpuTests(testInput, expected);
             performGpuTests(testInput, expected);
         }
@@ -213,6 +214,7 @@ class CIntrinsicsTester final : public ITester
     void performCpuTests(const IntrinsicsIntputTestValues& commonTestInputValues, const IntrinsicsTestValues& expectedTestValues)
     {
         IntrinsicsTestValues cpuTestValues;
+
         cpuTestValues.fillTestValues(commonTestInputValues);
         verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
 
@@ -245,6 +247,11 @@ class CIntrinsicsTester final : public ITester
         verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType);
         verifyTestValue("step", expectedTestValues.step, testValues.step, testType);
         verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType);
+        verifyTestValue("addCarryResult", expectedTestValues.addCarry.result, testValues.addCarry.result, testType);
+        verifyTestValue("addCarryCarry", expectedTestValues.addCarry.carry, testValues.addCarry.carry, testType);
+        // Disabled: current glm implementation is wrong
+        //verifyTestValue("subBorrowResult", expectedTestValues.subBorrow.result, testValues.subBorrow.result, testType);
+        //verifyTestValue("subBorrowBorrow", expectedTestValues.subBorrow.borrow, testValues.subBorrow.borrow, testType);
 
         verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
         verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType);
@@ -267,6 +274,11 @@ class CIntrinsicsTester final : public ITester
         verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType);
         verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType);
         verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType);
+        verifyTestVector3dValue("addCarryVecResult", expectedTestValues.addCarryVec.result, testValues.addCarryVec.result, testType);
+        verifyTestVector3dValue("addCarryVecCarry", expectedTestValues.addCarryVec.carry, testValues.addCarryVec.carry, testType);
+        // Disabled: current glm implementation is wrong
+        //verifyTestVector3dValue("subBorrowVecResult", expectedTestValues.subBorrowVec.result, testValues.subBorrowVec.result, testType);
+        //verifyTestVector3dValue("subBorrowVecBorrow", expectedTestValues.subBorrowVec.borrow, testValues.subBorrowVec.borrow, testType);
 
         verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType);
         verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType);

From b2d87c36ad63c27b8547ea6583aa4c1ce716690d Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Thu, 24 Apr 2025 16:06:16 -0300
Subject: [PATCH 07/57] Added extensive tests for Morton codes

---
 12_Mortons/Tester.h                           | 135 +++---
 12_Mortons/app_resources/common.hlsl          | 453 +++++++++++++++++-
 12_Mortons/app_resources/mortonTest.comp.hlsl |  16 +
 12_Mortons/main.cpp                           | 298 +++---------
 22_CppCompat/ITester.h                        |   1 +
 5 files changed, 604 insertions(+), 299 deletions(-)
 create mode 100644 12_Mortons/app_resources/mortonTest.comp.hlsl

diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h
index 5c4773111..480328d18 100644
--- a/12_Mortons/Tester.h
+++ b/12_Mortons/Tester.h
@@ -1,5 +1,5 @@
-#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_
-#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_
+#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_
 
 #include <nabla.h>
 #include "app_resources/common.hlsl"
@@ -128,7 +128,7 @@ class Tester
             if (!inputBuff)
                 logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
 
-            inputBuff->setObjectDebugName("emulated_float64_t output buffer");
+            inputBuff->setObjectDebugName("morton input buffer");
 
             video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs();
             reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
@@ -163,7 +163,7 @@ class Tester
             if (!outputBuff)
                 logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
 
-            outputBuff->setObjectDebugName("emulated_float64_t output buffer");
+            outputBuff->setObjectDebugName("morton output buffer");
 
             video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
             reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
@@ -208,8 +208,7 @@ class Tester
     template<typename T>
     void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType)
     {
-        static constexpr float MaxAllowedError = 0.1f;
-        if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError)
+        if (expectedVal == testVal)
             return;
 
         std::stringstream ss;
@@ -221,7 +220,7 @@ class Tester
             ss << "GPU TEST ERROR:\n";
         }
 
-        ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n';
+        ss << "nbl::hlsl::" << memberName << " produced incorrect output!" << '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n';
 
         m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
     }
@@ -240,6 +239,7 @@ class Tester
         {
         case TestType::CPU:
             ss << "CPU TEST ERROR:\n";
+            break;
         case TestType::GPU:
             ss << "GPU TEST ERROR:\n";
         }
@@ -251,32 +251,60 @@ class Tester
         m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
     }
 
-    template<typename T>
-    void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix<T, 3, 3>& expectedVal, const nbl::hlsl::matrix<T, 3, 3>& testVal, const TestType testType)
-    {
-        for (int i = 0; i < 3; ++i)
-        {
-            auto expectedValRow = expectedVal[i];
-            auto testValRow = testVal[i];
-            verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType);
-        }
-    }
-
     void performTests()
     {
-        m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE);
+        std::random_device rd;
+        std::mt19937 mt(rd());
+
+        std::uniform_int_distribution<uint16_t> shortDistribution(uint16_t(0), std::numeric_limits<uint16_t>::max());
+        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
+        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
+
+        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
         for (int i = 0; i < Iterations; ++i)
         {
             // Set input thest values that will be used in both CPU and GPU tests
             InputTestValues testInput;
-
             // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
             TestValues expected;
 
+            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
+            testInput.shift = generatedShift;
+            {
+                uint64_t generatedA = longDistribution(mt);
+                uint64_t generatedB = longDistribution(mt);
+
+                testInput.generatedA = generatedA;
+                testInput.generatedB = generatedB;
+
+                expected.emulatedAnd = _static_cast<emulated_uint64_t>(generatedA & generatedB);
+                expected.emulatedOr = _static_cast<emulated_uint64_t>(generatedA | generatedB);
+                expected.emulatedXor = _static_cast<emulated_uint64_t>(generatedA ^ generatedB);
+                expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
+                expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
+                expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
+                expected.emulatedLess = uint32_t(generatedA < generatedB);
+                expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
+                expected.emulatedGreater = uint32_t(generatedA > generatedB);
+                expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB);
+
+                expected.emulatedLeftShifted = _static_cast<emulated_uint64_t>(generatedA << generatedShift);
+                expected.emulatedUnsignedRightShifted = _static_cast<emulated_uint64_t>(generatedA >> generatedShift);
+                expected.emulatedSignedRightShifted = _static_cast<emulated_int64_t>(static_cast<int64_t>(generatedA) >> generatedShift);
+            }
+            {
+                uint64_t coordX = longDistribution(mt);
+                uint64_t coordY = longDistribution(mt);
+                uint64_t coordZ = longDistribution(mt);
+                uint64_t coordW = longDistribution(mt);
+
+
+            }
+
             performCpuTests(testInput, expected);
             performGpuTests(testInput, expected);
         }
-        m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
+        m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
     }
 
 protected:
@@ -354,7 +382,7 @@ class Tester
     {
         TestValues cpuTestValues;
         cpuTestValues.fillTestValues(commonTestInputValues);
-        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
+        verifyTestValues(expectedTestValues, cpuTestValues, TestType::CPU);
 
     }
 
@@ -362,55 +390,26 @@ class Tester
     {
         TestValues gpuTestValues;
         gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
-        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
+        verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU);
     }
 
-    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
+    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType)
     {
-        verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType);
-        verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType);
-        verifyTestValue("length", expectedTestValues.length, testValues.length, testType);
-        verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType);
-        verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType);
-        verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType);
-        verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType);
-        verifyTestValue("min", expectedTestValues.min, testValues.min, testType);
-        verifyTestValue("max", expectedTestValues.max, testValues.max, testType);
-        verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType);
-        verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType);
-        verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType);
-        verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType);
-        verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType);
-        verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType);
-        verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType);
-        verifyTestValue("step", expectedTestValues.step, testValues.step, testType);
-        verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType);
-
-        verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
-        verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType);
-        verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType);
-        verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType);
-        verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType);
-        verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType);
-        verifyTestVector3dValue("minVec", expectedTestValues.minVec, testValues.minVec, testType);
-        verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType);
-        verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType);
-        verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType);
-        verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType);
-        verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType);
-
-        verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType);
-        verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType);
-        verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType);
-        verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType);
-        verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType);
-        verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType);
-        verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType);
-        verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType);
-
-        verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType);
-        verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType);
-        verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType);
+        verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
+        verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
+        verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
+        verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType);
+        verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType);
+        verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType);
+        verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType);
+        verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType);
+        verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType);
+        verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType);
+        verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
+        verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
+        verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
+        
+        //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
     }
 };
 
diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl
index 9632bd372..be6a2f4a0 100644
--- a/12_Mortons/app_resources/common.hlsl
+++ b/12_Mortons/app_resources/common.hlsl
@@ -13,20 +13,471 @@
 
 #include <nbl/builtin/hlsl/morton.hlsl>
 
-// tgmath.hlsl and intrinsics.hlsl tests
+NBL_CONSTEXPR uint16_t smallBits_2 = 8;
+NBL_CONSTEXPR uint16_t mediumBits_2 = 16;
+NBL_CONSTEXPR uint16_t fullBits_2 = 32;
+NBL_CONSTEXPR uint16_t smallBits_3 = 5;
+NBL_CONSTEXPR uint16_t mediumBits_3 = 10;
+NBL_CONSTEXPR uint16_t fullBits_3 = 21;
+NBL_CONSTEXPR uint16_t smallBits_4 = 4;
+NBL_CONSTEXPR uint16_t mediumBits_4 = 8;
+NBL_CONSTEXPR uint16_t fullBits_4 = 16;
 
 using namespace nbl::hlsl;
 struct InputTestValues
 {
+	// Both tests
+	uint32_t shift;
+
+	// Emulated int tests
+	uint64_t generatedA;
+	uint64_t generatedB;
 	
+	// Morton tests
+	uint64_t coordX;
+	uint64_t coordY;
+	uint64_t coordZ;
+	uint64_t coordW;
 };
 
 struct TestValues
 {
+	// Emulated int tests
+	emulated_uint64_t emulatedAnd;
+	emulated_uint64_t emulatedOr;
+	emulated_uint64_t emulatedXor;
+	emulated_uint64_t emulatedNot;
+	emulated_uint64_t emulatedPlus;
+	emulated_uint64_t emulatedMinus;
+	// These are bools but stored as uint because you can't store bools, causes a SPIR-V issue
+	uint32_t emulatedLess;
+	uint32_t emulatedLessEqual;
+	uint32_t emulatedGreater;
+	uint32_t emulatedGreaterEqual;
+	emulated_uint64_t emulatedLeftShifted;
+	emulated_uint64_t emulatedUnsignedRightShifted;
+	emulated_int64_t  emulatedSignedRightShifted;
+
+	// Morton tests - for each dimension let's do one small, medium and full-szied (max bits possible) test to cover representation with
+	// 16, 32 and 64-bit types. Could make it more exhaustive with macros (test all possible bitwidths)
+	// For emulated mortons, we store only the emulated uint64 representing it, because DXC complains about bitcasts otherwise
+
+	// Plus
+	morton::code<false, smallBits_2, 2>					  mortonPlus_small_2;
+	morton::code<false, mediumBits_2, 2>				  mortonPlus_medium_2;
+	morton::code<false, fullBits_2, 2>					  mortonPlus_full_2;
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> mortonPlus_emulated_2;
+	
+	morton::code<false, smallBits_3, 3>					  mortonPlus_small_3;
+	morton::code<false, mediumBits_3, 3>				  mortonPlus_medium_3;
+	morton::code<false, fullBits_3, 3>					  mortonPlus_full_3;
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> mortonPlus_emulated_3;
+	
+	morton::code<false, smallBits_4, 4>					  mortonPlus_small_4;
+	morton::code<false, mediumBits_4, 4>				  mortonPlus_medium_4;
+	morton::code<false, fullBits_4, 4>					  mortonPlus_full_4;
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> mortonPlus_emulated_4;
+	
+	// Minus
+	morton::code<false, smallBits_2, 2>					  mortonMinus_small_2;
+	morton::code<false, mediumBits_2, 2>				  mortonMinus_medium_2;
+	morton::code<false, fullBits_2, 2>					  mortonMinus_full_2;
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> mortonMinus_emulated_2;
+	
+	morton::code<false, smallBits_3, 3>					  mortonMinus_small_3;
+	morton::code<false, mediumBits_3, 3>				  mortonMinus_medium_3;
+	morton::code<false, fullBits_3, 3>					  mortonMinus_full_3;
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> mortonMinus_emulated_3;
+	
+	morton::code<false, smallBits_4, 4>					  mortonMinus_small_4;
+	morton::code<false, mediumBits_4, 4>				  mortonMinus_medium_4;
+	morton::code<false, fullBits_4, 4>					  mortonMinus_full_4;
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> mortonMinus_emulated_4;
+
+	// Coordinate-wise equality (these are bools)
+	uint32_t2 mortonEqual_small_2;
+	uint32_t2 mortonEqual_medium_2;
+	uint32_t2 mortonEqual_full_2;
+	uint32_t2 mortonEqual_emulated_2;
+
+	uint32_t3 mortonEqual_small_3;
+	uint32_t3 mortonEqual_medium_3;
+	uint32_t3 mortonEqual_full_3;
+	uint32_t3 mortonEqual_emulated_3;
+
+	uint32_t4 mortonEqual_small_4;
+	uint32_t4 mortonEqual_medium_4;
+	uint32_t4 mortonEqual_full_4;
+	uint32_t4 mortonEqual_emulated_4;
+
+	// Coordinate-wise unsigned inequality (just testing with less, again these are bools)
+	uint32_t2 mortonUnsignedLess_small_2;
+	uint32_t2 mortonUnsignedLess_medium_2;
+	uint32_t2 mortonUnsignedLess_full_2;
+	uint32_t2 mortonUnsignedLess_emulated_2;
+
+	uint32_t3 mortonUnsignedLess_small_3;
+	uint32_t3 mortonUnsignedLess_medium_3;
+	uint32_t3 mortonUnsignedLess_full_3;
+	uint32_t3 mortonUnsignedLess_emulated_3;
+
+	uint32_t4 mortonUnsignedLess_small_4;
+	uint32_t4 mortonUnsignedLess_medium_4;
+	uint32_t4 mortonUnsignedLess_full_4;
+	uint32_t4 mortonUnsignedLess_emulated_4;
+
+	// Coordinate-wise signed inequality (bools)
+	uint32_t2 mortonSignedLess_small_2;
+	uint32_t2 mortonSignedLess_medium_2;
+	uint32_t2 mortonSignedLess_full_2;
+	uint32_t2 mortonSignedLess_emulated_2;
+
+	uint32_t3 mortonSignedLess_small_3;
+	uint32_t3 mortonSignedLess_medium_3;
+	uint32_t3 mortonSignedLess_full_3;
+	uint32_t3 mortonSignedLess_emulated_3;
+
+	uint32_t4 mortonSignedLess_small_4;
+	uint32_t4 mortonSignedLess_medium_4;
+	uint32_t4 mortonSignedLess_full_4;
+	uint32_t4 mortonSignedLess_emulated_4;
+
+	// Left-shift
+	morton::code<false, smallBits_2, 2>					  mortonLeftShift_small_2;
+	morton::code<false, mediumBits_2, 2>				  mortonLeftShift_medium_2;
+	morton::code<false, fullBits_2, 2>					  mortonLeftShift_full_2;
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> mortonLeftShift_emulated_2;
+
+	morton::code<false, smallBits_3, 3>					  mortonLeftShift_small_3;
+	morton::code<false, mediumBits_3, 3>				  mortonLeftShift_medium_3;
+	morton::code<false, fullBits_3, 3>					  mortonLeftShift_full_3;
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> mortonLeftShift_emulated_3;
+
+	morton::code<false, smallBits_4, 4>					  mortonLeftShift_small_4;
+	morton::code<false, mediumBits_4, 4>				  mortonLeftShift_medium_4;
+	morton::code<false, fullBits_4, 4>					  mortonLeftShift_full_4;
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> mortonLeftShift_emulated_4;
+
+	// Unsigned right-shift
+	morton::code<false, smallBits_2, 2>					  mortonUnsignedRightShift_small_2;
+	morton::code<false, mediumBits_2, 2>				  mortonUnsignedRightShift_medium_2;
+	morton::code<false, fullBits_2, 2>					  mortonUnsignedRightShift_full_2;
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> mortonUnsignedRightShift_emulated_2;
+
+	morton::code<false, smallBits_3, 3>					  mortonUnsignedRightShift_small_3;
+	morton::code<false, mediumBits_3, 3>				  mortonUnsignedRightShift_medium_3;
+	morton::code<false, fullBits_3, 3>					  mortonUnsignedRightShift_full_3;
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> mortonUnsignedRightShift_emulated_3;
+
+	morton::code<false, smallBits_4, 4>					  mortonUnsignedRightShift_small_4;
+	morton::code<false, mediumBits_4, 4>				  mortonUnsignedRightShift_medium_4;
+	morton::code<false, fullBits_4, 4>					  mortonUnsignedRightShift_full_4;
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> mortonUnsignedRightShift_emulated_4;
+
+	// Signed right-shift
+	morton::code<true, smallBits_2, 2>					  mortonSignedRightShift_small_2;
+	morton::code<true, mediumBits_2, 2>					  mortonSignedRightShift_medium_2;
+	morton::code<true, fullBits_2, 2>					  mortonSignedRightShift_full_2;
+	morton::code<true, fullBits_2, 2, emulated_uint64_t>  mortonSignedRightShift_emulated_2;
+
+	morton::code<true, smallBits_3, 3>					  mortonSignedRightShift_small_3;
+	morton::code<true, mediumBits_3, 3>					  mortonSignedRightShift_medium_3;
+	morton::code<true, fullBits_3, 3>					  mortonSignedRightShift_full_3;
+	morton::code<true, fullBits_3, 3, emulated_uint64_t>  mortonSignedRightShift_emulated_3;
+
+	morton::code<true, smallBits_4, 4>					  mortonSignedRightShift_small_4;
+	morton::code<true, mediumBits_4, 4>					  mortonSignedRightShift_medium_4;
+	morton::code<true, fullBits_4, 4>					  mortonSignedRightShift_full_4;
+	morton::code<true, fullBits_4, 4, emulated_uint64_t>  mortonSignedRightShift_emulated_4;
 
 	void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
 	{
+		emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
+		emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
+
+		// Emulated int tests
+		emulatedAnd = emulatedA & emulatedB;
+		emulatedOr = emulatedA | emulatedB;
+		emulatedXor = emulatedA ^ emulatedB;
+		emulatedNot = emulatedA.operator~();
+		emulatedPlus = emulatedA + emulatedB;
+		emulatedMinus = emulatedA - emulatedB;
+		emulatedLess = uint32_t(emulatedA < emulatedB);
+		emulatedLessEqual = uint32_t(emulatedA <= emulatedB);
+		emulatedGreater = uint32_t(emulatedA > emulatedB);
+		emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB);
+
+		left_shift_operator<emulated_uint64_t> leftShift;
+		emulatedLeftShifted = leftShift(emulatedA, input.shift);
+
+		arithmetic_right_shift_operator<emulated_uint64_t> unsignedRightShift;
+		emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
+
+		arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
+		emulatedSignedRightShifted = signedRightShift(_static_cast<emulated_int64_t>(emulatedA), input.shift);
+
+		// Morton tests
+		uint64_t2 Vec2A = { input.coordX, input.coordY };
+		uint64_t2 Vec2B = { input.coordZ, input.coordW };
+
+		uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
+		uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
+
+		uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
+		uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
+
+		int64_t2 Vec2ASigned = int64_t2(Vec2A);
+		int64_t2 Vec2BSigned = int64_t2(Vec2B);
+
+		int64_t3 Vec3ASigned = int64_t3(Vec3A);
+		int64_t3 Vec3BSigned = int64_t3(Vec3B);
+
+		int64_t4 Vec4ASigned = int64_t4(Vec4A);
+		int64_t4 Vec4BSigned = int64_t4(Vec4B);
+
+		morton::code<false, smallBits_2, 2> morton_small_2A = morton::code<false, smallBits_2, 2>::create(Vec2A);
+		morton::code<false, mediumBits_2, 2> morton_medium_2A = morton::code<false, mediumBits_2, 2>::create(Vec2A);
+		morton::code<false, fullBits_2, 2> morton_full_2A = morton::code<false, fullBits_2, 2>::create(Vec2A);
+		morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2A);
+		morton::code<false, smallBits_2, 2> morton_small_2B = morton::code<false, smallBits_2, 2>::create(Vec2B);
+		morton::code<false, mediumBits_2, 2> morton_medium_2B = morton::code<false, mediumBits_2, 2>::create(Vec2B);
+		morton::code<false, fullBits_2, 2> morton_full_2B = morton::code<false, fullBits_2, 2>::create(Vec2B);
+		morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2B);
+
+		morton::code<false, smallBits_3, 3> morton_small_3A = morton::code<false, smallBits_3, 3>::create(Vec3A);
+		morton::code<false, mediumBits_3, 3> morton_medium_3A = morton::code<false, mediumBits_3, 3>::create(Vec3A);
+		morton::code<false, fullBits_3, 3> morton_full_3A = morton::code<false, fullBits_3, 3>::create(Vec3A);
+		morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3A);
+		morton::code<false, smallBits_3, 3> morton_small_3B = morton::code<false, smallBits_3, 3>::create(Vec3B);
+		morton::code<false, mediumBits_3, 3> morton_medium_3B = morton::code<false, mediumBits_3, 3>::create(Vec3B);
+		morton::code<false, fullBits_3, 3> morton_full_3B = morton::code<false, fullBits_3, 3>::create(Vec3B);
+		morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3B);
+
+		morton::code<false, smallBits_4, 4> morton_small_4A = morton::code<false, smallBits_4, 4>::create(Vec4A);
+		morton::code<false, mediumBits_4, 4> morton_medium_4A = morton::code<false, mediumBits_4, 4>::create(Vec4A);
+		morton::code<false, fullBits_4, 4> morton_full_4A = morton::code<false, fullBits_4, 4>::create(Vec4A);
+		morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4A);
+		morton::code<false, smallBits_4, 4> morton_small_4B = morton::code<false, smallBits_4, 4>::create(Vec4B);
+		morton::code<false, mediumBits_4, 4> morton_medium_4B = morton::code<false, mediumBits_4, 4>::create(Vec4B);
+		morton::code<false, fullBits_4, 4> morton_full_4B = morton::code<false, fullBits_4, 4>::create(Vec4B);
+		morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4B);
+
+		morton::code<true, smallBits_2, 2> morton_small_2ASigned = morton::code<true, smallBits_2, 2>::create(Vec2ASigned);
+		morton::code<true, mediumBits_2, 2> morton_medium_2ASigned = morton::code<true, mediumBits_2, 2>::create(Vec2ASigned);
+		morton::code<true, fullBits_2, 2> morton_full_2ASigned = morton::code<true, fullBits_2, 2>::create(Vec2ASigned);
+		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2ASigned = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2ASigned);
+		morton::code<true, smallBits_2, 2> morton_small_2BSigned = morton::code<true, smallBits_2, 2>::create(Vec2BSigned);
+		morton::code<true, mediumBits_2, 2> morton_medium_2BSigned = morton::code<true, mediumBits_2, 2>::create(Vec2BSigned);
+		morton::code<true, fullBits_2, 2> morton_full_2BSigned = morton::code<true, fullBits_2, 2>::create(Vec2BSigned);
+		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2BSigned = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2BSigned);
+
+		morton::code<true, smallBits_3, 3> morton_small_3ASigned = morton::code<true, smallBits_3, 3>::create(Vec3ASigned);
+		morton::code<true, mediumBits_3, 3> morton_medium_3ASigned = morton::code<true, mediumBits_3, 3>::create(Vec3ASigned);
+		morton::code<true, fullBits_3, 3> morton_full_3ASigned = morton::code<true, fullBits_3, 3>::create(Vec3ASigned);
+		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3ASigned = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3ASigned);
+		morton::code<true, smallBits_3, 3> morton_small_3BSigned = morton::code<true, smallBits_3, 3>::create(Vec3BSigned);
+		morton::code<true, mediumBits_3, 3> morton_medium_3BSigned = morton::code<true, mediumBits_3, 3>::create(Vec3BSigned);
+		morton::code<true, fullBits_3, 3> morton_full_3BSigned = morton::code<true, fullBits_3, 3>::create(Vec3BSigned);
+		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3BSigned = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3BSigned);
+
+		morton::code<true, smallBits_4, 4> morton_small_4ASigned = morton::code<true, smallBits_4, 4>::create(Vec4ASigned);
+		morton::code<true, mediumBits_4, 4> morton_medium_4ASigned = morton::code<true, mediumBits_4, 4>::create(Vec4ASigned);
+		morton::code<true, fullBits_4, 4> morton_full_4ASigned = morton::code<true, fullBits_4, 4>::create(Vec4ASigned);
+		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4ASigned = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4ASigned);
+		morton::code<true, smallBits_4, 4> morton_small_4BSigned = morton::code<true, smallBits_4, 4>::create(Vec4BSigned);
+		morton::code<true, mediumBits_4, 4> morton_medium_4BSigned = morton::code<true, mediumBits_4, 4>::create(Vec4BSigned);
+		morton::code<true, fullBits_4, 4> morton_full_4BSigned = morton::code<true, fullBits_4, 4>::create(Vec4BSigned);
+		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4BSigned = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4BSigned);
+
+		/*
+		left_shift_operator<portable_vector_t<emulated_uint64_t, 4> > leftShiftTemp;
+		portable_vector_t<emulated_uint64_t, 4> interleaved = _static_cast<portable_vector_t<emulated_uint64_t, 4> >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>;
+		
+		#define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\
+        {\
+            interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\
+            interleaved = interleaved & _static_cast<emulated_uint64_t>(morton::impl::coding_mask<4, fullBits_4, I>::value);\
+        }
+		
+		ENCODE_LOOP_ITERATION(4)
+		ENCODE_LOOP_ITERATION(3)
+		ENCODE_LOOP_ITERATION(2)
+		ENCODE_LOOP_ITERATION(1)
+		ENCODE_LOOP_ITERATION(0)
+
+		#undef ENCODE_LOOP_ITERATION
+		// After interleaving, shift each coordinate left by their index
+		return leftShiftTemp(interleaved, truncate<vector<uint16_t, Dim> >(vector<uint16_t, 4>(0, 1, 2, 3)));
+		
+		
+		array_get<portable_vector_t<emulated_uint64_t, 4>, emulated_uint64_t> getter;
+		emulatedAnd = getter(interleaved, 0);
+		*/
+		
+		// Plus
+		mortonPlus_small_2 = morton_small_2A + morton_small_2B;
+		mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
+		mortonPlus_full_2 = morton_full_2A + morton_full_2B;
+		mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B;
+		
+		mortonPlus_small_3 = morton_small_3A + morton_small_3B;
+		mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B;
+		mortonPlus_full_3 = morton_full_3A + morton_full_3B;
+		mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B;
+
+		mortonPlus_small_4 = morton_small_4A + morton_small_4B;
+		mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B;
+		mortonPlus_full_4 = morton_full_4A + morton_full_4B;
+		mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
+		
+		// Minus
+		mortonMinus_small_2 = morton_small_2A - morton_small_2B;
+		mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
+		mortonMinus_full_2 = morton_full_2A - morton_full_2B;
+		mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B;
+
+		mortonMinus_small_3 = morton_small_3A - morton_small_3B;
+		mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B;
+		mortonMinus_full_3 = morton_full_3A - morton_full_3B;
+		mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B;
+
+		mortonMinus_small_4 = morton_small_4A - morton_small_4B;
+		mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B;
+		mortonMinus_full_4 = morton_full_4A - morton_full_4B;
+		mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
+
+		// Coordinate-wise equality
+		mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
+		mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
+		mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
+		mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
+
+		mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
+		mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
+		mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
+		mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
+
+		mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
+		mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
+		mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
+		mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
+		
+		// Coordinate-wise unsigned inequality (just testing with less)
+		mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
+		mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
+		mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
+		mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
+		
+		mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
+		mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
+		mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
+		mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
+		
+		mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
+		mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
+		mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
+		mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
+		
+		// Coordinate-wise signed inequality
+		mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan<false>(int16_t2(Vec2BSigned)));
+		mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan<false>(int16_t2(Vec2BSigned)));
+		mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan<false>(int32_t2(Vec2BSigned)));
+		//mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan<false>(int32_t2(Vec2BSigned)));
+
+		mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan<false>(int16_t3(Vec3BSigned)));
+		mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan<false>(int16_t3(Vec3BSigned)));
+		mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan<false>(int32_t3(Vec3BSigned)));
+		//mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan<false>(int32_t3(Vec3BSigned)));
+
+		mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
+		mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
+		mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
+		//mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
+		
+		// Left-shift
+		uint16_t castedShift = uint16_t(input.shift);
+		left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
+		mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift);
+		left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
+		mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_2, 2> > leftShiftFull2;
+		mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > leftShiftEmulated2;
+		mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift);
+
+		left_shift_operator<morton::code<false, smallBits_3, 3> > leftShiftSmall3;
+		mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift);
+		left_shift_operator<morton::code<false, mediumBits_3, 3> > leftShiftMedium3;
+		mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_3, 3> > leftShiftFull3;
+		mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > leftShiftEmulated3;
+		mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift);
+
+		left_shift_operator<morton::code<false, smallBits_4, 4> > leftShiftSmall4;
+		mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift);
+		left_shift_operator<morton::code<false, mediumBits_4, 4> > leftShiftMedium4;
+		mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_4, 4> > leftShiftFull4;
+		mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift);
+		left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
+		mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift);
+		
+		// Unsigned right-shift
+		arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
+		mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
+		mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2> > rightShiftFull2;
+		mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > rightShiftEmulated2;
+		mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift);
+
+		arithmetic_right_shift_operator<morton::code<false, smallBits_3, 3> > rightShiftSmall3;
+		mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, mediumBits_3, 3> > rightShiftMedium3;
+		mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3> > rightShiftFull3;
+		mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > rightShiftEmulated3;
+		mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift);
+
+		arithmetic_right_shift_operator<morton::code<false, smallBits_4, 4> > rightShiftSmall4;
+		mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, mediumBits_4, 4> > rightShiftMedium4;
+		mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4> > rightShiftFull4;
+		mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift);
+		arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
+		mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift);
+
+		// Signed right-shift
+		arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
+		mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
+		mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
+		mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
+		//mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift);
+
+		arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
+		mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, mediumBits_3, 3> > rightShiftSignedMedium3;
+		mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
+		mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
+		//mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift);
 
+		arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
+		mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, mediumBits_4, 4> > rightShiftSignedMedium4;
+		mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
+		mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift);
+		arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
+		//mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift);
 	}
 };
 
diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/mortonTest.comp.hlsl
new file mode 100644
index 000000000..7041568b8
--- /dev/null
+++ b/12_Mortons/app_resources/mortonTest.comp.hlsl
@@ -0,0 +1,16 @@
+//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+#pragma shader_stage(compute)
+
+#include "common.hlsl"
+
+[[vk::binding(0, 0)]] RWStructuredBuffer<InputTestValues> inputTestValues;
+[[vk::binding(1, 0)]] RWStructuredBuffer<TestValues> outputTestValues;
+
+[numthreads(256, 1, 1)]
+void main(uint3 invocationID : SV_DispatchThreadID)
+{
+    if (invocationID.x == 0)
+        outputTestValues[0].fillTestValues(inputTestValues[0]);
+}
diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp
index 8118ec939..f83c49b9e 100644
--- a/12_Mortons/main.cpp
+++ b/12_Mortons/main.cpp
@@ -1,242 +1,80 @@
-// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include <nabla.h>
+#include <iostream>
+#include <cstdio>
+#include <assert.h>
 
-
-// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 #include "app_resources/common.hlsl"
-#include <bitset>
-
-// Right now the test only checks that HLSL compiles the file
-constexpr bool TestHLSL = true;
+#include "Tester.h"
 
-using namespace nbl;
-using namespace core;
-using namespace system;
-using namespace asset;
-using namespace video;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::video;
+using namespace nbl::application_templates;
 
-// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms
-class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication
 {
-		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
-
-		inline core::smart_refctd_ptr<video::IGPUShader> createShader(
-			const char* includeMainName)
-		{
-			std::string prelude = "#include \"";
-			auto CPUShader = core::make_smart_refctd_ptr<ICPUShader>((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
-			assert(CPUShader);
-			return m_device->createShader(CPUShader.get());
-		}
-	public:
-		MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
-
-		// we stuff all our work here because its a "single shot" app
-		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-		{
-			// Remember to call the base class initialization!
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-			if (!asset_base_t::onAppInitialized(std::move(system)))
-				return false;
-			{
-				using namespace nbl::hlsl;
-
-				auto bar = morton::code<false, 21, 3, emulated_uint64_t>::create(hlsl::vector<uint32_t, 3>(893728, 7843, 98032));
-				auto foo = _static_cast<hlsl::vector<uint32_t, 3>>(bar);
-				std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl;
-				
-				//auto bar = morton::code<false, 21, 3, emulated_uint64_t>::create(hlsl::vector<uint32_t, 3>(893728, 7843, 98032));
-				//std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl;
-				//std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl;
-			}
-			/*
-
-			// ----------------------------------------------- CPP TESTS ----------------------------------------------------------------------
-			
-			// Coordinate extraction and whole vector decode tests
-			{
-				morton_t morton(vector_t(-1011, 765, 248));
-				unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011));
-
-				assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && morton.getCoordinate(2) == 248);
-				assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u);
-
-				assert(static_cast<vector_t>(morton) == vector_t(-1011, 765, 248) && static_cast<unsigned_vector_t>(unsignedMorton) == unsigned_vector_t(154, 789, 1011));
-			}
-
-			// ***********************************************************************************************************************************
-			// ************************************************* Arithmetic operator tests *******************************************************
-			// ***********************************************************************************************************************************
-			
-			//  ----------------------------------------------------------------------------------------------------
-			//  --------------------------------------- ADDITION ---------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// ---------------------------------------- Signed -----------------------------------------------------
-			
-			// No overflow
-			assert(static_cast<vector_t>(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448));
-			
-			// Type 1 overflow: Addition of representable coordinates goes out of range
-			assert(static_cast<vector_t>(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504));
-
-			// Type 2 overflow: Addition of irrepresentable range gives correct result
-			assert(static_cast<vector_t>(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224));
-
-			// ---------------------------------------- Unsigned -----------------------------------------------------
-
-			// No overflow
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763));
-
-			// Type 1 overflow: Addition of representable coordinates goes out of range
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519));
-
-			// Type 2 overflow: Addition of irrepresentable range gives correct result
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310));
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- SUBTRACTION -------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// ---------------------------------------- Signed -----------------------------------------------------
-
-			// No overflow
-			assert(static_cast<vector_t>(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465));
-
-			// Type 1 overflow: Subtraction of representable coordinates goes out of range
-			assert(static_cast<vector_t>(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504));
-
-			// Type 2 overflow: Subtraction of irrepresentable range gives correct result
-			assert(static_cast<vector_t>(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224));
-
-			// ---------------------------------------- Unsigned -----------------------------------------------------
-
-			// No overflow
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244));
-
-			// Type 1 overflow: Subtraction of representable coordinates goes out of range
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(430, 958, 567));
-
-			// Type 2 overflow: Subtraction of irrepresentable range gives correct result
-			assert(static_cast<unsigned_vector_t>(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485));
-
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- UNARY NEGATION ----------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// Only makes sense for signed
-			assert(static_cast<vector_t>(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475));
-
-			// ***********************************************************************************************************************************
-			// ************************************************* Comparison operator tests *******************************************************
-			// ***********************************************************************************************************************************
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- OPERATOR< ---------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// Signed
-			
-			// Same sign, negative
-			assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false));
-			// Same sign, positive
-			assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true));
-			// Differing signs
-			assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false));
-
-			// Unsigned
-			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false));
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- OPERATOR<= --------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// Signed
-
-			// Same sign, negative
-			assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false));
-			// Same sign, positive
-			assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true));
-			// Differing signs
-			assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false));
-
-			// Unsigned
-			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false));
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- OPERATOR> ---------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// Signed
-
-			// Same sign, negative
-			assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true));
-			// Same sign, positive
-			assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false));
-			// Differing signs
-			assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true));
-
-			// Unsigned
-			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true));
-
-			//  ----------------------------------------------------------------------------------------------------
-			//  -------------------------------------- OPERATOR>= --------------------------------------------------
-			//  ----------------------------------------------------------------------------------------------------
-
-			// Signed
-
-			// Same sign, negative
-			assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true));
-			// Same sign, positive
-			assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false));
-			// Differing signs
-			assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true));
-
-			// Unsigned
-			assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true));
-
-			*/
-
-			return true;
-		}
-
-		// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
-		void workLoopBody() override {}
-
-		// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
-		bool keepRunning() override {return false;}
-
-		// Cleanup
-		bool onAppTerminated() override
-		{
-			return device_base_t::onAppTerminated();
-		}
-
-	private:
-		smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
-
-		smart_refctd_ptr<nbl::video::IUtilities> m_utils;
-
-		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
-		smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
-
-		// These are Buffer Device Addresses
-		uint64_t m_downStreamingBufferAddress;
-		uint64_t m_deviceLocalBufferAddress;
-
-		uint32_t m_alignment;
-
-		smart_refctd_ptr<ISemaphore> m_timeline;
-		uint64_t semaphorValue = 0;
+    using device_base_t = MonoDeviceApplication;
+    using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
+public:
+    MortonTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+        IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {
+    }
+
+    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+    {
+        // Remember to call the base class initialization!
+        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+            return false;
+        if (!asset_base_t::onAppInitialized(std::move(system)))
+            return false;
+        {
+            
+        }
+        
+        Tester::PipelineSetupData pplnSetupData;
+        pplnSetupData.device = m_device;
+        pplnSetupData.api = m_api;
+        pplnSetupData.assetMgr = m_assetMgr;
+        pplnSetupData.logger = m_logger;
+        pplnSetupData.physicalDevice = m_physicalDevice;
+        pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex();
+        {
+            Tester mortonTester;
+            pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl";
+            mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
+            mortonTester.performTests();
+        }
+        
+
+        return true;
+    }
+
+    void onAppTerminated_impl() override
+    {
+        m_device->waitIdle();
+    }
+
+    void workLoopBody() override
+    {
+        m_keepRunning = false;
+    }
+
+    bool keepRunning() override
+    {
+        return m_keepRunning;
+    }
+
+
+private:
+    bool m_keepRunning = true;
 };
 
-
-NBL_MAIN_FUNC(MortonTestApp)
\ No newline at end of file
+NBL_MAIN_FUNC(MortonTest)
\ No newline at end of file
diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h
index a216fbf40..207cdee51 100644
--- a/22_CppCompat/ITester.h
+++ b/22_CppCompat/ITester.h
@@ -217,6 +217,7 @@ class ITester
         {
         case TestType::CPU:
             ss << "CPU TEST ERROR:\n";
+            break;
         case TestType::GPU:
             ss << "GPU TEST ERROR:\n";
         }

From c68c336317024ae80fb017b1cb71e6b32a152224 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Mon, 28 Apr 2025 15:16:34 -0300
Subject: [PATCH 08/57] Done with tests

---
 12_Mortons/CTester.h                          | 401 ++++++++++++++++++
 12_Mortons/{Tester.h => ITester.h}            | 133 +-----
 12_Mortons/app_resources/common.hlsl          | 299 ++-----------
 .../{mortonTest.comp.hlsl => test.comp.hlsl}  |   5 +-
 12_Mortons/app_resources/testCommon.hlsl      | 242 +++++++++++
 12_Mortons/main.cpp                           |  13 +-
 6 files changed, 691 insertions(+), 402 deletions(-)
 create mode 100644 12_Mortons/CTester.h
 rename 12_Mortons/{Tester.h => ITester.h} (66%)
 rename 12_Mortons/app_resources/{mortonTest.comp.hlsl => test.comp.hlsl} (79%)
 create mode 100644 12_Mortons/app_resources/testCommon.hlsl

diff --git a/12_Mortons/CTester.h b/12_Mortons/CTester.h
new file mode 100644
index 000000000..5a61be501
--- /dev/null
+++ b/12_Mortons/CTester.h
@@ -0,0 +1,401 @@
+#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
+
+#include <nabla.h>
+#include "app_resources/testCommon.hlsl"
+#include "nbl/application_templates/MonoDeviceApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "ITester.h"
+
+using namespace nbl;
+
+class CTester final : public ITester
+{
+public:
+    void performTests()
+    {
+        std::random_device rd;
+        std::mt19937 mt(rd());
+
+        std::uniform_int_distribution<uint16_t> shortDistribution(uint16_t(0), std::numeric_limits<uint16_t>::max());
+        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
+        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
+
+        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
+        for (int i = 0; i < Iterations; ++i)
+        {
+            // Set input thest values that will be used in both CPU and GPU tests
+            InputTestValues testInput;
+            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
+            TestValues expected;
+
+            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
+            testInput.shift = generatedShift;
+            {
+                uint64_t generatedA = longDistribution(mt);
+                uint64_t generatedB = longDistribution(mt);
+
+                testInput.generatedA = generatedA;
+                testInput.generatedB = generatedB;
+
+                expected.emulatedAnd = _static_cast<emulated_uint64_t>(generatedA & generatedB);
+                expected.emulatedOr = _static_cast<emulated_uint64_t>(generatedA | generatedB);
+                expected.emulatedXor = _static_cast<emulated_uint64_t>(generatedA ^ generatedB);
+                expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
+                expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
+                expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
+                expected.emulatedLess = uint32_t(generatedA < generatedB);
+                expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
+                expected.emulatedGreater = uint32_t(generatedA > generatedB);
+                expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB);
+
+                expected.emulatedLeftShifted = _static_cast<emulated_uint64_t>(generatedA << generatedShift);
+                expected.emulatedUnsignedRightShifted = _static_cast<emulated_uint64_t>(generatedA >> generatedShift);
+                expected.emulatedSignedRightShifted = _static_cast<emulated_int64_t>(static_cast<int64_t>(generatedA) >> generatedShift);
+            }
+            {
+                testInput.coordX = longDistribution(mt);
+                testInput.coordY = longDistribution(mt);
+                testInput.coordZ = longDistribution(mt);
+                testInput.coordW = longDistribution(mt);
+
+                uint64_t2 Vec2A = { testInput.coordX, testInput.coordY };
+                uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW };
+
+                uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 );
+                uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 );
+                uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2);
+                uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2);
+                uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2);
+                uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2);
+
+                uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ };
+                uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW };
+
+                uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3);
+                uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3);
+                uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3);
+                uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3);
+                uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3);
+                uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3);
+
+                uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW };
+                uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX };
+
+                uint16_t4 Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4);
+                uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4);
+                uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4);
+                uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4);
+                uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4);
+                uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4);
+
+                // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them
+                // so their highest bits are all 0s or 1s depending on the sign of the number they encode
+
+                int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
+                int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
+                int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
+                int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
+                int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
+                int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
+
+                int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
+                int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
+                int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
+                int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
+                int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
+                int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
+
+                int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
+                int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
+                int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
+                int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
+                int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
+                int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
+
+                // Plus
+                expected.mortonPlus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall + Vec2BSmall);
+                expected.mortonPlus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium + Vec2BMedium);
+                expected.mortonPlus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull + Vec2BFull);
+                expected.mortonPlus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull + Vec2BFull);
+
+                expected.mortonPlus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall + Vec3BSmall);
+                expected.mortonPlus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium + Vec3BMedium);
+                expected.mortonPlus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull + Vec3BFull);
+                expected.mortonPlus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull + Vec3BFull);
+
+                expected.mortonPlus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall + Vec4BSmall);
+                expected.mortonPlus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium + Vec4BMedium);
+                expected.mortonPlus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull + Vec4BFull);
+                expected.mortonPlus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull + Vec4BFull);
+
+                // Minus
+                expected.mortonMinus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall - Vec2BSmall);
+                expected.mortonMinus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium - Vec2BMedium);
+                expected.mortonMinus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull - Vec2BFull);
+                expected.mortonMinus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull - Vec2BFull);
+
+                expected.mortonMinus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall - Vec3BSmall);
+                expected.mortonMinus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium - Vec3BMedium);
+                expected.mortonMinus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull - Vec3BFull);
+                expected.mortonMinus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull - Vec3BFull);
+
+                expected.mortonMinus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall - Vec4BSmall);
+                expected.mortonMinus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium - Vec4BMedium);
+                expected.mortonMinus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull - Vec4BFull);
+                expected.mortonMinus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull - Vec4BFull);
+
+                // Coordinate-wise equality
+                expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall));
+                expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium));
+                expected.mortonEqual_full_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
+                expected.mortonEqual_emulated_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
+
+                expected.mortonEqual_small_3 = uint32_t3(glm::equal(Vec3ASmall, Vec3BSmall));
+                expected.mortonEqual_medium_3 = uint32_t3(glm::equal(Vec3AMedium, Vec3BMedium));
+                expected.mortonEqual_full_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
+                expected.mortonEqual_emulated_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
+
+                expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall));
+                expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium));
+                expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull));
+
+                // Coordinate-wise unsigned inequality (just testing with less)
+                expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall));
+                expected.mortonUnsignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2AMedium, Vec2BMedium));
+                expected.mortonUnsignedLess_full_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
+                expected.mortonUnsignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
+
+                expected.mortonUnsignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASmall, Vec3BSmall));
+                expected.mortonUnsignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3AMedium, Vec3BMedium));
+                expected.mortonUnsignedLess_full_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
+                expected.mortonUnsignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
+
+                expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall));
+                expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium));
+                expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
+
+                // Coordinate-wise signed inequality
+                expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall));
+                expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium));
+                expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
+
+                expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall));
+                expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium));
+                expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
+
+                expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall));
+                expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium));
+                expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
+
+                uint16_t castedShift = uint16_t(generatedShift);
+                // Left-shift
+                expected.mortonLeftShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
+                expected.mortonLeftShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
+                expected.mortonLeftShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+                expected.mortonLeftShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+
+                expected.mortonLeftShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
+                expected.mortonLeftShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
+                expected.mortonLeftShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+                expected.mortonLeftShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+
+                expected.mortonLeftShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
+                expected.mortonLeftShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
+                expected.mortonLeftShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+                expected.mortonLeftShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+
+                // Unsigned right-shift
+                expected.mortonUnsignedRightShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
+                expected.mortonUnsignedRightShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
+                expected.mortonUnsignedRightShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+                expected.mortonUnsignedRightShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2));
+
+                expected.mortonUnsignedRightShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall >> uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
+                expected.mortonUnsignedRightShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
+                expected.mortonUnsignedRightShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+                expected.mortonUnsignedRightShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3));
+
+                expected.mortonUnsignedRightShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
+                expected.mortonUnsignedRightShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
+                expected.mortonUnsignedRightShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+                expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4));
+            
+                // Signed right-shift
+                expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
+                expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
+                expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
+
+                expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
+                expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
+                expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
+
+                expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
+                expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
+                expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
+            }
+
+            performCpuTests(testInput, expected);
+            performGpuTests(testInput, expected);
+        }
+        m_logger->log("FIRST TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
+    }
+
+private:
+    inline static constexpr int Iterations = 100u;
+
+    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues cpuTestValues;
+
+        fillTestValues(commonTestInputValues, cpuTestValues);
+        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
+
+    }
+
+    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues gpuTestValues;
+        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
+        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
+    }
+
+    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
+    {
+        verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
+        verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
+        verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
+        verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType);
+        verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType);
+        verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType);
+        verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType);
+        verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType);
+        verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType);
+        verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType);
+        verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
+        verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
+        verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
+
+        // Morton Plus
+        verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType);
+        verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType);
+        verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType);
+        verifyTestValue("mortonPlus_emulated_2", expectedTestValues.mortonPlus_emulated_2, testValues.mortonPlus_emulated_2, testType);
+
+        verifyTestValue("mortonPlus_small_3", expectedTestValues.mortonPlus_small_3, testValues.mortonPlus_small_3, testType);
+        verifyTestValue("mortonPlus_medium_3", expectedTestValues.mortonPlus_medium_3, testValues.mortonPlus_medium_3, testType);
+        verifyTestValue("mortonPlus_full_3", expectedTestValues.mortonPlus_full_3, testValues.mortonPlus_full_3, testType);
+        verifyTestValue("mortonPlus_emulated_3", expectedTestValues.mortonPlus_emulated_3, testValues.mortonPlus_emulated_3, testType);
+
+        verifyTestValue("mortonPlus_small_4", expectedTestValues.mortonPlus_small_4, testValues.mortonPlus_small_4, testType);
+        verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType);
+        verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType);
+        verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType);
+
+        // Morton Minus
+        verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType);
+        verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType);
+        verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType);
+        verifyTestValue("mortonMinus_emulated_2", expectedTestValues.mortonMinus_emulated_2, testValues.mortonMinus_emulated_2, testType);
+
+        verifyTestValue("mortonMinus_small_3", expectedTestValues.mortonMinus_small_3, testValues.mortonMinus_small_3, testType);
+        verifyTestValue("mortonMinus_medium_3", expectedTestValues.mortonMinus_medium_3, testValues.mortonMinus_medium_3, testType);
+        verifyTestValue("mortonMinus_full_3", expectedTestValues.mortonMinus_full_3, testValues.mortonMinus_full_3, testType);
+        verifyTestValue("mortonMinus_emulated_3", expectedTestValues.mortonMinus_emulated_3, testValues.mortonMinus_emulated_3, testType);
+
+        verifyTestValue("mortonMinus_small_4", expectedTestValues.mortonMinus_small_4, testValues.mortonMinus_small_4, testType);
+        verifyTestValue("mortonMinus_medium_4", expectedTestValues.mortonMinus_medium_4, testValues.mortonMinus_medium_4, testType);
+        verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType);
+        verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType);
+
+        // Morton coordinate-wise equality
+        verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType);
+        verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType);
+        verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType);
+        verifyTestValue("mortonEqual_emulated_2", expectedTestValues.mortonEqual_emulated_2, testValues.mortonEqual_emulated_2, testType);
+
+        verifyTestValue("mortonEqual_small_3", expectedTestValues.mortonEqual_small_3, testValues.mortonEqual_small_3, testType);
+        verifyTestValue("mortonEqual_medium_3", expectedTestValues.mortonEqual_medium_3, testValues.mortonEqual_medium_3, testType);
+        verifyTestValue("mortonEqual_full_3", expectedTestValues.mortonEqual_full_3, testValues.mortonEqual_full_3, testType);
+        verifyTestValue("mortonEqual_emulated_3", expectedTestValues.mortonEqual_emulated_3, testValues.mortonEqual_emulated_3, testType);
+
+        verifyTestValue("mortonEqual_small_4", expectedTestValues.mortonEqual_small_4, testValues.mortonEqual_small_4, testType);
+        verifyTestValue("mortonEqual_medium_4", expectedTestValues.mortonEqual_medium_4, testValues.mortonEqual_medium_4, testType);
+        verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType);
+
+        // Morton coordinate-wise unsigned inequality
+        verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType);
+        verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType);
+        verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType);
+        verifyTestValue("mortonUnsignedLess_emulated_2", expectedTestValues.mortonUnsignedLess_emulated_2, testValues.mortonUnsignedLess_emulated_2, testType);
+
+        verifyTestValue("mortonUnsignedLess_small_3", expectedTestValues.mortonUnsignedLess_small_3, testValues.mortonUnsignedLess_small_3, testType);
+        verifyTestValue("mortonUnsignedLess_medium_3", expectedTestValues.mortonUnsignedLess_medium_3, testValues.mortonUnsignedLess_medium_3, testType);
+        verifyTestValue("mortonUnsignedLess_full_3", expectedTestValues.mortonUnsignedLess_full_3, testValues.mortonUnsignedLess_full_3, testType);
+        verifyTestValue("mortonUnsignedLess_emulated_3", expectedTestValues.mortonUnsignedLess_emulated_3, testValues.mortonUnsignedLess_emulated_3, testType);
+
+        verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType);
+        verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType);
+        verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType);
+
+        // Morton coordinate-wise signed inequality
+        verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
+        verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
+        verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
+
+        verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType);
+        verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType);
+        verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType);
+
+        verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType);
+        verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
+        verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
+
+        // Morton left-shift
+        verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
+        verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType);
+        verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType);
+        verifyTestValue("mortonLeftShift_emulated_2", expectedTestValues.mortonLeftShift_emulated_2, testValues.mortonLeftShift_emulated_2, testType);
+
+        verifyTestValue("mortonLeftShift_small_3", expectedTestValues.mortonLeftShift_small_3, testValues.mortonLeftShift_small_3, testType);
+        verifyTestValue("mortonLeftShift_medium_3", expectedTestValues.mortonLeftShift_medium_3, testValues.mortonLeftShift_medium_3, testType);
+        verifyTestValue("mortonLeftShift_full_3", expectedTestValues.mortonLeftShift_full_3, testValues.mortonLeftShift_full_3, testType);
+        verifyTestValue("mortonLeftShift_emulated_3", expectedTestValues.mortonLeftShift_emulated_3, testValues.mortonLeftShift_emulated_3, testType);
+
+        verifyTestValue("mortonLeftShift_small_4", expectedTestValues.mortonLeftShift_small_4, testValues.mortonLeftShift_small_4, testType);
+        verifyTestValue("mortonLeftShift_medium_4", expectedTestValues.mortonLeftShift_medium_4, testValues.mortonLeftShift_medium_4, testType);
+        verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType);
+        verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType);
+
+        // Morton unsigned right-shift
+        verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_2", expectedTestValues.mortonUnsignedRightShift_emulated_2, testValues.mortonUnsignedRightShift_emulated_2, testType);
+
+        verifyTestValue("mortonUnsignedRightShift_small_3", expectedTestValues.mortonUnsignedRightShift_small_3, testValues.mortonUnsignedRightShift_small_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_3", expectedTestValues.mortonUnsignedRightShift_medium_3, testValues.mortonUnsignedRightShift_medium_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_3", expectedTestValues.mortonUnsignedRightShift_full_3, testValues.mortonUnsignedRightShift_full_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_3", expectedTestValues.mortonUnsignedRightShift_emulated_3, testValues.mortonUnsignedRightShift_emulated_3, testType);
+
+        verifyTestValue("mortonUnsignedRightShift_small_4", expectedTestValues.mortonUnsignedRightShift_small_4, testValues.mortonUnsignedRightShift_small_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_4", expectedTestValues.mortonUnsignedRightShift_medium_4, testValues.mortonUnsignedRightShift_medium_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType);
+
+        // Morton signed right-shift
+        verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType);
+        verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType);
+        verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType);
+
+        verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType);
+        verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType);
+        verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType);
+
+        verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType);
+        verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType);
+        verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType);
+    }
+};
+
+#endif
\ No newline at end of file
diff --git a/12_Mortons/Tester.h b/12_Mortons/ITester.h
similarity index 66%
rename from 12_Mortons/Tester.h
rename to 12_Mortons/ITester.h
index 480328d18..2510dd997 100644
--- a/12_Mortons/Tester.h
+++ b/12_Mortons/ITester.h
@@ -1,5 +1,5 @@
-#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_
-#define _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_
+#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_
 
 #include <nabla.h>
 #include "app_resources/common.hlsl"
@@ -8,10 +8,10 @@
 
 using namespace nbl;
 
-class Tester
+class ITester 
 {
 public:
-    virtual ~Tester()
+    virtual ~ITester()
     {
         m_outputBufferAllocation.memory->unmap();
     };
@@ -128,7 +128,7 @@ class Tester
             if (!inputBuff)
                 logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
 
-            inputBuff->setObjectDebugName("morton input buffer");
+            inputBuff->setObjectDebugName("emulated_float64_t output buffer");
 
             video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs();
             reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
@@ -163,7 +163,7 @@ class Tester
             if (!outputBuff)
                 logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
 
-            outputBuff->setObjectDebugName("morton output buffer");
+            outputBuff->setObjectDebugName("emulated_float64_t output buffer");
 
             video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
             reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
@@ -211,29 +211,6 @@ class Tester
         if (expectedVal == testVal)
             return;
 
-        std::stringstream ss;
-        switch (testType)
-        {
-        case TestType::CPU:
-            ss << "CPU TEST ERROR:\n";
-        case TestType::GPU:
-            ss << "GPU TEST ERROR:\n";
-        }
-
-        ss << "nbl::hlsl::" << memberName << " produced incorrect output!" << '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n';
-
-        m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
-    }
-
-    template<typename T>
-    void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector<T, 3>& expectedVal, const nbl::hlsl::vector<T, 3>& testVal, const TestType testType)
-    {
-        static constexpr float MaxAllowedError = 0.1f;
-        if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError &&
-            std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError &&
-            std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError)
-            return;
-
         std::stringstream ss;
         switch (testType)
         {
@@ -244,69 +221,11 @@ class Tester
             ss << "GPU TEST ERROR:\n";
         }
 
-        ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " <<
-            testVal.x << ' ' << testVal.y << ' ' << testVal.z <<
-            " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n';
+        ss << "nbl::hlsl::" << memberName << " produced incorrect output!" << '\n';
 
         m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR);
     }
 
-    void performTests()
-    {
-        std::random_device rd;
-        std::mt19937 mt(rd());
-
-        std::uniform_int_distribution<uint16_t> shortDistribution(uint16_t(0), std::numeric_limits<uint16_t>::max());
-        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
-        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
-
-        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
-        for (int i = 0; i < Iterations; ++i)
-        {
-            // Set input thest values that will be used in both CPU and GPU tests
-            InputTestValues testInput;
-            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
-            TestValues expected;
-
-            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
-            testInput.shift = generatedShift;
-            {
-                uint64_t generatedA = longDistribution(mt);
-                uint64_t generatedB = longDistribution(mt);
-
-                testInput.generatedA = generatedA;
-                testInput.generatedB = generatedB;
-
-                expected.emulatedAnd = _static_cast<emulated_uint64_t>(generatedA & generatedB);
-                expected.emulatedOr = _static_cast<emulated_uint64_t>(generatedA | generatedB);
-                expected.emulatedXor = _static_cast<emulated_uint64_t>(generatedA ^ generatedB);
-                expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
-                expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
-                expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
-                expected.emulatedLess = uint32_t(generatedA < generatedB);
-                expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
-                expected.emulatedGreater = uint32_t(generatedA > generatedB);
-                expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB);
-
-                expected.emulatedLeftShifted = _static_cast<emulated_uint64_t>(generatedA << generatedShift);
-                expected.emulatedUnsignedRightShifted = _static_cast<emulated_uint64_t>(generatedA >> generatedShift);
-                expected.emulatedSignedRightShifted = _static_cast<emulated_int64_t>(static_cast<int64_t>(generatedA) >> generatedShift);
-            }
-            {
-                uint64_t coordX = longDistribution(mt);
-                uint64_t coordY = longDistribution(mt);
-                uint64_t coordZ = longDistribution(mt);
-                uint64_t coordW = longDistribution(mt);
-
-
-            }
-
-            performCpuTests(testInput, expected);
-            performGpuTests(testInput, expected);
-        }
-        m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
-    }
-
 protected:
     uint32_t m_queueFamily;
     core::smart_refctd_ptr<video::ILogicalDevice> m_device;
@@ -324,7 +243,7 @@ class Tester
     core::smart_refctd_ptr<video::ISemaphore> m_semaphore;
     video::IQueue* m_queue;
     uint64_t m_semaphoreCounter;
-
+    
     template<typename InputStruct, typename OutputStruct>
     OutputStruct dispatch(const InputStruct& input)
     {
@@ -375,42 +294,6 @@ class Tester
         m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward<Args>(args)...);
         exit(-1);
     }
-
-    inline static constexpr int Iterations = 100u;
-
-    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
-    {
-        TestValues cpuTestValues;
-        cpuTestValues.fillTestValues(commonTestInputValues);
-        verifyTestValues(expectedTestValues, cpuTestValues, TestType::CPU);
-
-    }
-
-    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
-    {
-        TestValues gpuTestValues;
-        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
-        verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU);
-    }
-
-    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType)
-    {
-        verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
-        verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
-        verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
-        verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType);
-        verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType);
-        verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType);
-        verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType);
-        verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType);
-        verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType);
-        verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType);
-        verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
-        verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
-        verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
-        
-        //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
-    }
 };
 
 #endif
\ No newline at end of file
diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl
index be6a2f4a0..b058ad821 100644
--- a/12_Mortons/app_resources/common.hlsl
+++ b/12_Mortons/app_resources/common.hlsl
@@ -5,10 +5,6 @@
 #ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_
 #define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_
 
-// because DXC doesn't properly support `_Static_assert`
-// TODO: add a message, and move to macros.h or cpp_compat
-#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; }
-
 #include <boost/preprocessor.hpp>
 
 #include <nbl/builtin/hlsl/morton.hlsl>
@@ -23,6 +19,22 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4;
 NBL_CONSTEXPR uint16_t mediumBits_4 = 8;
 NBL_CONSTEXPR uint16_t fullBits_4 = 16;
 
+#ifndef __HLSL_VERSION
+
+constexpr uint64_t smallBitsMask_2 = (uint64_t(1) << smallBits_2) - 1;
+constexpr uint64_t mediumBitsMask_2 = (uint64_t(1) << mediumBits_2) - 1;
+constexpr uint64_t fullBitsMask_2 = (uint64_t(1) << fullBits_2) - 1;
+
+constexpr uint64_t smallBitsMask_3 = (uint64_t(1) << smallBits_3) - 1;
+constexpr uint64_t mediumBitsMask_3 = (uint64_t(1) << mediumBits_3) - 1;
+constexpr uint64_t fullBitsMask_3 = (uint64_t(1) << fullBits_3) - 1;
+
+constexpr uint64_t smallBitsMask_4 = (uint64_t(1) << smallBits_4) - 1;
+constexpr uint64_t mediumBitsMask_4 = (uint64_t(1) << mediumBits_4) - 1;
+constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1;
+
+#endif
+
 using namespace nbl::hlsl;
 struct InputTestValues
 {
@@ -190,33 +202,9 @@ struct TestValues
 	morton::code<true, fullBits_4, 4>					  mortonSignedRightShift_full_4;
 	morton::code<true, fullBits_4, 4, emulated_uint64_t>  mortonSignedRightShift_emulated_4;
 
-	void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
+	/*
+	void fillSecondTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
 	{
-		emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
-		emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
-
-		// Emulated int tests
-		emulatedAnd = emulatedA & emulatedB;
-		emulatedOr = emulatedA | emulatedB;
-		emulatedXor = emulatedA ^ emulatedB;
-		emulatedNot = emulatedA.operator~();
-		emulatedPlus = emulatedA + emulatedB;
-		emulatedMinus = emulatedA - emulatedB;
-		emulatedLess = uint32_t(emulatedA < emulatedB);
-		emulatedLessEqual = uint32_t(emulatedA <= emulatedB);
-		emulatedGreater = uint32_t(emulatedA > emulatedB);
-		emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB);
-
-		left_shift_operator<emulated_uint64_t> leftShift;
-		emulatedLeftShifted = leftShift(emulatedA, input.shift);
-
-		arithmetic_right_shift_operator<emulated_uint64_t> unsignedRightShift;
-		emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
-
-		arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
-		emulatedSignedRightShifted = signedRightShift(_static_cast<emulated_int64_t>(emulatedA), input.shift);
-
-		// Morton tests
 		uint64_t2 Vec2A = { input.coordX, input.coordY };
 		uint64_t2 Vec2B = { input.coordZ, input.coordW };
 
@@ -235,250 +223,29 @@ struct TestValues
 		int64_t4 Vec4ASigned = int64_t4(Vec4A);
 		int64_t4 Vec4BSigned = int64_t4(Vec4B);
 
-		morton::code<false, smallBits_2, 2> morton_small_2A = morton::code<false, smallBits_2, 2>::create(Vec2A);
-		morton::code<false, mediumBits_2, 2> morton_medium_2A = morton::code<false, mediumBits_2, 2>::create(Vec2A);
-		morton::code<false, fullBits_2, 2> morton_full_2A = morton::code<false, fullBits_2, 2>::create(Vec2A);
-		morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2A);
-		morton::code<false, smallBits_2, 2> morton_small_2B = morton::code<false, smallBits_2, 2>::create(Vec2B);
-		morton::code<false, mediumBits_2, 2> morton_medium_2B = morton::code<false, mediumBits_2, 2>::create(Vec2B);
-		morton::code<false, fullBits_2, 2> morton_full_2B = morton::code<false, fullBits_2, 2>::create(Vec2B);
-		morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2B);
-
-		morton::code<false, smallBits_3, 3> morton_small_3A = morton::code<false, smallBits_3, 3>::create(Vec3A);
-		morton::code<false, mediumBits_3, 3> morton_medium_3A = morton::code<false, mediumBits_3, 3>::create(Vec3A);
-		morton::code<false, fullBits_3, 3> morton_full_3A = morton::code<false, fullBits_3, 3>::create(Vec3A);
-		morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3A);
-		morton::code<false, smallBits_3, 3> morton_small_3B = morton::code<false, smallBits_3, 3>::create(Vec3B);
-		morton::code<false, mediumBits_3, 3> morton_medium_3B = morton::code<false, mediumBits_3, 3>::create(Vec3B);
-		morton::code<false, fullBits_3, 3> morton_full_3B = morton::code<false, fullBits_3, 3>::create(Vec3B);
-		morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3B);
-
-		morton::code<false, smallBits_4, 4> morton_small_4A = morton::code<false, smallBits_4, 4>::create(Vec4A);
-		morton::code<false, mediumBits_4, 4> morton_medium_4A = morton::code<false, mediumBits_4, 4>::create(Vec4A);
-		morton::code<false, fullBits_4, 4> morton_full_4A = morton::code<false, fullBits_4, 4>::create(Vec4A);
 		morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4A);
-		morton::code<false, smallBits_4, 4> morton_small_4B = morton::code<false, smallBits_4, 4>::create(Vec4B);
-		morton::code<false, mediumBits_4, 4> morton_medium_4B = morton::code<false, mediumBits_4, 4>::create(Vec4B);
-		morton::code<false, fullBits_4, 4> morton_full_4B = morton::code<false, fullBits_4, 4>::create(Vec4B);
-		morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4B);
-
-		morton::code<true, smallBits_2, 2> morton_small_2ASigned = morton::code<true, smallBits_2, 2>::create(Vec2ASigned);
-		morton::code<true, mediumBits_2, 2> morton_medium_2ASigned = morton::code<true, mediumBits_2, 2>::create(Vec2ASigned);
-		morton::code<true, fullBits_2, 2> morton_full_2ASigned = morton::code<true, fullBits_2, 2>::create(Vec2ASigned);
-		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2ASigned = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2ASigned);
-		morton::code<true, smallBits_2, 2> morton_small_2BSigned = morton::code<true, smallBits_2, 2>::create(Vec2BSigned);
-		morton::code<true, mediumBits_2, 2> morton_medium_2BSigned = morton::code<true, mediumBits_2, 2>::create(Vec2BSigned);
-		morton::code<true, fullBits_2, 2> morton_full_2BSigned = morton::code<true, fullBits_2, 2>::create(Vec2BSigned);
-		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2BSigned = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2BSigned);
-
-		morton::code<true, smallBits_3, 3> morton_small_3ASigned = morton::code<true, smallBits_3, 3>::create(Vec3ASigned);
-		morton::code<true, mediumBits_3, 3> morton_medium_3ASigned = morton::code<true, mediumBits_3, 3>::create(Vec3ASigned);
-		morton::code<true, fullBits_3, 3> morton_full_3ASigned = morton::code<true, fullBits_3, 3>::create(Vec3ASigned);
-		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3ASigned = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3ASigned);
-		morton::code<true, smallBits_3, 3> morton_small_3BSigned = morton::code<true, smallBits_3, 3>::create(Vec3BSigned);
-		morton::code<true, mediumBits_3, 3> morton_medium_3BSigned = morton::code<true, mediumBits_3, 3>::create(Vec3BSigned);
-		morton::code<true, fullBits_3, 3> morton_full_3BSigned = morton::code<true, fullBits_3, 3>::create(Vec3BSigned);
-		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3BSigned = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3BSigned);
-
-		morton::code<true, smallBits_4, 4> morton_small_4ASigned = morton::code<true, smallBits_4, 4>::create(Vec4ASigned);
-		morton::code<true, mediumBits_4, 4> morton_medium_4ASigned = morton::code<true, mediumBits_4, 4>::create(Vec4ASigned);
-		morton::code<true, fullBits_4, 4> morton_full_4ASigned = morton::code<true, fullBits_4, 4>::create(Vec4ASigned);
-		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4ASigned = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4ASigned);
-		morton::code<true, smallBits_4, 4> morton_small_4BSigned = morton::code<true, smallBits_4, 4>::create(Vec4BSigned);
-		morton::code<true, mediumBits_4, 4> morton_medium_4BSigned = morton::code<true, mediumBits_4, 4>::create(Vec4BSigned);
-		morton::code<true, fullBits_4, 4> morton_full_4BSigned = morton::code<true, fullBits_4, 4>::create(Vec4BSigned);
-		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4BSigned = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4BSigned);
-
-		/*
-		left_shift_operator<portable_vector_t<emulated_uint64_t, 4> > leftShiftTemp;
-		portable_vector_t<emulated_uint64_t, 4> interleaved = _static_cast<portable_vector_t<emulated_uint64_t, 4> >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>;
-		
-		#define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\
-        {\
-            interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\
-            interleaved = interleaved & _static_cast<emulated_uint64_t>(morton::impl::coding_mask<4, fullBits_4, I>::value);\
-        }
-		
-		ENCODE_LOOP_ITERATION(4)
-		ENCODE_LOOP_ITERATION(3)
-		ENCODE_LOOP_ITERATION(2)
-		ENCODE_LOOP_ITERATION(1)
-		ENCODE_LOOP_ITERATION(0)
-
-		#undef ENCODE_LOOP_ITERATION
-		// After interleaving, shift each coordinate left by their index
-		return leftShiftTemp(interleaved, truncate<vector<uint16_t, Dim> >(vector<uint16_t, 4>(0, 1, 2, 3)));
-		
-		
-		array_get<portable_vector_t<emulated_uint64_t, 4>, emulated_uint64_t> getter;
-		emulatedAnd = getter(interleaved, 0);
-		*/
-		
-		// Plus
-		mortonPlus_small_2 = morton_small_2A + morton_small_2B;
-		mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
-		mortonPlus_full_2 = morton_full_2A + morton_full_2B;
-		mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B;
-		
-		mortonPlus_small_3 = morton_small_3A + morton_small_3B;
-		mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B;
-		mortonPlus_full_3 = morton_full_3A + morton_full_3B;
-		mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B;
-
-		mortonPlus_small_4 = morton_small_4A + morton_small_4B;
-		mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B;
-		mortonPlus_full_4 = morton_full_4A + morton_full_4B;
-		mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
-		
-		// Minus
-		mortonMinus_small_2 = morton_small_2A - morton_small_2B;
-		mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
-		mortonMinus_full_2 = morton_full_2A - morton_full_2B;
-		mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B;
-
-		mortonMinus_small_3 = morton_small_3A - morton_small_3B;
-		mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B;
-		mortonMinus_full_3 = morton_full_3A - morton_full_3B;
-		mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B;
-
-		mortonMinus_small_4 = morton_small_4A - morton_small_4B;
-		mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B;
-		mortonMinus_full_4 = morton_full_4A - morton_full_4B;
-		mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
-
-		// Coordinate-wise equality
-		mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
-		mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
-		mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
-		mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
-
-		mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
-		mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
-		mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
-		mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
-
-		mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
-		mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
-		mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
-		mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
-		
-		// Coordinate-wise unsigned inequality (just testing with less)
-		mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
-		mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
-		mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
-		mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
-		
-		mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
-		mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
-		mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
-		mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
-		
-		mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
-		mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
-		mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
-		mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
+		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2_signed = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2ASigned);
+		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3_signed = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3ASigned);
+		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4ASigned);
+
+		output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
 		
-		// Coordinate-wise signed inequality
-		mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan<false>(int16_t2(Vec2BSigned)));
-		mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan<false>(int16_t2(Vec2BSigned)));
-		mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan<false>(int32_t2(Vec2BSigned)));
-		//mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan<false>(int32_t2(Vec2BSigned)));
-
-		mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan<false>(int16_t3(Vec3BSigned)));
-		mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan<false>(int16_t3(Vec3BSigned)));
-		mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan<false>(int32_t3(Vec3BSigned)));
-		//mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan<false>(int32_t3(Vec3BSigned)));
-
-		mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
-		mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
-		mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
-		//mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan<false>(int16_t4(Vec4BSigned)));
+		output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
 		
-		// Left-shift
+		mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(int32_t2(Vec2BSigned))); 
+		mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(int32_t3(Vec3BSigned))); 
+		mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(int16_t4(Vec4BSigned))); 
+
 		uint16_t castedShift = uint16_t(input.shift);
-		left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
-		mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift);
-		left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
-		mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_2, 2> > leftShiftFull2;
-		mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > leftShiftEmulated2;
-		mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift);
-
-		left_shift_operator<morton::code<false, smallBits_3, 3> > leftShiftSmall3;
-		mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift);
-		left_shift_operator<morton::code<false, mediumBits_3, 3> > leftShiftMedium3;
-		mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_3, 3> > leftShiftFull3;
-		mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > leftShiftEmulated3;
-		mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift);
-
-		left_shift_operator<morton::code<false, smallBits_4, 4> > leftShiftSmall4;
-		mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift);
-		left_shift_operator<morton::code<false, mediumBits_4, 4> > leftShiftMedium4;
-		mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_4, 4> > leftShiftFull4;
-		mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift);
-		left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
-		mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift);
-		
-		// Unsigned right-shift
-		arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
-		mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
-		mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2> > rightShiftFull2;
-		mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > rightShiftEmulated2;
-		mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift);
-
-		arithmetic_right_shift_operator<morton::code<false, smallBits_3, 3> > rightShiftSmall3;
-		mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, mediumBits_3, 3> > rightShiftMedium3;
-		mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3> > rightShiftFull3;
-		mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > rightShiftEmulated3;
-		mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift);
-
-		arithmetic_right_shift_operator<morton::code<false, smallBits_4, 4> > rightShiftSmall4;
-		mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, mediumBits_4, 4> > rightShiftMedium4;
-		mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4> > rightShiftFull4;
-		mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift);
-		arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
-		mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift);
-
-		// Signed right-shift
-		arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
-		mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
-		mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
-		mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, castedShift);
+
 		arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
-		//mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift);
-
-		arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
-		mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, mediumBits_3, 3> > rightShiftSignedMedium3;
-		mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
-		mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift);
+		mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); 
 		arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
-		//mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift);
-
-		arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
-		mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, mediumBits_4, 4> > rightShiftSignedMedium4;
-		mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4ASigned, castedShift);
-		arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
-		mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift);
+		mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); 
 		arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
-		//mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift);
+		mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); 
 	}
+	*/
 };
 
 #endif
diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/test.comp.hlsl
similarity index 79%
rename from 12_Mortons/app_resources/mortonTest.comp.hlsl
rename to 12_Mortons/app_resources/test.comp.hlsl
index 7041568b8..243983d5a 100644
--- a/12_Mortons/app_resources/mortonTest.comp.hlsl
+++ b/12_Mortons/app_resources/test.comp.hlsl
@@ -1,9 +1,8 @@
 //// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
 //// This file is part of the "Nabla Engine".
 //// For conditions of distribution and use, see copyright notice in nabla.h
-#pragma shader_stage(compute)
 
-#include "common.hlsl"
+#include "testCommon.hlsl"
 
 [[vk::binding(0, 0)]] RWStructuredBuffer<InputTestValues> inputTestValues;
 [[vk::binding(1, 0)]] RWStructuredBuffer<TestValues> outputTestValues;
@@ -12,5 +11,5 @@
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
     if (invocationID.x == 0)
-        outputTestValues[0].fillTestValues(inputTestValues[0]);
+        fillTestValues(inputTestValues[0], outputTestValues[0]);
 }
diff --git a/12_Mortons/app_resources/testCommon.hlsl b/12_Mortons/app_resources/testCommon.hlsl
new file mode 100644
index 000000000..9ff9a4fa8
--- /dev/null
+++ b/12_Mortons/app_resources/testCommon.hlsl
@@ -0,0 +1,242 @@
+#include "common.hlsl"
+
+void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output)
+{
+	emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
+	emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
+
+	// Emulated int tests
+	output.emulatedAnd = emulatedA & emulatedB;
+	output.emulatedOr = emulatedA | emulatedB;
+	output.emulatedXor = emulatedA ^ emulatedB;
+	output.emulatedNot = emulatedA.operator~();
+	output.emulatedPlus = emulatedA + emulatedB;
+	output.emulatedMinus = emulatedA - emulatedB;
+	output.emulatedLess = uint32_t(emulatedA < emulatedB);
+	output.emulatedLessEqual = uint32_t(emulatedA <= emulatedB);
+	output.emulatedGreater = uint32_t(emulatedA > emulatedB);
+	output.emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB);
+
+	left_shift_operator<emulated_uint64_t> leftShift;
+	output.emulatedLeftShifted = leftShift(emulatedA, input.shift);
+
+	arithmetic_right_shift_operator<emulated_uint64_t> unsignedRightShift;
+	output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
+
+	arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
+	output.emulatedSignedRightShifted = signedRightShift(_static_cast<emulated_int64_t>(emulatedA), input.shift);
+
+	// Morton tests
+	uint64_t2 Vec2A = { input.coordX, input.coordY };
+	uint64_t2 Vec2B = { input.coordZ, input.coordW };
+
+	uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
+	uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
+
+	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
+	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
+
+	int64_t2 Vec2ASigned = int64_t2(Vec2A);
+	int64_t2 Vec2BSigned = int64_t2(Vec2B);
+
+	int64_t3 Vec3ASigned = int64_t3(Vec3A);
+	int64_t3 Vec3BSigned = int64_t3(Vec3B);
+
+	int64_t4 Vec4ASigned = int64_t4(Vec4A);
+	int64_t4 Vec4BSigned = int64_t4(Vec4B);
+
+	morton::code<false, smallBits_2, 2> morton_small_2A = morton::code<false, smallBits_2, 2>::create(Vec2A);
+	morton::code<false, mediumBits_2, 2> morton_medium_2A = morton::code<false, mediumBits_2, 2>::create(Vec2A);
+	morton::code<false, fullBits_2, 2> morton_full_2A = morton::code<false, fullBits_2, 2>::create(Vec2A);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2A);
+	morton::code<false, smallBits_2, 2> morton_small_2B = morton::code<false, smallBits_2, 2>::create(Vec2B);
+	morton::code<false, mediumBits_2, 2> morton_medium_2B = morton::code<false, mediumBits_2, 2>::create(Vec2B);
+	morton::code<false, fullBits_2, 2> morton_full_2B = morton::code<false, fullBits_2, 2>::create(Vec2B);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2B);
+
+	morton::code<false, smallBits_3, 3> morton_small_3A = morton::code<false, smallBits_3, 3>::create(Vec3A);
+	morton::code<false, mediumBits_3, 3> morton_medium_3A = morton::code<false, mediumBits_3, 3>::create(Vec3A);
+	morton::code<false, fullBits_3, 3> morton_full_3A = morton::code<false, fullBits_3, 3>::create(Vec3A);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3A);
+	morton::code<false, smallBits_3, 3> morton_small_3B = morton::code<false, smallBits_3, 3>::create(Vec3B);
+	morton::code<false, mediumBits_3, 3> morton_medium_3B = morton::code<false, mediumBits_3, 3>::create(Vec3B);
+	morton::code<false, fullBits_3, 3> morton_full_3B = morton::code<false, fullBits_3, 3>::create(Vec3B);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3B);
+
+	morton::code<false, smallBits_4, 4> morton_small_4A = morton::code<false, smallBits_4, 4>::create(Vec4A);
+	morton::code<false, mediumBits_4, 4> morton_medium_4A = morton::code<false, mediumBits_4, 4>::create(Vec4A);
+	morton::code<false, fullBits_4, 4> morton_full_4A = morton::code<false, fullBits_4, 4>::create(Vec4A);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4A);
+	morton::code<false, smallBits_4, 4> morton_small_4B = morton::code<false, smallBits_4, 4>::create(Vec4B);
+	morton::code<false, mediumBits_4, 4> morton_medium_4B = morton::code<false, mediumBits_4, 4>::create(Vec4B);
+	morton::code<false, fullBits_4, 4> morton_full_4B = morton::code<false, fullBits_4, 4>::create(Vec4B);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4B);
+
+	morton::code<true, smallBits_2, 2> morton_small_2_signed = morton::code<true, smallBits_2, 2>::create(Vec2ASigned);
+	morton::code<true, mediumBits_2, 2> morton_medium_2_signed = morton::code<true, mediumBits_2, 2>::create(Vec2ASigned);
+	morton::code<true, fullBits_2, 2> morton_full_2_signed = morton::code<true, fullBits_2, 2>::create(Vec2ASigned);
+
+	morton::code<true, smallBits_3, 3> morton_small_3_signed = morton::code<true, smallBits_3, 3>::create(Vec3ASigned);
+	morton::code<true, mediumBits_3, 3> morton_medium_3_signed = morton::code<true, mediumBits_3, 3>::create(Vec3ASigned);
+	morton::code<true, fullBits_3, 3> morton_full_3_signed = morton::code<true, fullBits_3, 3>::create(Vec3ASigned);
+
+	morton::code<true, smallBits_4, 4> morton_small_4_signed = morton::code<true, smallBits_4, 4>::create(Vec4ASigned);
+	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = morton::code<true, mediumBits_4, 4>::create(Vec4ASigned);
+	morton::code<true, fullBits_4, 4> morton_full_4_signed = morton::code<true, fullBits_4, 4>::create(Vec4ASigned);
+
+	// Plus
+	output.mortonPlus_small_2 = morton_small_2A + morton_small_2B;
+	output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
+	output.mortonPlus_full_2 = morton_full_2A + morton_full_2B;
+	output.mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B;
+
+	output.mortonPlus_small_3 = morton_small_3A + morton_small_3B;
+	output.mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B;
+	output.mortonPlus_full_3 = morton_full_3A + morton_full_3B;
+	output.mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B;
+
+	output.mortonPlus_small_4 = morton_small_4A + morton_small_4B;
+	output.mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B;
+	output.mortonPlus_full_4 = morton_full_4A + morton_full_4B;
+	output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
+	
+	// Minus
+	output.mortonMinus_small_2 = morton_small_2A - morton_small_2B;
+	output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
+	output.mortonMinus_full_2 = morton_full_2A - morton_full_2B;
+	output.mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B;
+
+	output.mortonMinus_small_3 = morton_small_3A - morton_small_3B;
+	output.mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B;
+	output.mortonMinus_full_3 = morton_full_3A - morton_full_3B;
+	output.mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B;
+
+	output.mortonMinus_small_4 = morton_small_4A - morton_small_4B;
+	output.mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B;
+	output.mortonMinus_full_4 = morton_full_4A - morton_full_4B;
+	output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
+	
+	// Coordinate-wise equality
+	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
+	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
+	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
+	output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
+
+	output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
+	output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
+	output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
+	output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
+
+	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
+	
+	// Coordinate-wise unsigned inequality (just testing with less)
+	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
+	output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
+	output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
+	output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
+
+	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
+	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
+	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
+	output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
+
+	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
+	
+	// Coordinate-wise signed inequality
+	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(int16_t2(Vec2BSigned)));
+	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(int16_t2(Vec2BSigned)));
+	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(int32_t2(Vec2BSigned)));
+
+	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(int16_t3(Vec3BSigned)));
+	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(int16_t3(Vec3BSigned)));
+	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(int32_t3(Vec3BSigned)));
+
+	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
+	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
+	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
+	
+	// Cast to uint16_t which is what left shift for Mortons expect
+	uint16_t castedShift = uint16_t(input.shift);
+	// Each left shift clamps to correct bits so the result kinda makes sense
+	// Left-shift
+	left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
+	output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2);
+	left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
+	output.mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
+	left_shift_operator<morton::code<false, fullBits_2, 2> > leftShiftFull2;
+	output.mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift % fullBits_2);
+	left_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > leftShiftEmulated2;
+	output.mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
+
+	left_shift_operator<morton::code<false, smallBits_3, 3> > leftShiftSmall3;
+	output.mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift % smallBits_3);
+	left_shift_operator<morton::code<false, mediumBits_3, 3> > leftShiftMedium3;
+	output.mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
+	left_shift_operator<morton::code<false, fullBits_3, 3> > leftShiftFull3;
+	output.mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift % fullBits_3);
+	left_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > leftShiftEmulated3;
+	output.mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
+
+	left_shift_operator<morton::code<false, smallBits_4, 4> > leftShiftSmall4;
+	output.mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift % smallBits_4);
+	left_shift_operator<morton::code<false, mediumBits_4, 4> > leftShiftMedium4;
+	output.mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
+	left_shift_operator<morton::code<false, fullBits_4, 4> > leftShiftFull4;
+	output.mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift % fullBits_4);
+	left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
+	output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
+	
+	// Unsigned right-shift
+	arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
+	output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
+	output.mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2> > rightShiftFull2;
+	output.mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift % fullBits_2);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > rightShiftEmulated2;
+	output.mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
+
+	arithmetic_right_shift_operator<morton::code<false, smallBits_3, 3> > rightShiftSmall3;
+	output.mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift % smallBits_3);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_3, 3> > rightShiftMedium3;
+	output.mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3> > rightShiftFull3;
+	output.mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift % fullBits_3);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > rightShiftEmulated3;
+	output.mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
+
+	arithmetic_right_shift_operator<morton::code<false, smallBits_4, 4> > rightShiftSmall4;
+	output.mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift % smallBits_4);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_4, 4> > rightShiftMedium4;
+	output.mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4> > rightShiftFull4;
+	output.mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift % fullBits_4);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
+	output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
+	
+	// Signed right-shift
+	arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
+	output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
+	output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
+	output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2);
+
+	arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
+	output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_3, 3> > rightShiftSignedMedium3;
+	output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
+	output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3);
+
+	arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
+	output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_4, 4> > rightShiftSignedMedium4;
+	output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
+	output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4);
+}
\ No newline at end of file
diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp
index f83c49b9e..18fd067ec 100644
--- a/12_Mortons/main.cpp
+++ b/12_Mortons/main.cpp
@@ -10,7 +10,7 @@
 #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 #include "app_resources/common.hlsl"
-#include "Tester.h"
+#include "CTester.h"
 
 using namespace nbl::core;
 using namespace nbl::hlsl;
@@ -35,24 +35,21 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn
             return false;
         if (!asset_base_t::onAppInitialized(std::move(system)))
             return false;
-        {
-            
-        }
         
-        Tester::PipelineSetupData pplnSetupData;
+        CTester::PipelineSetupData pplnSetupData;
         pplnSetupData.device = m_device;
         pplnSetupData.api = m_api;
         pplnSetupData.assetMgr = m_assetMgr;
         pplnSetupData.logger = m_logger;
         pplnSetupData.physicalDevice = m_physicalDevice;
         pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex();
+        // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator
         {
-            Tester mortonTester;
-            pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl";
+            CTester mortonTester;
+            pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";
             mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
             mortonTester.performTests();
         }
-        
 
         return true;
     }

From f05dec4652d1af3fa1a4664760efb1f3e934134a Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Mon, 28 Apr 2025 15:29:40 -0300
Subject: [PATCH 09/57] Clarifying comment for blocker issue

---
 12_Mortons/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp
index 18fd067ec..a05e61842 100644
--- a/12_Mortons/main.cpp
+++ b/12_Mortons/main.cpp
@@ -44,6 +44,7 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn
         pplnSetupData.physicalDevice = m_physicalDevice;
         pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex();
         // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator
+        // Blocked by https://github.com/KhronosGroup/SPIRV-Tools/issues/6104
         {
             CTester mortonTester;
             pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";

From 8a8f958d179cc32afa227c30f60f4ada0d4369b8 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Wed, 22 Oct 2025 16:15:50 +0200
Subject: [PATCH 10/57] Enabled build time shader compilation in example 05

---
 .../CMakeLists.txt                            | 46 ++++++++++++++++++-
 .../main.cpp                                  |  5 +-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
index a434ff32a..3c6054992 100644
--- a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
+++ b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
@@ -21,4 +21,48 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+	app_resources/common.hlsl
+	app_resources/shader.comp.hlsl
+)
+
+set(JSON [=[
+[
+    {
+		"INPUT": "app_resources/shader.comp.hlsl",
+		"KEY": "shader",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+	TARGET ${EXECUTABLE_NAME}SPIRV
+	LINK_TO ${EXECUTABLE_NAME}
+	DEPENDS ${DEPENDS}
+	BINARY_DIR ${OUTPUT_DIRECTORY}
+	MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+	COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+	OUTPUT_VAR KEYS
+	INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+	NAMESPACE nbl::this_example::builtin::build
+	INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+	NAMESPACE nbl::this_example::builtin::build
+	TARGET ${EXECUTABLE_NAME}_builtinsBuild
+	LINK_TO ${EXECUTABLE_NAME}
+	BIND ${OUTPUT_DIRECTORY}
+	BUILTINS ${KEYS}
+	COMMON_OPTIONS ${COMPILE_OPTIONS}
+)
\ No newline at end of file
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index b82dc18ca..131c7506a 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -6,6 +6,7 @@
 // I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/examples/common/BuiltinResourcesApplication.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 
 using namespace nbl;
@@ -96,7 +97,9 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
 				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp);
+
+				auto key = "app_resources/" + nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 				const auto assets = assetBundle.getContents();
 				if (assets.empty())
 					return logFail("Could not load shader!");

From f1a3ee5921b5fea3f275b67344e722066a901da8 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Wed, 22 Oct 2025 17:38:08 +0200
Subject: [PATCH 11/57] Fix

---
 .../app_resources/shader.comp.hlsl                           | 5 +----
 05_StreamingAndBufferDeviceAddressApp/main.cpp               | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
index af38ffada..31c60aefd 100644
--- a/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
+++ b/05_StreamingAndBufferDeviceAddressApp/app_resources/shader.comp.hlsl
@@ -1,12 +1,9 @@
 #include "common.hlsl"
 
-// just a small test
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
 [[vk::push_constant]] PushConstantData pushConstants;
 
 // does absolutely nothing, a later example will show how it gets used
-template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
+template<typename capability_traits=DeviceConfigCaps>
 void dummyTraitTest() {}
 
 [numthreads(WorkgroupSize,1,1)]
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index 131c7506a..495890c6d 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -96,9 +96,9 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
+				lp.workingDirectory = "app_resources"; // virtual root
 
-				auto key = "app_resources/" + nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
+				auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
 				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 				const auto assets = assetBundle.getContents();
 				if (assets.empty())

From e301db5db00ec1a77d4e231037bd05d7f23adbc7 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Wed, 22 Oct 2025 18:02:33 +0200
Subject: [PATCH 12/57] Updated source file generation of the
 05_streamingandbufferdeviceaddressappSPIRV project

---
 05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
index 3c6054992..1dcceed5d 100644
--- a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
+++ b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
@@ -28,6 +28,8 @@ set(DEPENDS
 	app_resources/common.hlsl
 	app_resources/shader.comp.hlsl
 )
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
 set(JSON [=[
 [

From f85ae8045c13380ace4c124d8a07349b4fd5fb62 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Thu, 23 Oct 2025 23:16:25 +0200
Subject: [PATCH 13/57] Enabled build time shader compilation in multiple
 examples

---
 03_DeviceSelectionAndSharedSources/main.cpp   |  2 +-
 .../main.cpp                                  |  4 +-
 07_StagingAndMultipleQueues/CMakeLists.txt    | 48 ++++++++++++-
 07_StagingAndMultipleQueues/main.cpp          | 44 +++++++-----
 11_FFT/main.cpp                               |  1 -
 24_ColorSpaceTest/CMakeLists.txt              | 46 +++++++++++++
 24_ColorSpaceTest/main.cpp                    | 21 +++---
 62_CAD/CMakeLists.txt                         | 68 ++++++++++++++++++-
 62_CAD/main.cpp                               | 14 ++--
 62_CAD/shaders/globals.hlsl                   |  6 --
 .../shaders/main_pipeline/vertex_shader.hlsl  | 10 +--
 64_EmulatedFloatTest/CMakeLists.txt           | 54 ++++++++++++++-
 .../benchmark/benchmark.comp.hlsl             |  1 +
 .../app_resources/test.comp.hlsl              |  1 +
 64_EmulatedFloatTest/main.cpp                 | 52 ++++----------
 67_RayQueryGeometry/CMakeLists.txt            | 48 ++++++++++++-
 .../app_resources/render.comp.hlsl            |  2 -
 67_RayQueryGeometry/main.cpp                  | 12 ++--
 18 files changed, 334 insertions(+), 100 deletions(-)

diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index b8fd3d18b..bcc849a4d 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -257,7 +257,7 @@ class DeviceSelectionAndSharedSourcesApp final : public application_templates::M
 		}
 
 		const auto* metadata = assetBundle.getMetadata();
-    const auto hlslMetadata = static_cast<const CHLSLMetadata*>(metadata);
+		const auto hlslMetadata = static_cast<const CHLSLMetadata*>(metadata);
 		const auto shaderStage = hlslMetadata->shaderStages->front();
 
 		// It would be super weird if loading a shader from a file produced more than 1 asset
diff --git a/05_StreamingAndBufferDeviceAddressApp/main.cpp b/05_StreamingAndBufferDeviceAddressApp/main.cpp
index 495890c6d..ab0984a07 100644
--- a/05_StreamingAndBufferDeviceAddressApp/main.cpp
+++ b/05_StreamingAndBufferDeviceAddressApp/main.cpp
@@ -104,9 +104,7 @@ class StreamingAndBufferDeviceAddressApp final : public application_templates::M
 				if (assets.empty())
 					return logFail("Could not load shader!");
 
-				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				const auto shaderSource = IAsset::castDown<IShader>(assets[0]);
-				shader = m_device->compileShader({shaderSource.get()});
+				shader = IAsset::castDown<IShader>(assets[0]);
 				// The down-cast should not fail!
 				assert(shader);
 			}
diff --git a/07_StagingAndMultipleQueues/CMakeLists.txt b/07_StagingAndMultipleQueues/CMakeLists.txt
index a434ff32a..cc4ecd465 100644
--- a/07_StagingAndMultipleQueues/CMakeLists.txt
+++ b/07_StagingAndMultipleQueues/CMakeLists.txt
@@ -21,4 +21,50 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/common.hlsl
+    app_resources/comp_shader.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/comp_shader.hlsl",
+        "KEY": "comp_shader",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+)
\ No newline at end of file
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index fc6bf4551..70455eb96 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -4,6 +4,7 @@
 
 // I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
 #include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 using namespace nbl;
 using namespace nbl::core;
@@ -189,7 +190,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 		for (uint32_t imageIdx = 0; imageIdx < IMAGE_CNT; ++imageIdx)
 		{
 			const auto imagePathToLoad = imagesToLoad[imageIdx];
-			auto cpuImage = loadFistAssetInBundle<ICPUImage>(imagePathToLoad);
+			auto cpuImage = loadImageAsset(imagePathToLoad);
 			if (!cpuImage)
 				logFailAndTerminate("Failed to load image from path %s",ILogger::ELL_ERROR,imagePathToLoad);
 
@@ -279,17 +280,10 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 		}
 
 		// LOAD SHADER FROM FILE
-		smart_refctd_ptr<IShader> source;
-		{
-			source = loadFistAssetInBundle<IShader>("../app_resources/comp_shader.hlsl");
-		}
+		smart_refctd_ptr<IShader> shader = loadPreCompiledShader<"comp_shader">("../app_resources/comp_shader.hlsl");
 
-		if (!source)
-			logFailAndTerminate("Could not create a CPU shader!");
-
-		core::smart_refctd_ptr<IShader> shader = m_device->compileShader({ source.get() });
-		if(!shader)
-			logFailAndTerminate("Could not compile shader to spirv!");
+		if (!shader)
+			logFailAndTerminate("Could not load the precompiled shader!");
 
 		// CREATE COMPUTE PIPELINE
 		SPushConstantRange pc[1];
@@ -534,21 +528,39 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 
 		return false;
 	}
-
-	template<typename AssetType>
-	core::smart_refctd_ptr<AssetType> loadFistAssetInBundle(const std::string& path)
+	
+	core::smart_refctd_ptr<ICPUImage> loadImageAsset(const std::string& path)
 	{
 		IAssetLoader::SAssetLoadParams lp;
 		SAssetBundle bundle = m_assetMgr->getAsset(path, lp);
 		if (bundle.getContents().empty())
-			logFailAndTerminate("Couldn't load an asset.",ILogger::ELL_ERROR);
+			logFailAndTerminate("Couldn't load an image.",ILogger::ELL_ERROR);
 
-		auto asset = IAsset::castDown<AssetType>(bundle.getContents()[0]);
+		auto asset = IAsset::castDown<ICPUImage>(bundle.getContents()[0]);
 		if (!asset)
 			logFailAndTerminate("Incorrect asset loaded.",ILogger::ELL_ERROR);
 
 		return asset;
 	}
+
+	template<core::StringLiteral ShaderKey>
+	core::smart_refctd_ptr<IShader> loadPreCompiledShader(const std::string& path)
+	{
+		IAssetLoader::SAssetLoadParams lp;
+		lp.logger = m_logger.get();
+		lp.workingDirectory = "app_resources";
+
+		auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+		SAssetBundle bundle = m_assetMgr->getAsset(key.data(), lp);
+		if (bundle.getContents().empty())
+			logFailAndTerminate("Couldn't load a shader.", ILogger::ELL_ERROR);
+
+		auto asset = IAsset::castDown<IShader>(bundle.getContents()[0]);
+		if (!asset)
+			logFailAndTerminate("Incorrect asset loaded.", ILogger::ELL_ERROR);
+
+		return asset;
+	}
 };
 
 NBL_MAIN_FUNC(StagingAndMultipleQueuesApp)
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index 1886da72a..2be25d92b 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -2,7 +2,6 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
diff --git a/24_ColorSpaceTest/CMakeLists.txt b/24_ColorSpaceTest/CMakeLists.txt
index 026add505..fcf8faa36 100644
--- a/24_ColorSpaceTest/CMakeLists.txt
+++ b/24_ColorSpaceTest/CMakeLists.txt
@@ -32,4 +32,50 @@ add_test(NAME NBL_IMAGE_HASH_RUN_TESTS
 	COMMAND "$<TARGET_FILE:${EXECUTABLE_NAME}>" --test hash
 	WORKING_DIRECTORY "$<TARGET_FILE_DIR:${EXECUTABLE_NAME}>"
 	COMMAND_EXPAND_LISTS
+)
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/present.frag.hlsl
+    app_resources/push_constants.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/present.frag.hlsl",
+        "KEY": "present",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
 )
\ No newline at end of file
diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index 84c55ef3a..e8858f5a6 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -1,6 +1,7 @@
 ﻿// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 #include "nbl/examples/examples.hpp"
 
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
@@ -160,26 +161,24 @@ class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public B
 					return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
 				// Load Custom Shader
-				auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
+				auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>(const std::string& relPath) -> smart_refctd_ptr<IShader>
 					{
 						IAssetLoader::SAssetLoadParams lp = {};
 						lp.logger = m_logger.get();
-						lp.workingDirectory = ""; // virtual root
-						auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+						lp.workingDirectory = "app_resources";
+
+						auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+						auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 						const auto assets = assetBundle.getContents();
 						if (assets.empty())
 							return nullptr;
 
-						// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-						auto source = IAsset::castDown<IShader>(assets[0]);
-						if (!source)
-							return nullptr;
-
-						return m_device->compileShader({ source.get() });
+						auto shader = IAsset::castDown<IShader>(assets[0]);
+						return shader;
 					};
-				auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
+				auto fragmentShader = loadPrecompiledShader.operator()<"present">("app_resources/present.frag.hlsl");
 				if (!fragmentShader)
-					return logFail("Failed to Load and Compile Fragment Shader!");
+					return logFail("Failed to load precompiled fragment shader!");
 
 				// Now surface indep resources
 				m_semaphore = m_device->createSemaphore(m_submitIx);
diff --git a/62_CAD/CMakeLists.txt b/62_CAD/CMakeLists.txt
index c3a0fa47e..c193dc63c 100644
--- a/62_CAD/CMakeLists.txt
+++ b/62_CAD/CMakeLists.txt
@@ -61,4 +61,70 @@ else()
 	foreach(NBL_TARGET IN LISTS NBL_MSDFGEN_TARGETS)
 		target_include_directories(${EXECUTABLE_NAME} PUBLIC $<TARGET_PROPERTY:${NBL_TARGET},INCLUDE_DIRECTORIES>)
 	endforeach()
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+	shaders/globals.hlsl
+	shaders/runtimeDeviceConfigCaps.hlsl
+	shaders/main_pipeline/common.hlsl
+	shaders/main_pipeline/dtm.hlsl
+	shaders/main_pipeline/fragment.hlsl
+	shaders/main_pipeline/fragment_shader.hlsl
+	shaders/main_pipeline/fragment_shader_debug.hlsl
+	shaders/main_pipeline/line_style.hlsl
+	shaders/main_pipeline/resolve_alphas.hlsl
+	shaders/main_pipeline/vertex_shader.hlsl
+)
+
+set(SM 6_8)
+
+set(REQUIRED_CAPS [=[
+    {
+        "kind": "features",
+        "name": "fragmentShaderPixelInterlock",
+        "type": "bool",
+        "values": [1]
+    }
+]=])
+
+set(JSON [=[
+[
+	{
+		"INPUT": "shaders/main_pipeline/vertex_shader.hlsl",
+		"KEY": "main_pipeline_vertex_shader",
+    "COMPILE_OPTIONS": ["-T", "cs_6_8"],
+		"DEPENDS": [],
+		"CAPS": [${REQUIRED_CAPS}]
+    },
+    {
+		"INPUT": "shaders/main_pipeline/fragment.hlsl",
+		"KEY": "main_pipeline_fragment_shader",
+    "COMPILE_OPTIONS": ["-T", "cs_6_8"],
+		"DEPENDS": [],
+		"CAPS": [${REQUIRED_CAPS}]
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
\ No newline at end of file
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index f4a886791..ec7b177eb 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -1,5 +1,5 @@
 ﻿// TODO: Copyright notice
-
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 #include "nbl/examples/examples.hpp"
 
@@ -961,12 +961,14 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 			}
 
 			// Load Custom Shader
-			auto loadCompileShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
+			auto loadCompileShader = [&]<core::StringLiteral ShaderKey>(const std::string& relPath) -> smart_refctd_ptr<IShader>
 				{
 					IAssetLoader::SAssetLoadParams lp = {};
 					lp.logger = m_logger.get();
-					lp.workingDirectory = ""; // virtual root
-					auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+					lp.workingDirectory = "shaders";
+
+					auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+					auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 					const auto assets = assetBundle.getContents();
 					if (assets.empty())
 						return nullptr;
@@ -979,8 +981,8 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 					return m_device->compileShader( ILogicalDevice::SShaderCreationParameters { .source = source.get(), .readCache = shaderReadCache.get(), .writeCache = shaderWriteCache.get(), .stage = IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY });
 				};
 
-			mainPipelineFragmentShaders = loadCompileShader("../shaders/main_pipeline/fragment.hlsl");
-			mainPipelineVertexShader = loadCompileShader("../shaders/main_pipeline/vertex_shader.hlsl");
+			mainPipelineFragmentShaders = loadCompileShader.operator()<"main_pipeline_fragment_shader">("../shaders/main_pipeline/fragment.hlsl");
+			mainPipelineVertexShader = loadCompileShader.operator() <"main_pipeline_vertex_shader"> ("../shaders/main_pipeline/vertex_shader.hlsl");
 			
 			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 			{
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 5c3681910..bad6e6132 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -1,12 +1,6 @@
 #ifndef _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 #define _CAD_EXAMPLE_GLOBALS_HLSL_INCLUDED_
 
-#ifdef __HLSL_VERSION
-#ifndef NBL_USE_SPIRV_BUILTINS
-#include "runtimeDeviceConfigCaps.hlsl" // defines DeviceConfigCaps, uses JIT device caps
-#endif
-#endif
-
 // TODO[Erfan]: Turn off in the future, but keep enabled to test
 // #define NBL_FORCE_EMULATED_FLOAT_64
 
diff --git a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
index 90394e935..df566f002 100644
--- a/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+++ b/62_CAD/shaders/main_pipeline/vertex_shader.hlsl
@@ -706,19 +706,19 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
 
             if (corner.x == 0.0f && corner.y == 0.0f)
             {
-                dilationVector.x = ieee754::flipSign(dilationVector.x);
+                dilationVector.x = ieee754::flipSign(dilationVector.x, true);
                 uvOffset.x = -uvOffset.x;
                 uvOffset.y = -uvOffset.y;
             }
             else if (corner.x == 0.0f && corner.y == 1.0f)
             {
-                dilationVector.x = ieee754::flipSign(dilationVector.x);
-                dilationVector.y = ieee754::flipSign(dilationVector.y);
+                dilationVector.x = ieee754::flipSign(dilationVector.x, true);
+                dilationVector.y = ieee754::flipSign(dilationVector.y, true);
                 uvOffset.x = -uvOffset.x;
             }
             else if (corner.x == 1.0f && corner.y == 1.0f)
             {
-                dilationVector.y = ieee754::flipSign(dilationVector.y);
+                dilationVector.y = ieee754::flipSign(dilationVector.y, true);
             }
             else if (corner.x == 1.0f && corner.y == 0.0f)
             {
@@ -730,7 +730,7 @@ PSInput vtxMain(uint vertexID : SV_VertexID)
 
             pfloat64_t2 worldSpaceExtentsYAxisFlipped;
             worldSpaceExtentsYAxisFlipped.x = worldSpaceExtents.x;
-            worldSpaceExtentsYAxisFlipped.y = ieee754::flipSign(worldSpaceExtents.y);
+            worldSpaceExtentsYAxisFlipped.y = ieee754::flipSign(worldSpaceExtents.y, true);
             const pfloat64_t2 vtxPos = topLeft + worldSpaceExtentsYAxisFlipped * _static_cast<pfloat64_t2>(corner);
             const pfloat64_t2 dilatedVtxPos = vtxPos + dilationVector;
 
diff --git a/64_EmulatedFloatTest/CMakeLists.txt b/64_EmulatedFloatTest/CMakeLists.txt
index aae93590d..1b272bf2e 100644
--- a/64_EmulatedFloatTest/CMakeLists.txt
+++ b/64_EmulatedFloatTest/CMakeLists.txt
@@ -27,4 +27,56 @@ if(MSVC)
   target_compile_options("${EXECUTABLE_NAME}" PUBLIC "/fp:strict")
 else()
   target_compile_options("${EXECUTABLE_NAME}" PUBLIC -ffloat-store -frounding-math -fsignaling-nans -ftrapping-math)
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+	app_resources/common.hlsl
+    app_resources/test.comp.hlsl
+	app_resources/benchmark/benchmark.comp.hlsl
+	app_resources/benchmark/common.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/test.comp.hlsl",
+        "KEY": "test",
+    },
+	{
+        "INPUT": "app_resources/benchmark/benchmark.comp.hlsl",
+        "KEY": "benchmark",
+    },
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+)
\ No newline at end of file
diff --git a/64_EmulatedFloatTest/app_resources/benchmark/benchmark.comp.hlsl b/64_EmulatedFloatTest/app_resources/benchmark/benchmark.comp.hlsl
index b31da3737..a515f6bcb 100644
--- a/64_EmulatedFloatTest/app_resources/benchmark/benchmark.comp.hlsl
+++ b/64_EmulatedFloatTest/app_resources/benchmark/benchmark.comp.hlsl
@@ -66,6 +66,7 @@ uint64_t calcIntegral()
 }
 
 [numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)]
+[shader("compute")]
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
 	static const uint32_t NativeToEmulatedRatio = 6;
diff --git a/64_EmulatedFloatTest/app_resources/test.comp.hlsl b/64_EmulatedFloatTest/app_resources/test.comp.hlsl
index 7681e80a5..e95eadd49 100644
--- a/64_EmulatedFloatTest/app_resources/test.comp.hlsl
+++ b/64_EmulatedFloatTest/app_resources/test.comp.hlsl
@@ -12,6 +12,7 @@
 PushConstants pc;
 
 [numthreads(WORKGROUP_SIZE, 1, 1)]
+[shader("compute")]
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
     const nbl::hlsl::emulated_float64_t<false, true> a = nbl::hlsl::bit_cast<emulated_float64_t<false, true> >(pc.a);
diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index 3fc635e87..a4f177f16 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 #include "nbl/examples/examples.hpp"
 
@@ -262,9 +262,10 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
                 {
                     IAssetLoader::SAssetLoadParams lp = {};
                     lp.logger = base.m_logger.get();
-                    lp.workingDirectory = ""; // virtual root
-                    // this time we load a shader directly from a file
-                    auto assetBundle = base.m_assetMgr->getAsset("app_resources/test.comp.hlsl", lp);
+                    lp.workingDirectory = "app_resources"; // virtual root
+
+                    auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get());
+                    auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp);
                     const auto assets = assetBundle.getContents();
                     if (assets.empty())
                     {
@@ -274,26 +275,11 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
 
                     // It would be super weird if loading a shader from a file produced more than 1 asset
                     assert(assets.size() == 1);
-                    smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
-
-                    auto* compilerSet = base.m_assetMgr->getCompilerSet();
-
-                    nbl::asset::IShaderCompiler::SCompilerOptions options = {};
-                    options.stage = ESS_COMPUTE;
-                    options.preprocessorOptions.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
-                    options.spirvOptimizer = nullptr;
-                    options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
-                    options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
-                    options.preprocessorOptions.logger = base.m_logger.get();
-                    options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
-
-                    auto spirv = compilerSet->compileToSPIRV(source.get(), options);
-
-                    shader = base.m_device->compileShader({spirv.get()});
+                    shader = IAsset::castDown<IShader>(assets[0]);
                 }
 
                 if (!shader)
-                    base.logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
+                    base.logFail("Failed to load precompiled \"test\" shader!\n");
 
                 nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
                     {
@@ -928,9 +914,10 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
                 {
                     IAssetLoader::SAssetLoadParams lp = {};
                     lp.logger = base.m_logger.get();
-                    lp.workingDirectory = ""; // virtual root
+                    lp.workingDirectory = "app_resources"; // virtual root
                     // this time we load a shader directly from a file
-                    auto assetBundle = base.m_assetMgr->getAsset("app_resources/benchmark/benchmark.comp.hlsl", lp);
+                    auto key = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get());
+                    auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp);
                     const auto assets = assetBundle.getContents();
                     if (assets.empty())
                     {
@@ -940,26 +927,11 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
 
                     // It would be super weird if loading a shader from a file produced more than 1 asset
                     assert(assets.size() == 1);
-                    smart_refctd_ptr<IShader> source = IAsset::castDown<IShader>(assets[0]);
-
-                    auto* compilerSet = base.m_assetMgr->getCompilerSet();
-
-                    IShaderCompiler::SCompilerOptions options = {};
-                    options.stage = ESS_COMPUTE;
-                    options.preprocessorOptions.targetSpirvVersion = base.m_device->getPhysicalDevice()->getLimits().spirvVersion;
-                    options.spirvOptimizer = nullptr;
-                    options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
-                    options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
-                    options.preprocessorOptions.logger = base.m_logger.get();
-                    options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
-
-                    auto spirv = compilerSet->compileToSPIRV(source.get(), options);
-
-                    shader = base.m_device->compileShader({spirv.get()});
+                    shader = IAsset::castDown<IShader>(assets[0]);
                 }
 
                 if (!shader)
-                    base.logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
+                    base.logFail("Failed to load precompiled \"benchmark\" shader!\n");
 
                 nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
                     {
diff --git a/67_RayQueryGeometry/CMakeLists.txt b/67_RayQueryGeometry/CMakeLists.txt
index d26a90205..40f32624a 100644
--- a/67_RayQueryGeometry/CMakeLists.txt
+++ b/67_RayQueryGeometry/CMakeLists.txt
@@ -25,4 +25,50 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/common.hlsl
+    app_resources/render.comp.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/render.comp.hlsl",
+        "KEY": "render",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+)
\ No newline at end of file
diff --git a/67_RayQueryGeometry/app_resources/render.comp.hlsl b/67_RayQueryGeometry/app_resources/render.comp.hlsl
index 954598c9a..889e1f38b 100644
--- a/67_RayQueryGeometry/app_resources/render.comp.hlsl
+++ b/67_RayQueryGeometry/app_resources/render.comp.hlsl
@@ -1,7 +1,5 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
-
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 #include "nbl/builtin/hlsl/bda/__ptr.hlsl"
diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 2783385f2..b35000485 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -2,6 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include "common.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 class RayQueryGeometryApp final : public SimpleWindowedApplication, public BuiltinResourcesApplication
 {
@@ -150,8 +151,10 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 				const std::string shaderPath = "app_resources/render.comp.hlsl";
 				IAssetLoader::SAssetLoadParams lparams = {};
 				lparams.logger = m_logger.get();
-				lparams.workingDirectory = "";
-				auto bundle = m_assetMgr->getAsset(shaderPath, lparams);
+				lparams.workingDirectory = "app_resources";
+
+				auto key = nbl::this_example::builtin::build::get_spirv_key<"render">(m_device.get());
+				auto bundle = m_assetMgr->getAsset(key.data(), lparams);
 				if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
 				{
 					m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, shaderPath);
@@ -160,10 +163,9 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 
 				const auto assets = bundle.getContents();
 				assert(assets.size() == 1);
-				smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
-				auto shader = m_device->compileShader({shaderSrc.get()});
+				smart_refctd_ptr<IShader> shader = IAsset::castDown<IShader>(assets[0]);
 				if (!shader)
-					return logFail("Failed to create shader!");
+					return logFail("Failed to load precompiled shader!");
 
 				SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0u, .size = sizeof(SPushConstants)};
 				auto pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr<const IGPUDescriptorSetLayout>(renderDs->getLayout()), nullptr, nullptr, nullptr);

From 22f2a17401e8e70dddff477e11db12ebd1dea2bd Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Mon, 27 Oct 2025 15:51:37 +0100
Subject: [PATCH 14/57] Fixed project creation of multiple examples

---
 .../CMakeLists.txt                            |  32 ++--
 07_StagingAndMultipleQueues/CMakeLists.txt    |   4 +-
 07_StagingAndMultipleQueues/main.cpp          |   4 +-
 11_FFT/CMakeLists.txt                         |  48 +++++-
 11_FFT/main.cpp                               |  32 ++--
 24_ColorSpaceTest/CMakeLists.txt              |   4 +-
 24_ColorSpaceTest/main.cpp                    |  30 ++--
 62_CAD/CMakeLists.txt                         |  27 ++--
 62_CAD/main.cpp                               |  42 ++---
 64_EmulatedFloatTest/CMakeLists.txt           |   4 +-
 67_RayQueryGeometry/CMakeLists.txt            |   4 +-
 70_FLIPFluids/CMakeLists.txt                  |  99 +++++++++++-
 .../app_resources/compute/diffusion.comp.hlsl |   3 +
 .../compute/pressureSolver.comp.hlsl          |   2 +
 .../compute/updateFluidCells.comp.hlsl        |   1 +
 70_FLIPFluids/main.cpp                        | 148 +++++++-----------
 16 files changed, 298 insertions(+), 186 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
index 1dcceed5d..a342ac3d5 100644
--- a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
+++ b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
@@ -31,6 +31,7 @@ set(DEPENDS
 target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
 set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
+set(SM 6_8)
 set(JSON [=[
 [
     {
@@ -48,23 +49,22 @@ set(COMPILE_OPTIONS
 )
 
 NBL_CREATE_NSC_COMPILE_RULES(
-	TARGET ${EXECUTABLE_NAME}SPIRV
-	LINK_TO ${EXECUTABLE_NAME}
-	DEPENDS ${DEPENDS}
-	BINARY_DIR ${OUTPUT_DIRECTORY}
-	MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-	COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
-	OUTPUT_VAR KEYS
-	INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
-	NAMESPACE nbl::this_example::builtin::build
-	INPUTS ${JSON}
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
 )
 
 NBL_CREATE_RESOURCE_ARCHIVE(
-	NAMESPACE nbl::this_example::builtin::build
-	TARGET ${EXECUTABLE_NAME}_builtinsBuild
-	LINK_TO ${EXECUTABLE_NAME}
-	BIND ${OUTPUT_DIRECTORY}
-	BUILTINS ${KEYS}
-	COMMON_OPTIONS ${COMPILE_OPTIONS}
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
 )
\ No newline at end of file
diff --git a/07_StagingAndMultipleQueues/CMakeLists.txt b/07_StagingAndMultipleQueues/CMakeLists.txt
index cc4ecd465..19515454d 100644
--- a/07_StagingAndMultipleQueues/CMakeLists.txt
+++ b/07_StagingAndMultipleQueues/CMakeLists.txt
@@ -31,6 +31,7 @@ set(DEPENDS
 target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
 set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
+set(SM 6_8)
 set(JSON [=[
 [
     {
@@ -53,7 +54,7 @@ NBL_CREATE_NSC_COMPILE_RULES(
     DEPENDS ${DEPENDS}
     BINARY_DIR ${OUTPUT_DIRECTORY}
     MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
     OUTPUT_VAR KEYS
     INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
     NAMESPACE nbl::this_example::builtin::build
@@ -66,5 +67,4 @@ NBL_CREATE_RESOURCE_ARCHIVE(
     LINK_TO ${EXECUTABLE_NAME}
     BIND ${OUTPUT_DIRECTORY}
     BUILTINS ${KEYS}
-    COMMON_OPTIONS ${COMPILE_OPTIONS}
 )
\ No newline at end of file
diff --git a/07_StagingAndMultipleQueues/main.cpp b/07_StagingAndMultipleQueues/main.cpp
index 70455eb96..a850c1c47 100644
--- a/07_StagingAndMultipleQueues/main.cpp
+++ b/07_StagingAndMultipleQueues/main.cpp
@@ -280,7 +280,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 		}
 
 		// LOAD SHADER FROM FILE
-		smart_refctd_ptr<IShader> shader = loadPreCompiledShader<"comp_shader">("../app_resources/comp_shader.hlsl");
+		smart_refctd_ptr<IShader> shader = loadPreCompiledShader<"comp_shader">(); // "../app_resources/comp_shader.hlsl"
 
 		if (!shader)
 			logFailAndTerminate("Could not load the precompiled shader!");
@@ -544,7 +544,7 @@ class StagingAndMultipleQueuesApp final : public application_templates::BasicMul
 	}
 
 	template<core::StringLiteral ShaderKey>
-	core::smart_refctd_ptr<IShader> loadPreCompiledShader(const std::string& path)
+	core::smart_refctd_ptr<IShader> loadPreCompiledShader()
 	{
 		IAssetLoader::SAssetLoadParams lp;
 		lp.logger = m_logger.get();
diff --git a/11_FFT/CMakeLists.txt b/11_FFT/CMakeLists.txt
index a434ff32a..9a2ee5a21 100644
--- a/11_FFT/CMakeLists.txt
+++ b/11_FFT/CMakeLists.txt
@@ -21,4 +21,50 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/common.hlsl
+    app_resources/shader.comp.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(SM 6_8)
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/shader.comp.hlsl",
+        "KEY": "shader",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
\ No newline at end of file
diff --git a/11_FFT/main.cpp b/11_FFT/main.cpp
index 2be25d92b..49d157a38 100644
--- a/11_FFT/main.cpp
+++ b/11_FFT/main.cpp
@@ -2,6 +2,8 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
 #include "nbl/examples/examples.hpp"
 
 using namespace nbl;
@@ -44,15 +46,6 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 	smart_refctd_ptr<ISemaphore> m_timeline;
 	uint64_t semaphorValue = 0;
 
-	inline core::smart_refctd_ptr<asset::IShader> createShader(
-		const char* includeMainName)
-	{
-		std::string prelude = "#include \"";
-		auto hlslShader = core::make_smart_refctd_ptr<IShader>((prelude + includeMainName + "\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
-		assert(hlslShader);
-		return m_device->compileShader({ hlslShader.get() });
-	}
-
 public:
 	// Yay thanks to multiple inheritance we cannot forward ctors anymore
 	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
@@ -67,28 +60,23 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
-		// this time we load a shader directly from a file
 		smart_refctd_ptr<IShader> shader;
-		/* {
+		{
 			IAssetLoader::SAssetLoadParams lp = {};
 			lp.logger = m_logger.get();
-			lp.workingDirectory = ""; // virtual root
-			auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl", lp);
+			lp.workingDirectory = "app_resources"; // virtual root
+			auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
+			auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 			const auto assets = assetBundle.getContents();
 			if (assets.empty())
 				return logFail("Could not load shader!");
 
 			// Cast down the asset to its proper type
-			auto source = IAsset::castDown<IShader>(assets[0]);
-			// The down-cast should not fail!
-			assert(source);
-
-			// Compile directly to SPIR-V Shader
-			shader = m_device->compileShader({ source.get() });
+			shader = IAsset::castDown<IShader>(assets[0]);
+			
 			if (!shader)
-				return logFail("Creation of a SPIR-V Shader from HLSL Shader source failed!");
-		}*/
-		shader = createShader("app_resources/shader.comp.hlsl");
+				return logFail("Invalid shader!");
+		}
 
 		// Create massive upload/download buffers
 		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;
diff --git a/24_ColorSpaceTest/CMakeLists.txt b/24_ColorSpaceTest/CMakeLists.txt
index fcf8faa36..a2c5e752b 100644
--- a/24_ColorSpaceTest/CMakeLists.txt
+++ b/24_ColorSpaceTest/CMakeLists.txt
@@ -42,6 +42,7 @@ set(DEPENDS
 target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
 set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
+set(SM 6_8)
 set(JSON [=[
 [
     {
@@ -64,7 +65,7 @@ NBL_CREATE_NSC_COMPILE_RULES(
     DEPENDS ${DEPENDS}
     BINARY_DIR ${OUTPUT_DIRECTORY}
     MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
     OUTPUT_VAR KEYS
     INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
     NAMESPACE nbl::this_example::builtin::build
@@ -77,5 +78,4 @@ NBL_CREATE_RESOURCE_ARCHIVE(
     LINK_TO ${EXECUTABLE_NAME}
     BIND ${OUTPUT_DIRECTORY}
     BUILTINS ${KEYS}
-    COMMON_OPTIONS ${COMPILE_OPTIONS}
 )
\ No newline at end of file
diff --git a/24_ColorSpaceTest/main.cpp b/24_ColorSpaceTest/main.cpp
index e8858f5a6..750756321 100644
--- a/24_ColorSpaceTest/main.cpp
+++ b/24_ColorSpaceTest/main.cpp
@@ -161,22 +161,22 @@ class ColorSpaceTestSampleApp final : public SimpleWindowedApplication, public B
 					return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
 
 				// Load Custom Shader
-				auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>(const std::string& relPath) -> smart_refctd_ptr<IShader>
-					{
-						IAssetLoader::SAssetLoadParams lp = {};
-						lp.logger = m_logger.get();
-						lp.workingDirectory = "app_resources";
-
-						auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
-						auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
-						const auto assets = assetBundle.getContents();
-						if (assets.empty())
-							return nullptr;
+				auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.logger = m_logger.get();
+					lp.workingDirectory = "app_resources";
+
+					auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+					auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+						return nullptr;
 
-						auto shader = IAsset::castDown<IShader>(assets[0]);
-						return shader;
-					};
-				auto fragmentShader = loadPrecompiledShader.operator()<"present">("app_resources/present.frag.hlsl");
+					auto shader = IAsset::castDown<IShader>(assets[0]);
+					return shader;
+				};
+				auto fragmentShader = loadPrecompiledShader.operator()<"present">(); // "app_resources/present.frag.hlsl"
 				if (!fragmentShader)
 					return logFail("Failed to load precompiled fragment shader!");
 
diff --git a/62_CAD/CMakeLists.txt b/62_CAD/CMakeLists.txt
index c193dc63c..dd181ff87 100644
--- a/62_CAD/CMakeLists.txt
+++ b/62_CAD/CMakeLists.txt
@@ -76,16 +76,17 @@ set(DEPENDS
 	shaders/main_pipeline/resolve_alphas.hlsl
 	shaders/main_pipeline/vertex_shader.hlsl
 )
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
 set(SM 6_8)
-
 set(REQUIRED_CAPS [=[
-    {
-        "kind": "features",
-        "name": "fragmentShaderPixelInterlock",
-        "type": "bool",
-        "values": [1]
-    }
+{
+  "kind": "features",
+  "name": "fragmentShaderPixelInterlock",
+  "type": "bool",
+  "values": [1]
+}
 ]=])
 
 set(JSON [=[
@@ -93,28 +94,30 @@ set(JSON [=[
 	{
 		"INPUT": "shaders/main_pipeline/vertex_shader.hlsl",
 		"KEY": "main_pipeline_vertex_shader",
-    "COMPILE_OPTIONS": ["-T", "cs_6_8"],
-		"DEPENDS": [],
 		"CAPS": [${REQUIRED_CAPS}]
     },
     {
 		"INPUT": "shaders/main_pipeline/fragment.hlsl",
 		"KEY": "main_pipeline_fragment_shader",
-    "COMPILE_OPTIONS": ["-T", "cs_6_8"],
-		"DEPENDS": [],
 		"CAPS": [${REQUIRED_CAPS}]
     }
 ]
 ]=])
 string(CONFIGURE "${JSON}" JSON)
 
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
 NBL_CREATE_NSC_COMPILE_RULES(
     TARGET ${EXECUTABLE_NAME}SPIRV
     LINK_TO ${EXECUTABLE_NAME}
     DEPENDS ${DEPENDS}
     BINARY_DIR ${OUTPUT_DIRECTORY}
     MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
     OUTPUT_VAR KEYS
     INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
     NAMESPACE nbl::this_example::builtin::build
diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index ec7b177eb..15ee597ec 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -961,28 +961,28 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 			}
 
 			// Load Custom Shader
-			auto loadCompileShader = [&]<core::StringLiteral ShaderKey>(const std::string& relPath) -> smart_refctd_ptr<IShader>
-				{
-					IAssetLoader::SAssetLoadParams lp = {};
-					lp.logger = m_logger.get();
-					lp.workingDirectory = "shaders";
-
-					auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
-					auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
-					const auto assets = assetBundle.getContents();
-					if (assets.empty())
-						return nullptr;
-
-					// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-					auto source = IAsset::castDown<IShader>(assets[0]);
-					if (!source)
-						return nullptr;
-	
-					return m_device->compileShader( ILogicalDevice::SShaderCreationParameters { .source = source.get(), .readCache = shaderReadCache.get(), .writeCache = shaderWriteCache.get(), .stage = IShader::E_SHADER_STAGE::ESS_ALL_OR_LIBRARY });
-				};
+			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = "shaders";
+
+				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+				{
+					m_logger->log("Failed to load a precompiled ahsder.", ILogger::ELL_ERROR);
+					return nullptr;
+				}
+					
+
+				auto shader = IAsset::castDown<IShader>(assets[0]);
+				return shader;
+			};
 
-			mainPipelineFragmentShaders = loadCompileShader.operator()<"main_pipeline_fragment_shader">("../shaders/main_pipeline/fragment.hlsl");
-			mainPipelineVertexShader = loadCompileShader.operator() <"main_pipeline_vertex_shader"> ("../shaders/main_pipeline/vertex_shader.hlsl");
+			mainPipelineFragmentShaders = loadPrecompiledShader.operator()<"main_pipeline_fragment_shader">(); // "../shaders/main_pipeline/fragment.hlsl"
+			mainPipelineVertexShader = loadPrecompiledShader.operator() <"main_pipeline_vertex_shader">(); // "../shaders/main_pipeline/vertex_shader.hlsl"
 			
 			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
 			{
diff --git a/64_EmulatedFloatTest/CMakeLists.txt b/64_EmulatedFloatTest/CMakeLists.txt
index 1b272bf2e..6470cdc74 100644
--- a/64_EmulatedFloatTest/CMakeLists.txt
+++ b/64_EmulatedFloatTest/CMakeLists.txt
@@ -39,6 +39,7 @@ set(DEPENDS
 target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
 set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
+set(SM 6_8)
 set(JSON [=[
 [
     {
@@ -65,7 +66,7 @@ NBL_CREATE_NSC_COMPILE_RULES(
     DEPENDS ${DEPENDS}
     BINARY_DIR ${OUTPUT_DIRECTORY}
     MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
     OUTPUT_VAR KEYS
     INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
     NAMESPACE nbl::this_example::builtin::build
@@ -78,5 +79,4 @@ NBL_CREATE_RESOURCE_ARCHIVE(
     LINK_TO ${EXECUTABLE_NAME}
     BIND ${OUTPUT_DIRECTORY}
     BUILTINS ${KEYS}
-    COMMON_OPTIONS ${COMPILE_OPTIONS}
 )
\ No newline at end of file
diff --git a/67_RayQueryGeometry/CMakeLists.txt b/67_RayQueryGeometry/CMakeLists.txt
index 40f32624a..503c5a31a 100644
--- a/67_RayQueryGeometry/CMakeLists.txt
+++ b/67_RayQueryGeometry/CMakeLists.txt
@@ -35,6 +35,7 @@ set(DEPENDS
 target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
 set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
 
+set(SM 6_8)
 set(JSON [=[
 [
     {
@@ -57,7 +58,7 @@ NBL_CREATE_NSC_COMPILE_RULES(
     DEPENDS ${DEPENDS}
     BINARY_DIR ${OUTPUT_DIRECTORY}
     MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
-    COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
     OUTPUT_VAR KEYS
     INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
     NAMESPACE nbl::this_example::builtin::build
@@ -70,5 +71,4 @@ NBL_CREATE_RESOURCE_ARCHIVE(
     LINK_TO ${EXECUTABLE_NAME}
     BIND ${OUTPUT_DIRECTORY}
     BUILTINS ${KEYS}
-    COMMON_OPTIONS ${COMPILE_OPTIONS}
 )
\ No newline at end of file
diff --git a/70_FLIPFluids/CMakeLists.txt b/70_FLIPFluids/CMakeLists.txt
index a434ff32a..19a561f78 100644
--- a/70_FLIPFluids/CMakeLists.txt
+++ b/70_FLIPFluids/CMakeLists.txt
@@ -21,4 +21,101 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/compute/advectParticles.comp.hlsl
+	app_resources/compute/applyBodyForces.comp.hlsl
+	app_resources/compute/diffusion.comp.hlsl
+	app_resources/compute/genParticleVertices.comp.hlsl
+	app_resources/compute/particlesInit.comp.hlsl
+	app_resources/compute/prepareCellUpdate.comp.hlsl
+	app_resources/compute/pressureSolver.comp.hlsl
+	app_resources/compute/updateFluidCells.comp.hlsl
+	app_resources/cellUtils.hlsl
+	app_resources/common.hlsl
+	app_resources/descriptor_bindings.hlsl
+	app_resources/fluidParticles.fragment.hlsl
+	app_resources/fluidParticles.vertex.hlsl
+	app_resources/gridSampling.hlsl
+	app_resources/gridUtils.hlsl
+	app_resources/render_common.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(SM 6_8)
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/compute/diffusion.comp.hlsl",
+        "KEY": "diffusion",
+    },
+    {
+        "INPUT": "app_resources/fluidParticles.vertex.hlsl",
+        "KEY": "fluidParticles_vertex",
+    },
+    {
+        "INPUT": "app_resources/fluidParticles.fragment.hlsl",
+        "KEY": "fluidParticles_fragment",
+    },
+    {
+        "INPUT": "app_resources/compute/particlesInit.comp.hlsl",
+        "KEY": "particlesInit",
+    },
+    {
+        "INPUT": "app_resources/compute/genParticleVertices.comp.hlsl",
+        "KEY": "genParticleVertices",
+    },
+    {
+        "INPUT": "app_resources/compute/prepareCellUpdate.comp.hlsl",
+        "KEY": "prepareCellUpdate",
+    },
+    {
+        "INPUT": "app_resources/compute/updateFluidCells.comp.hlsl",
+        "KEY": "updateFluidCells",
+    },
+    {
+        "INPUT": "app_resources/compute/applyBodyForces.comp.hlsl",
+        "KEY": "applyBodyForces",
+    },
+    {
+        "INPUT": "app_resources/compute/pressureSolver.comp.hlsl",
+        "KEY": "pressureSolver",
+    },
+    {
+        "INPUT": "app_resources/compute/advectParticles.comp.hlsl",
+        "KEY": "advectParticles",
+    }
+    
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
\ No newline at end of file
diff --git a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
index e53c91d2d..288b82764 100644
--- a/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/diffusion.comp.hlsl
@@ -67,6 +67,7 @@ void setAxisCellMaterial(uint32_t3 ID : SV_DispatchThreadID)
 }
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void setNeighborAxisCellMaterial(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cellIdx = ID;
@@ -127,6 +128,7 @@ float3 calculateDiffusionVelStep(int3 idx, float3 sampledVelocity, uint cellMate
 }
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void iterateDiffusion(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint3 gid = nbl::hlsl::glsl::gl_WorkGroupID();
@@ -212,6 +214,7 @@ void iterateDiffusion(uint32_t3 ID : SV_DispatchThreadID)
 
 // TODO: same as the pressure solver, this kernel/dispatch should be fused onto `iterateDiffusion` guarded by `isLastIteration` push constant
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void applyDiffusion(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cellIdx = ID;
diff --git a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
index b5db995c5..e71f05912 100644
--- a/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/pressureSolver.comp.hlsl
@@ -89,6 +89,7 @@ float calculatePressureStep(int3 idx)
 }
 
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void iteratePressureSystem(uint32_t3 ID : SV_DispatchThreadID)
 {
     uint3 gid = nbl::hlsl::glsl::gl_WorkGroupID();
@@ -168,6 +169,7 @@ void iteratePressureSystem(uint32_t3 ID : SV_DispatchThreadID)
 
 // TODO: why doesn't the last invocation of `iteratePressureSystem` have this step fused into it!? It would be just a simple push constant `isLastIteration` that would decide whether to run this dispatch
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void updateVelocities(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cellIdx = ID;
diff --git a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
index 62ddfd822..ea37660c1 100644
--- a/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
+++ b/70_FLIPFluids/app_resources/compute/updateFluidCells.comp.hlsl
@@ -23,6 +23,7 @@ cbuffer GridData
 
 // TODO: f 0 is AIR, and >=2 is SOLID, we can perform Atomic OR 0b01 to have a particle set the cell to FLUID, and this dispatch looping over all grid cells is not needed!
 [numthreads(WorkgroupGridDim, WorkgroupGridDim, WorkgroupGridDim)]
+[shader("compute")]
 void updateFluidCells(uint32_t3 ID : SV_DispatchThreadID)
 {
     int3 cIdx = ID;
diff --git a/70_FLIPFluids/main.cpp b/70_FLIPFluids/main.cpp
index 899d00ba4..a70064245 100644
--- a/70_FLIPFluids/main.cpp
+++ b/70_FLIPFluids/main.cpp
@@ -2,6 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 #include "nbl/examples/examples.hpp"
 // TODO: why is it not in nabla.h ?
@@ -344,11 +345,12 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         if (!initGraphicsPipeline())
             return logFail("Failed to initialize render pipeline!\n");
 
-        auto createComputePipeline = [&](smart_refctd_ptr<IGPUComputePipeline>& pipeline, smart_refctd_ptr<IDescriptorPool>& pool,
-            smart_refctd_ptr<IGPUDescriptorSet>& set, const std::string& shaderPath, const std::string& entryPoint,
+        
+        auto createComputePipeline = [&]<core::StringLiteral ShaderKey>(smart_refctd_ptr<IGPUComputePipeline>& pipeline, smart_refctd_ptr<IDescriptorPool>& pool,
+            smart_refctd_ptr<IGPUDescriptorSet>& set, const std::string& entryPoint,
             const std::span<const IGPUDescriptorSetLayout::SBinding> bindings, const asset::SPushConstantRange& pcRange = {}) -> void
             {
-                auto shader = compileShader(shaderPath, entryPoint);
+                auto shader = loadPrecompiledShader<ShaderKey>();
 
                 auto descriptorSetLayout1 = m_device->createDescriptorSetLayout(bindings);
 
@@ -378,8 +380,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         {
             // init particles pipeline
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = 2 * sizeof(uint64_t) };
-            createComputePipeline(m_initParticlePipeline, m_initParticlePool, m_initParticleDs,
-                "app_resources/compute/particlesInit.comp.hlsl", "main", piParticlesInit_bs1, pcRange);
+            createComputePipeline.operator()<"particlesInit">(m_initParticlePipeline, m_initParticlePool, m_initParticleDs,
+                 "main", piParticlesInit_bs1, pcRange);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[1];
@@ -395,8 +397,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         {
             // generate particle vertex pipeline
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = 3 * sizeof(uint64_t) };
-            createComputePipeline(m_genParticleVerticesPipeline, m_genVerticesPool, m_genVerticesDs,
-                "app_resources/compute/genParticleVertices.comp.hlsl", "main", gpvGenVertices_bs1, pcRange);
+            createComputePipeline.operator()<"genParticleVertices">(m_genParticleVerticesPipeline, m_genVerticesPool, m_genVerticesDs,
+                "main", gpvGenVertices_bs1, pcRange);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[2];
@@ -414,8 +416,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         // update fluid cells pipelines
         {
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = 2 * sizeof(uint64_t) };
-            createComputePipeline(m_accumulateWeightsPipeline, m_accumulateWeightsPool, m_accumulateWeightsDs,
-                "app_resources/compute/prepareCellUpdate.comp.hlsl", "main", ufcAccWeights_bs1, pcRange);
+            createComputePipeline.operator()<"prepareCellUpdate">(m_accumulateWeightsPipeline, m_accumulateWeightsPool, m_accumulateWeightsDs,
+                "main", ufcAccWeights_bs1, pcRange);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[2];
@@ -457,8 +459,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            createComputePipeline(m_updateFluidCellsPipeline, m_updateFluidCellsPool, m_updateFluidCellsDs,
-                "app_resources/compute/updateFluidCells.comp.hlsl", "updateFluidCells", ufcFluidCell_bs1);
+            createComputePipeline.operator()<"updateFluidCells">(m_updateFluidCellsPipeline, m_updateFluidCellsPool, m_updateFluidCellsDs,
+                "updateFluidCells", ufcFluidCell_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[3];
@@ -479,8 +481,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            createComputePipeline(m_updateNeighborCellsPipeline, m_updateNeighborCellsPool, m_updateNeighborCellsDs,
-                "app_resources/compute/updateFluidCells.comp.hlsl", "updateNeighborFluidCells", ufcNeighborCell_bs1);
+            createComputePipeline.operator()<"updateFluidCells">(m_updateNeighborCellsPipeline, m_updateNeighborCellsPool, m_updateNeighborCellsDs,
+                "updateNeighborFluidCells", ufcNeighborCell_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[3];
@@ -527,8 +529,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         }
         {
             // apply forces pipeline
-            createComputePipeline(m_applyBodyForcesPipeline, m_applyForcesPool, m_applyForcesDs, 
-                "app_resources/compute/applyBodyForces.comp.hlsl", "main", abfApplyForces_bs1);
+            createComputePipeline.operator()<"applyBodyForces">(m_applyBodyForcesPipeline, m_applyForcesPool, m_applyForcesDs, 
+                "main", abfApplyForces_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[2];
@@ -559,8 +561,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         }
         // apply diffusion pipelines
         {
-            createComputePipeline(m_axisCellsPipeline, m_axisCellsPool, m_axisCellsDs, 
-                "app_resources/compute/diffusion.comp.hlsl", "setAxisCellMaterial", dAxisCM_bs1);
+            createComputePipeline.operator()<"diffusion">(m_axisCellsPipeline, m_axisCellsPool, m_axisCellsDs, 
+                "setAxisCellMaterial", dAxisCM_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[3];
@@ -581,8 +583,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            createComputePipeline(m_neighborAxisCellsPipeline, m_neighborAxisCellsPool, m_neighborAxisCellsDs, 
-                "app_resources/compute/diffusion.comp.hlsl", "setNeighborAxisCellMaterial", dNeighborAxisCM_bs1);
+            createComputePipeline.operator()<"diffusion">(m_neighborAxisCellsPipeline, m_neighborAxisCellsPool, m_neighborAxisCellsDs, 
+                "setNeighborAxisCellMaterial", dNeighborAxisCM_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[3];
@@ -603,10 +605,7 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            const std::string iterateKernel = "iterateDiffusion";
-            const std::string applyKernel = "applyDiffusion";
-            auto iterateShader = compileShader("app_resources/compute/diffusion.comp.hlsl", iterateKernel);
-            auto applyShader = compileShader("app_resources/compute/diffusion.comp.hlsl", applyKernel);
+            smart_refctd_ptr<IShader> diffusion = loadPrecompiledShader<"diffusion">(); // "app_resources/compute/diffusion.comp.hlsl"
 
             auto descriptorSetLayout1 = m_device->createDescriptorSetLayout(dDiffuse_bs1);
 
@@ -625,16 +624,16 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             {
                 IGPUComputePipeline::SCreationParams params = {};
                 params.layout = pipelineLayout.get();
-                params.shader.entryPoint = iterateKernel;
-                params.shader.shader = iterateShader.get();
+                params.shader.entryPoint = "iterateDiffusion";
+                params.shader.shader = diffusion.get();
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_iterateDiffusionPipeline);
             }
             {
                 IGPUComputePipeline::SCreationParams params = {};
                 params.layout = pipelineLayout.get();
-                params.shader.entryPoint = applyKernel;
-                params.shader.shader = applyShader.get();
+                params.shader.entryPoint = "applyDiffusion";
+                params.shader.shader = diffusion.get();
 
                 m_device->createComputePipelines(nullptr, { &params,1 }, &m_diffusionPipeline);
             }
@@ -676,8 +675,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         }
         // solve pressure system pipelines
         {
-            createComputePipeline(m_calcDivergencePipeline, m_calcDivergencePool, m_calcDivergenceDs, 
-                "app_resources/compute/pressureSolver.comp.hlsl", "calculateNegativeDivergence", psDivergence_bs1);
+            createComputePipeline.operator()<"pressureSolver">(m_calcDivergencePipeline, m_calcDivergencePool, m_calcDivergenceDs, 
+                "calculateNegativeDivergence", psDivergence_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[3];
@@ -711,8 +710,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            createComputePipeline(m_iteratePressurePipeline, m_iteratePressurePool, m_iteratePressureDs,
-                "app_resources/compute/pressureSolver.comp.hlsl", "iteratePressureSystem", psIteratePressure_bs1);
+            createComputePipeline.operator()<"pressureSolver">(m_iteratePressurePipeline, m_iteratePressurePool, m_iteratePressureDs,
+                "iteratePressureSystem", psIteratePressure_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[5];
@@ -740,8 +739,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
             }
         }
         {
-            createComputePipeline(m_updateVelPsPipeline, m_updateVelPsPool, m_updateVelPsDs, 
-                "app_resources/compute/pressureSolver.comp.hlsl", "updateVelocities", psUpdateVelPs_bs1);
+            createComputePipeline.operator()<"pressureSolver">(m_updateVelPsPipeline, m_updateVelPsPool, m_updateVelPsDs, 
+                "updateVelocities", psUpdateVelPs_bs1);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[4];
@@ -780,8 +779,8 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         {
             // advect particles pipeline
             const asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = 2 * sizeof(uint64_t) };
-            createComputePipeline(m_advectParticlesPipeline, m_advectParticlesPool, m_advectParticlesDs,
-                "app_resources/compute/advectParticles.comp.hlsl", "main", apAdvectParticles_bs1, pcRange);
+            createComputePipeline.operator()<"advectParticles">(m_advectParticlesPipeline, m_advectParticlesPool, m_advectParticlesDs,
+                "main", apAdvectParticles_bs1, pcRange);
 
             {
                 IGPUDescriptorSet::SDescriptorInfo infos[2];
@@ -1400,51 +1399,25 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
         numParticles = m_gridData.particleInitSize.x * m_gridData.particleInitSize.y * m_gridData.particleInitSize.z * particlesPerCell;
     }
 
-    smart_refctd_ptr<IShader> compileShader(const std::string& filePath, const std::string& entryPoint = "main")
+    template<core::StringLiteral ShaderKey>
+    smart_refctd_ptr<IShader> loadPrecompiledShader()
     {
         IAssetLoader::SAssetLoadParams lparams = {};
         lparams.logger = m_logger.get();
-        lparams.workingDirectory = "";
-        auto bundle = m_assetMgr->getAsset(filePath, lparams);
+        lparams.workingDirectory = "app_resources";
+        auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+        auto bundle = m_assetMgr->getAsset(key.data(), lparams);
         if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
         {
-            m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+            m_logger->log("Failed to find shader with key '%s'.", ILogger::ELL_ERROR, ShaderKey);
             exit(-1);
         }
         
         const auto assets = bundle.getContents();
         assert(assets.size() == 1);
-        smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
-        const auto hlslMetadata = static_cast<const CHLSLMetadata*>(bundle.getMetadata());
-        const auto shaderStage = hlslMetadata->shaderStages->front();
+        smart_refctd_ptr<IShader> shader = IAsset::castDown<IShader>(assets[0]);
 
-        smart_refctd_ptr<IShader> shader = shaderSrc;
-        if (entryPoint != "main")
-        {
-            auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
-            CHLSLCompiler::SOptions options = {};
-            options.stage = shaderStage;
-            if (!(options.stage == IShader::E_SHADER_STAGE::ESS_COMPUTE || options.stage == IShader::E_SHADER_STAGE::ESS_FRAGMENT))
-                options.stage = IShader::E_SHADER_STAGE::ESS_VERTEX;
-            options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
-            options.spirvOptimizer = nullptr;
-        #ifndef _NBL_DEBUG
-            ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
-            auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-            options.spirvOptimizer = opt.get();
-        #endif
-            options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
-            options.preprocessorOptions.sourceIdentifier = shaderSrc->getFilepathHint();
-            options.preprocessorOptions.logger = m_logger.get();
-            options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
-
-            std::string dxcOptionStr[] = {"-E " + entryPoint};
-            options.dxcOptions = std::span(dxcOptionStr);
-
-            shader = compiler->compileToSPIRV((const char*)shaderSrc->getContent()->getPointer(), options);
-        }
-
-        return m_device->compileShader({ shader.get() });
+        return shader;
     }
 
     // TODO: there's a method in IUtilities for this
@@ -1563,28 +1536,27 @@ class FLIPFluidsApp final : public SimpleWindowedApplication, public BuiltinReso
 
         // init shaders and pipeline
 
-        auto compileShader = [&](const std::string& filePath) -> smart_refctd_ptr<IShader>
+        auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
+        {
+            IAssetLoader::SAssetLoadParams lparams = {};
+            lparams.logger = m_logger.get();
+            lparams.workingDirectory = "app_resources";
+            auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+            auto bundle = m_assetMgr->getAsset(key.data(), lparams);
+            if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
             {
-                IAssetLoader::SAssetLoadParams lparams = {};
-                lparams.logger = m_logger.get();
-                lparams.workingDirectory = "";
-                auto bundle = m_assetMgr->getAsset(filePath, lparams);
-                if (bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER)
-                {
-                    m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
-                    exit(-1);
-                }
+                m_logger->log("Failed to find shader with key '%s'.", ILogger::ELL_ERROR, ShaderKey);
+                exit(-1);
+            }
         
-                const auto assets = bundle.getContents();
-                assert(assets.size() == 1);
-                smart_refctd_ptr<IShader> shaderSrc = IAsset::castDown<IShader>(assets[0]);
-                if (!shaderSrc)
-                    return nullptr;
+            const auto assets = bundle.getContents();
+            assert(assets.size() == 1);
+            smart_refctd_ptr<IShader> shader = IAsset::castDown<IShader>(assets[0]);
 
-                return m_device->compileShader({ shaderSrc.get() });
-            };
-        auto vs = compileShader("app_resources/fluidParticles.vertex.hlsl");
-        auto fs = compileShader("app_resources/fluidParticles.fragment.hlsl");
+            return shader;
+        };
+        auto vs = loadPrecompiledShader.operator()<"fluidParticles_vertex">(); // "app_resources/fluidParticles.vertex.hlsl"
+        auto fs = loadPrecompiledShader.operator()<"fluidParticles_fragment">(); // "app_resources/fluidParticles.fragment.hlsl"
 
         smart_refctd_ptr<video::IGPUDescriptorSetLayout> descriptorSetLayout1;
         {

From 974d23f74a537648ac307c08a81ea97908a74874 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Tue, 28 Oct 2025 16:11:57 +0100
Subject: [PATCH 15/57] Enabled build time shader compilation in example 10

---
 10_CountingSort/CMakeLists.txt            | 68 +++++++++++++++++++++++
 10_CountingSort/app_resources/common.hlsl |  6 ++
 10_CountingSort/main.cpp                  | 45 +++++++++------
 3 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/10_CountingSort/CMakeLists.txt b/10_CountingSort/CMakeLists.txt
index b7cad41da..3acc73022 100644
--- a/10_CountingSort/CMakeLists.txt
+++ b/10_CountingSort/CMakeLists.txt
@@ -22,3 +22,71 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
 endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+	app_resources/common.hlsl
+	app_resources/prefix_sum_shader.comp.hlsl
+	app_resources/scatter_shader.comp.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(SM 6_8)
+set(REQUIRED_CAPS [=[
+	{
+	  "kind": "limits",
+	  "name": "maxComputeWorkGroupInvocations",
+	  "type": "uint32_t",
+	  "values": [256,512,1024]
+	},
+	{
+	  "kind": "limits",
+	  "name": "maxComputeSharedMemorySize",
+	  "type": "uint32_t",
+	  "values": [16384, 32768, 65536]
+	}
+]=])
+
+set(JSON [=[
+[
+	{
+		"INPUT": "app_resources/prefix_sum_shader.comp.hlsl",
+		"KEY": "prefix_sum_shader",
+		"CAPS": [${REQUIRED_CAPS}]
+    },
+    {
+		"INPUT": "app_resources/scatter_shader.comp.hlsl",
+		"KEY": "scatter_shader",
+		"CAPS": [${REQUIRED_CAPS}]
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
diff --git a/10_CountingSort/app_resources/common.hlsl b/10_CountingSort/app_resources/common.hlsl
index bcbf01727..1074432b0 100644
--- a/10_CountingSort/app_resources/common.hlsl
+++ b/10_CountingSort/app_resources/common.hlsl
@@ -22,6 +22,10 @@ using namespace nbl::hlsl;
 #ifdef __HLSL_VERSION
 #include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"
 
+static const uint32_t WorkgroupSize = DeviceConfigCaps::maxComputeWorkGroupInvocations;
+static const uint32_t MaxBucketCount = (DeviceConfigCaps::maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
+static const uint32_t BucketCount = (MaxBucketCount > 3000) ? 3000 : MaxBucketCount;
+
 using Ptr = bda::__ptr<uint32_t>;
 using PtrAccessor = BdaAccessor<uint32_t>;
 
@@ -54,6 +58,8 @@ uint32_t3 glsl::gl_WorkGroupSize()
 {
     return uint32_t3(WorkgroupSize, 1, 1);
 }
+
+
 #endif
 
 #endif
\ No newline at end of file
diff --git a/10_CountingSort/main.cpp b/10_CountingSort/main.cpp
index d51650919..a22647750 100644
--- a/10_CountingSort/main.cpp
+++ b/10_CountingSort/main.cpp
@@ -1,4 +1,5 @@
 #include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 
 using namespace nbl;
 using namespace nbl::core;
@@ -32,19 +33,34 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 				return false;
 
 			auto limits = m_physicalDevice->getLimits();
+			constexpr std::array<uint32_t, 3u> AllowedMaxComputeSharedMemorySizes = {
+				16384, 32768, 65536
+			};
+
+			auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize);
+			// devices which support less than 16KB of max compute shared memory size are not supported
+			if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin())
+			{
+				m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize);
+				exit(0);
+			}
+
+			limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1);
+
 			const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations;
 			const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
 			constexpr uint32_t element_count = 100000;
 			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
 			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
 
-			auto prepShader = [&](const core::string& path) -> smart_refctd_ptr<IShader>
+			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
 			{
 				// this time we load a shader directly from a file
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset(path,lp);
+				lp.workingDirectory = "app_resources"; // virtual root
+				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(limits, m_physicalDevice->getFeatures());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 				const auto assets = assetBundle.getContents();
 				if (assets.empty())
 				{
@@ -52,29 +68,24 @@ class CountingSortApp final : public application_templates::MonoDeviceApplicatio
 					return nullptr;
 				}
 
-				auto source = IAsset::castDown<IShader>(assets[0]);
+				auto shader = IAsset::castDown<IShader>(assets[0]);
 				// The down-cast should not fail!
-				assert(source);
+				assert(shader);
 			
 				// There's two ways of doing stuff like this:
 				// 1. this - modifying the asset after load
 				// 2. creating a short shader source file that includes the asset you would have wanted to load
-				auto overrideSource = CHLSLCompiler::createOverridenCopy(
-					source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
-					WorkgroupSize, bucket_count
-				);
+				// 
+				//auto overrideSource = CHLSLCompiler::createOverridenCopy(
+				//	source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
+				//	WorkgroupSize, bucket_count
+				//);
 
 				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
-				auto shader = m_device->compileShader({ overrideSource.get() });
-				if (!shader)
-				{
-					logFail("Creation of Prefix Sum Shader from CPU Shader source failed!");
-					return nullptr;
-				}
 				return shader;
 			};
-			auto prefixSumShader = prepShader("app_resources/prefix_sum_shader.comp.hlsl");
-			auto scatterShader = prepShader("app_resources/scatter_shader.comp.hlsl");
+			auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl"
+			auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl"
 
 			// People love Reflection but I prefer Shader Sources instead!
 			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) };

From eb1e29f4d071956d8397108680cb0256ec012b5b Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Tue, 28 Oct 2025 16:25:00 +0100
Subject: [PATCH 16/57] Enabled build time shader compilation in example 71

---
 62_CAD/main.cpp                               |  59 +---------
 71_RayTracingPipeline/CMakeLists.txt          | 101 ++++++++++++++++++
 .../app_resources/raytrace.rahit.hlsl         |   2 +-
 .../app_resources/raytrace.rchit.hlsl         |  20 ++--
 .../app_resources/raytrace.rgen.hlsl          |   1 -
 .../app_resources/raytrace_shadow.rahit.hlsl  |   2 +-
 71_RayTracingPipeline/main.cpp                |  99 +++++------------
 7 files changed, 138 insertions(+), 146 deletions(-)

diff --git a/62_CAD/main.cpp b/62_CAD/main.cpp
index 15ee597ec..905177f6b 100644
--- a/62_CAD/main.cpp
+++ b/62_CAD/main.cpp
@@ -929,43 +929,12 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 		smart_refctd_ptr<IShader> mainPipelineVertexShader = {};
 		std::array<smart_refctd_ptr<IShader>, 2u> geoTexturePipelineShaders = {};
 		{
-			smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-			smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-			auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
-
-			{
-				core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
-				{
-					system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-					m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
-					if (future.wait())
-					{
-						future.acquire().move_into(shaderReadCacheFile);
-						if (shaderReadCacheFile)
-						{
-							const size_t size = shaderReadCacheFile->getSize();
-							if (size > 0ull)
-							{
-								std::vector<uint8_t> contents(size);
-								system::IFile::success_t succ;
-								shaderReadCacheFile->read(succ, contents.data(), 0, size);
-								if (succ)
-									shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
-							}
-						}
-					}
-					else
-						m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
-				}
-
-			}
-
 			// Load Custom Shader
 			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
-				lp.workingDirectory = "shaders";
+				lp.workingDirectory = "app_resources";
 
 				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
 				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
@@ -983,32 +952,6 @@ class ComputerAidedDesign final : public nbl::examples::SimpleWindowedApplicatio
 
 			mainPipelineFragmentShaders = loadPrecompiledShader.operator()<"main_pipeline_fragment_shader">(); // "../shaders/main_pipeline/fragment.hlsl"
 			mainPipelineVertexShader = loadPrecompiledShader.operator() <"main_pipeline_vertex_shader">(); // "../shaders/main_pipeline/vertex_shader.hlsl"
-			
-			core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-			{
-				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-				m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-				if (future.wait())
-				{
-					future.acquire().move_into(shaderWriteCacheFile);
-					if (shaderWriteCacheFile)
-					{
-						auto serializedCache = shaderWriteCache->serialize();
-						if (shaderWriteCacheFile)
-						{
-							system::IFile::success_t succ;
-							shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-							if (!succ)
-								m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
-						}
-					}
-					else
-						m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-				}
-				else
-					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-			}
 		}
 
 		// Shared Blend Params between pipelines
diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt
index 07b0fd396..5c853040e 100644
--- a/71_RayTracingPipeline/CMakeLists.txt
+++ b/71_RayTracingPipeline/CMakeLists.txt
@@ -34,4 +34,105 @@ if(NBL_BUILD_IMGUI)
 	endif()
 endif()
 
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+set(DEPENDS
+    app_resources/common.hlsl
+    app_resources/light_directional.rcall.hlsl
+	app_resources/light_point.rcall.hlsl
+	app_resources/light_spot.rcall.hlsl
+	app_resources/present.frag.hlsl
+	app_resources/raytrace.rahit.hlsl
+	app_resources/raytrace.rchit.hlsl
+	app_resources/raytrace.rgen.hlsl
+	app_resources/raytrace.rint.hlsl
+	app_resources/raytrace.rmiss.hlsl
+	app_resources/raytrace_procedural.rchit.hlsl
+	app_resources/raytrace_shadow.rahit.hlsl
+	app_resources/raytrace_shadow.rmiss.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)
+
+set(SM 6_8)
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/raytrace.rgen.hlsl",
+        "KEY": "raytrace_rgen",
+    },
+	{
+        "INPUT": "app_resources/raytrace.rchit.hlsl",
+        "KEY": "raytrace_rchit",
+    },
+	{
+        "INPUT": "app_resources/raytrace_procedural.rchit.hlsl",
+        "KEY": "raytrace_procedural_rchit",
+    },
+	{
+        "INPUT": "app_resources/raytrace.rint.hlsl",
+        "KEY": "raytrace_rint",
+    },
+	{
+        "INPUT": "app_resources/raytrace.rahit.hlsl",
+        "KEY": "raytrace_rahit",
+    },
+	{
+        "INPUT": "app_resources/raytrace_shadow.rahit.hlsl",
+        "KEY": "raytrace_shadow_rahit",
+    },
+	{
+        "INPUT": "app_resources/raytrace.rmiss.hlsl",
+        "KEY": "raytrace_rmiss",
+    },
+	{
+        "INPUT": "app_resources/raytrace_shadow.rmiss.hlsl",
+        "KEY": "raytrace_shadow_rmiss",
+    },
+	{
+        "INPUT": "app_resources/light_directional.rcall.hlsl",
+        "KEY": "light_directional_rcall",
+    },
+	{
+        "INPUT": "app_resources/light_point.rcall.hlsl",
+        "KEY": "light_point_rcall",
+    },
+	{
+        "INPUT": "app_resources/light_spot.rcall.hlsl",
+        "KEY": "light_spot_rcall",
+    },
+	{
+        "INPUT": "app_resources/present.frag.hlsl",
+        "KEY": "present_frag",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -O3
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    DEPENDS ${DEPENDS}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
+
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index 956ad5fe6..f5c9080e8 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -10,7 +10,7 @@ using namespace nbl::hlsl;
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = spirv::InstanceCustomIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo), 8);
 
     const uint32_t bitpattern = payload.pcg();
     // Cannot use spirv::ignoreIntersectionKHR and spirv::terminateRayKHR due to https://github.com/microsoft/DirectXShaderCompiler/issues/7279
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index 0a8bc5ec8..dc83b5cd2 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -38,9 +38,9 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
 
     if (normalBufferAddress == 0)
     {
-        float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12);
-        float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12);
-        float3 v2 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * 12);
+        float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12, 8);
+        float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12, 8);
+        float3 v2 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * 12, 8);
 
         return normalize(cross(v2 - v0, v1 - v0));
     }
@@ -50,9 +50,9 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
     {
         case NT_R8G8B8A8_SNORM:
         {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4);
+            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4, 8);
+            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4, 8);
+            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4, 8);
 
             n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
             n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
@@ -61,9 +61,9 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
         break;
         case NT_R32G32B32_SFLOAT:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12));
-            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12));
-            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12));
+            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12, 8));
+            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12, 8));
+            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12, 8));
         }
         break;
     }
@@ -81,7 +81,7 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
     const int primID = spirv::PrimitiveId;
     const int instanceCustomIndex = spirv::InstanceCustomIndexKHR;
     const int geometryIndex = spirv::RayGeometryIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo));
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo), 8);
     const float32_t3 vertexNormal = calculateNormals(primID, geom, attribs.barycentrics);
     const float32_t3 worldNormal = normalize(mul(vertexNormal, transpose(spirv::WorldToObjectKHR)).xyz);
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index efc99cad9..6571c5c67 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -1,6 +1,5 @@
 #include "common.hlsl"
 
-#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index e41551512..dd83f92c9 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -10,7 +10,7 @@ using namespace nbl::hlsl;
 void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = spirv::InstanceCustomIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo));
+    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo), 8);
     const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
     const float attenuation = (1.f-material.alpha) * payload.attenuation;
diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index 59b610f4b..ecaf53b7f 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -3,6 +3,8 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include "common.hpp"
 
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 #include "nbl/builtin/hlsl/indirect_commands.hlsl"
 
@@ -106,95 +108,42 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
 			return false;
 
-		smart_refctd_ptr<IShaderCompiler::CCache> shaderReadCache = nullptr;
-		smart_refctd_ptr<IShaderCompiler::CCache> shaderWriteCache = core::make_smart_refctd_ptr<IShaderCompiler::CCache>();
-		auto shaderCachePath = localOutputCWD / "main_pipeline_shader_cache.bin";
-
-		{
-			core::smart_refctd_ptr<system::IFile> shaderReadCacheFile;
-			{
-				system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-				m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_READ);
-				if (future.wait())
-				{
-					future.acquire().move_into(shaderReadCacheFile);
-					if (shaderReadCacheFile)
-					{
-						const size_t size = shaderReadCacheFile->getSize();
-						if (size > 0ull)
-						{
-							std::vector<uint8_t> contents(size);
-							system::IFile::success_t succ;
-							shaderReadCacheFile->read(succ, contents.data(), 0, size);
-							if (succ)
-								shaderReadCache = IShaderCompiler::CCache::deserialize(contents);
-						}
-					}
-				}
-				else
-					m_logger->log("Failed Openning Shader Cache File.", ILogger::ELL_ERROR);
-			}
-
-		}
-
 		// Load Custom Shader
-		auto loadCompileAndCreateShader = [&](const std::string& relPath) -> smart_refctd_ptr<IShader>
+		auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
 			{
 				IAssetLoader::SAssetLoadParams lp = {};
 				lp.logger = m_logger.get();
-				lp.workingDirectory = ""; // virtual root
-				auto assetBundle = m_assetMgr->getAsset(relPath, lp);
+				lp.workingDirectory = "app_resources"; // virtual root
+				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
 				const auto assets = assetBundle.getContents();
 				if (assets.empty())
 					return nullptr;
 
 				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
-				auto sourceRaw = IAsset::castDown<IShader>(assets[0]);
-				if (!sourceRaw)
+				auto shader = IAsset::castDown<IShader>(assets[0]);
+				if (!shader)
+				{
+					m_logger->log("Failed to load a precompiled shader.", ILogger::ELL_ERROR);
 					return nullptr;
+				}
 
-				return m_device->compileShader({ sourceRaw.get(), nullptr, shaderReadCache.get(), shaderWriteCache.get() });
+				return shader;
 			};
 
 		// load shaders
-		const auto raygenShader = loadCompileAndCreateShader("app_resources/raytrace.rgen.hlsl");
-		const auto closestHitShader = loadCompileAndCreateShader("app_resources/raytrace.rchit.hlsl");
-		const auto proceduralClosestHitShader = loadCompileAndCreateShader("app_resources/raytrace_procedural.rchit.hlsl");
-		const auto intersectionHitShader = loadCompileAndCreateShader("app_resources/raytrace.rint.hlsl");
-		const auto anyHitShaderColorPayload = loadCompileAndCreateShader("app_resources/raytrace.rahit.hlsl");
-		const auto anyHitShaderShadowPayload = loadCompileAndCreateShader("app_resources/raytrace_shadow.rahit.hlsl");
-		const auto missShader = loadCompileAndCreateShader("app_resources/raytrace.rmiss.hlsl");
-		const auto missShadowShader = loadCompileAndCreateShader("app_resources/raytrace_shadow.rmiss.hlsl");
-		const auto directionalLightCallShader = loadCompileAndCreateShader("app_resources/light_directional.rcall.hlsl");
-		const auto pointLightCallShader = loadCompileAndCreateShader("app_resources/light_point.rcall.hlsl");
-		const auto spotLightCallShader = loadCompileAndCreateShader("app_resources/light_spot.rcall.hlsl");
-		const auto fragmentShader = loadCompileAndCreateShader("app_resources/present.frag.hlsl");
-
-		core::smart_refctd_ptr<system::IFile> shaderWriteCacheFile;
-		{
-			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
-			m_system->deleteFile(shaderCachePath); // temp solution instead of trimming, to make sure we won't have corrupted json
-			m_system->createFile(future, shaderCachePath.c_str(), system::IFile::ECF_WRITE);
-			if (future.wait())
-			{
-				future.acquire().move_into(shaderWriteCacheFile);
-				if (shaderWriteCacheFile)
-				{
-					auto serializedCache = shaderWriteCache->serialize();
-					if (shaderWriteCacheFile)
-					{
-						system::IFile::success_t succ;
-						shaderWriteCacheFile->write(succ, serializedCache->getPointer(), 0, serializedCache->getSize());
-						if (!succ)
-							m_logger->log("Failed Writing To Shader Cache File.", ILogger::ELL_ERROR);
-					}
-				}
-				else
-					m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-			}
-			else
-				m_logger->log("Failed Creating Shader Cache File.", ILogger::ELL_ERROR);
-		}
+		const auto raygenShader = loadPrecompiledShader.operator()<"raytrace_rgen">(); // "app_resources/raytrace.rgen.hlsl"
+		const auto closestHitShader = loadPrecompiledShader.operator()<"raytrace_rchit">(); // "app_resources/raytrace.rchit.hlsl"
+		const auto proceduralClosestHitShader = loadPrecompiledShader.operator()<"raytrace_procedural_rchit">(); // "app_resources/raytrace_procedural.rchit.hlsl"
+		const auto intersectionHitShader = loadPrecompiledShader.operator()<"raytrace_rint">(); // "app_resources/raytrace.rint.hlsl"
+		const auto anyHitShaderColorPayload = loadPrecompiledShader.operator()<"raytrace_rahit">(); // "app_resources/raytrace.rahit.hlsl"
+		const auto anyHitShaderShadowPayload = loadPrecompiledShader.operator()<"raytrace_shadow_rahit">(); // "app_resources/raytrace_shadow.rahit.hlsl"
+		const auto missShader = loadPrecompiledShader.operator()<"raytrace_rmiss">(); // "app_resources/raytrace.rmiss.hlsl"
+		const auto missShadowShader = loadPrecompiledShader.operator()<"raytrace_shadow_rmiss">(); // "app_resources/raytrace_shadow.rmiss.hlsl"
+		const auto directionalLightCallShader = loadPrecompiledShader.operator()<"light_directional_rcall">(); // "app_resources/light_directional.rcall.hlsl"
+		const auto pointLightCallShader = loadPrecompiledShader.operator()<"light_point_rcall">(); // "app_resources/light_point.rcall.hlsl"
+		const auto spotLightCallShader = loadPrecompiledShader.operator()<"light_spot_rcall">(); // "app_resources/light_spot.rcall.hlsl"
+		const auto fragmentShader = loadPrecompiledShader.operator()<"present_frag">(); // "app_resources/present.frag.hlsl"
 
 		m_semaphore = m_device->createSemaphore(m_realFrameIx);
 		if (!m_semaphore)

From e1e8dd6fb0c46612defeea46c960a6b85f4b4155 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Thu, 30 Oct 2025 18:34:18 +0100
Subject: [PATCH 17/57] Replaced `vk::RawBufferLoad` with `vk::PointerBuffer`
 in example 71

---
 .../app_resources/common.hlsl                 |  7 ++++++
 .../app_resources/raytrace.rahit.hlsl         |  3 ++-
 .../app_resources/raytrace.rchit.hlsl         | 25 +++++++++++--------
 .../app_resources/raytrace.rgen.hlsl          |  5 ++--
 .../app_resources/raytrace.rint.hlsl          |  3 ++-
 .../app_resources/raytrace_shadow.rahit.hlsl  |  4 ++-
 6 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/71_RayTracingPipeline/app_resources/common.hlsl b/71_RayTracingPipeline/app_resources/common.hlsl
index f9d67af78..502b53160 100644
--- a/71_RayTracingPipeline/app_resources/common.hlsl
+++ b/71_RayTracingPipeline/app_resources/common.hlsl
@@ -4,6 +4,7 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/cpp_compat/basic.h"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
 
 NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
 NBL_CONSTEXPR uint32_t MAX_UNORM_10 = 1023;
@@ -78,6 +79,9 @@ struct MaterialPacked
         return (xi>>22) > alpha;
     }
 };
+#ifdef __HLSL_VERSION
+NBL_REGISTER_OBJ_TYPE(MaterialPacked, 4)
+#endif
 
 struct SProceduralGeomInfo
 {
@@ -103,6 +107,9 @@ struct STriangleGeomInfo
     uint32_t indexType : 1; // 16 bit, 32 bit
 
 };
+#ifdef __HLSL_VERSION
+NBL_REGISTER_OBJ_TYPE(STriangleGeomInfo, 8)
+#endif
 
 enum E_GEOM_TYPE : uint16_t
 {
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
index f5c9080e8..da7cc1594 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rahit.hlsl
@@ -10,7 +10,8 @@ using namespace nbl::hlsl;
 void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = spirv::InstanceCustomIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo), 8);
+    const static uint64_t STriangleGeomInfoAlignment = nbl::hlsl::alignment_of_v<STriangleGeomInfo>;
+    const STriangleGeomInfo geom = vk::BufferPointer<STriangleGeomInfo, STriangleGeomInfoAlignment>(pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)).Get();
 
     const uint32_t bitpattern = payload.pcg();
     // Cannot use spirv::ignoreIntersectionKHR and spirv::terminateRayKHR due to https://github.com/microsoft/DirectXShaderCompiler/issues/7279
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
index dc83b5cd2..e6ebcda78 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rchit.hlsl
@@ -38,9 +38,9 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
 
     if (normalBufferAddress == 0)
     {
-        float3 v0 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[0] * 12, 8);
-        float3 v1 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[1] * 12, 8);
-        float3 v2 = vk::RawBufferLoad<float3>(vertexBufferAddress + indices[2] * 12, 8);
+        float3 v0 = (nbl::hlsl::bda::__ptr<float3>::create(vertexBufferAddress) + indices[0]).deref().load();
+        float3 v1 = (nbl::hlsl::bda::__ptr<float3>::create(vertexBufferAddress) + indices[1]).deref().load();
+        float3 v2 = (nbl::hlsl::bda::__ptr<float3>::create(vertexBufferAddress) + indices[2]).deref().load();
 
         return normalize(cross(v2 - v0, v1 - v0));
     }
@@ -50,9 +50,9 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
     {
         case NT_R8G8B8A8_SNORM:
         {
-            uint32_t v0 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[0] * 4, 8);
-            uint32_t v1 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[1] * 4, 8);
-            uint32_t v2 = vk::RawBufferLoad<uint32_t>(normalBufferAddress + indices[2] * 4, 8);
+            uint32_t v0 = (nbl::hlsl::bda::__ptr<uint32_t>::create(normalBufferAddress) + indices[0]).deref().load();
+            uint32_t v1 = (nbl::hlsl::bda::__ptr<uint32_t>::create(normalBufferAddress) + indices[1]).deref().load();
+            uint32_t v2 = (nbl::hlsl::bda::__ptr<uint32_t>::create(normalBufferAddress) + indices[2]).deref().load();
 
             n0 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v0).xyz);
             n1 = normalize(nbl::hlsl::spirv::unpackSnorm4x8(v1).xyz);
@@ -61,9 +61,13 @@ float3 calculateNormals(int primID, STriangleGeomInfo geom, float2 bary)
         break;
         case NT_R32G32B32_SFLOAT:
         {
-            n0 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[0] * 12, 8));
-            n1 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[1] * 12, 8));
-            n2 = normalize(vk::RawBufferLoad<float3>(normalBufferAddress + indices[2] * 12, 8));
+            float3 v0 = (nbl::hlsl::bda::__ptr<float3>::create(normalBufferAddress) + indices[0]).deref().load();
+            float3 v1 = (nbl::hlsl::bda::__ptr<float3>::create(normalBufferAddress) + indices[1]).deref().load();
+            float3 v2 = (nbl::hlsl::bda::__ptr<float3>::create(normalBufferAddress) + indices[2]).deref().load();
+
+            n0 = normalize(v0);
+            n1 = normalize(v1);
+            n2 = normalize(v2);
         }
         break;
     }
@@ -81,7 +85,8 @@ void main(inout PrimaryPayload payload, in BuiltInTriangleIntersectionAttributes
     const int primID = spirv::PrimitiveId;
     const int instanceCustomIndex = spirv::InstanceCustomIndexKHR;
     const int geometryIndex = spirv::RayGeometryIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo), 8);
+    const static uint64_t STriangleGeomInfoAlignment = nbl::hlsl::alignment_of_v<STriangleGeomInfo>;
+    const STriangleGeomInfo geom = vk::BufferPointer<STriangleGeomInfo, STriangleGeomInfoAlignment>(pc.triangleGeomInfoBuffer + (instanceCustomIndex + geometryIndex) * sizeof(STriangleGeomInfo)).Get();
     const float32_t3 vertexNormal = calculateNormals(primID, geom, attribs.barycentrics);
     const float32_t3 worldNormal = normalize(mul(vertexNormal, transpose(spirv::WorldToObjectKHR)).xyz);
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
index 6571c5c67..c42d5a7df 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rgen.hlsl
@@ -79,15 +79,16 @@ void main()
 
         Material material;
         MaterialId materialId = payload.materialId;
+        const static uint64_t MaterialPackedAlignment = nbl::hlsl::alignment_of_v<MaterialPacked>;
         // we use negative index to indicate that this is a procedural geometry
         if (materialId.isHitProceduralGeom())
         {
-            const MaterialPacked materialPacked = vk::RawBufferLoad<MaterialPacked>(pc.proceduralGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(SProceduralGeomInfo));
+            const MaterialPacked materialPacked = vk::BufferPointer<MaterialPacked, MaterialPackedAlignment>(pc.proceduralGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(SProceduralGeomInfo)).Get();
             material = nbl::hlsl::_static_cast<Material>(materialPacked);
         }
         else
         {
-            const MaterialPacked materialPacked = vk::RawBufferLoad<MaterialPacked>(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo));
+            const MaterialPacked materialPacked = vk::BufferPointer<MaterialPacked, MaterialPackedAlignment>(pc.triangleGeomInfoBuffer + materialId.getMaterialIndex() * sizeof(STriangleGeomInfo)).Get();
             material = nbl::hlsl::_static_cast<Material>(materialPacked);
         }
 
diff --git a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
index 72f9beffd..551be1c8a 100644
--- a/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace.rint.hlsl
@@ -36,8 +36,9 @@ void main()
 
     const int primID = spirv::PrimitiveId;
 
+    const static uint64_t SProceduralGeomInfoAlignment = nbl::hlsl::alignment_of_v<STriangleGeomInfo>;
     // Sphere data
-    SProceduralGeomInfo sphere = vk::RawBufferLoad<SProceduralGeomInfo>(pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo));
+    SProceduralGeomInfo sphere = vk::BufferPointer<SProceduralGeomInfo, SProceduralGeomInfoAlignment>(pc.proceduralGeomInfoBuffer + primID * sizeof(SProceduralGeomInfo)).Get();
 
     const float32_t tHit = hitSphere(sphere, ray);
     
diff --git a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
index dd83f92c9..d87b8dd5d 100644
--- a/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
+++ b/71_RayTracingPipeline/app_resources/raytrace_shadow.rahit.hlsl
@@ -1,6 +1,7 @@
 #include "common.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/raytracing.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
 
 using namespace nbl::hlsl;
 
@@ -10,7 +11,8 @@ using namespace nbl::hlsl;
 void main(inout OcclusionPayload payload, in BuiltInTriangleIntersectionAttributes attribs)
 {
     const int instID = spirv::InstanceCustomIndexKHR;
-    const STriangleGeomInfo geom = vk::RawBufferLoad < STriangleGeomInfo > (pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo), 8);
+    const static uint64_t STriangleGeomInfoAlignment = nbl::hlsl::alignment_of_v<STriangleGeomInfo>;
+    const STriangleGeomInfo geom = vk::BufferPointer<STriangleGeomInfo, STriangleGeomInfoAlignment>(pc.triangleGeomInfoBuffer + instID * sizeof(STriangleGeomInfo)).Get();
     const Material material = nbl::hlsl::_static_cast<Material>(geom.material);
     
     const float attenuation = (1.f-material.alpha) * payload.attenuation;

From 08c898d5af460ba6469a78fb625216e27a1bc8a8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 27 Nov 2025 20:22:03 +0700
Subject: [PATCH 18/57] Reindex mortons example from 12 to 73

---
 {12_Mortons => 73_Mortons}/CMakeLists.txt     |  0
 {12_Mortons => 73_Mortons}/CTester.h          |  2 --
 {12_Mortons => 73_Mortons}/ITester.h          | 27 +++----------------
 .../app_resources/common.hlsl                 |  0
 .../app_resources/test.comp.hlsl              |  1 +
 .../app_resources/testCommon.hlsl             |  0
 .../config.json.template                      |  0
 {12_Mortons => 73_Mortons}/main.cpp           |  7 ++---
 {12_Mortons => 73_Mortons}/pipeline.groovy    |  0
 CMakeLists.txt                                |  2 +-
 10 files changed, 10 insertions(+), 29 deletions(-)
 rename {12_Mortons => 73_Mortons}/CMakeLists.txt (100%)
 rename {12_Mortons => 73_Mortons}/CTester.h (99%)
 rename {12_Mortons => 73_Mortons}/ITester.h (90%)
 rename {12_Mortons => 73_Mortons}/app_resources/common.hlsl (100%)
 rename {12_Mortons => 73_Mortons}/app_resources/test.comp.hlsl (96%)
 rename {12_Mortons => 73_Mortons}/app_resources/testCommon.hlsl (100%)
 rename {12_Mortons => 73_Mortons}/config.json.template (100%)
 rename {12_Mortons => 73_Mortons}/main.cpp (89%)
 rename {12_Mortons => 73_Mortons}/pipeline.groovy (100%)

diff --git a/12_Mortons/CMakeLists.txt b/73_Mortons/CMakeLists.txt
similarity index 100%
rename from 12_Mortons/CMakeLists.txt
rename to 73_Mortons/CMakeLists.txt
diff --git a/12_Mortons/CTester.h b/73_Mortons/CTester.h
similarity index 99%
rename from 12_Mortons/CTester.h
rename to 73_Mortons/CTester.h
index 5a61be501..c47e94376 100644
--- a/12_Mortons/CTester.h
+++ b/73_Mortons/CTester.h
@@ -3,8 +3,6 @@
 
 #include <nabla.h>
 #include "app_resources/testCommon.hlsl"
-#include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 #include "ITester.h"
 
 using namespace nbl;
diff --git a/12_Mortons/ITester.h b/73_Mortons/ITester.h
similarity index 90%
rename from 12_Mortons/ITester.h
rename to 73_Mortons/ITester.h
index 2510dd997..a0c76ac75 100644
--- a/12_Mortons/ITester.h
+++ b/73_Mortons/ITester.h
@@ -4,7 +4,6 @@
 #include <nabla.h>
 #include "app_resources/common.hlsl"
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 using namespace nbl;
 
@@ -45,7 +44,7 @@ class ITester
             logFail("Failed to create Command Buffers!\n");
 
         // Load shaders, set up pipeline
-        core::smart_refctd_ptr<video::IGPUShader> shader;
+        core::smart_refctd_ptr<asset::IShader> shader;
         {
             asset::IAssetLoader::SAssetLoadParams lp = {};
             lp.logger = m_logger.get();
@@ -53,31 +52,13 @@ class ITester
             auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp);
             const auto assets = assetBundle.getContents();
             if (assets.empty())
-            {
-                logFail("Could not load shader!");
-                assert(0);
-            }
+                return logFail("Could not load shader!");
 
             // It would be super weird if loading a shader from a file produced more than 1 asset
             assert(assets.size() == 1);
-            core::smart_refctd_ptr<asset::ICPUShader> source = asset::IAsset::castDown<asset::ICPUShader>(assets[0]);
-
-            auto* compilerSet = m_assetMgr->getCompilerSet();
-
-            asset::IShaderCompiler::SCompilerOptions options = {};
-            options.stage = source->getStage();
-            options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
-            options.spirvOptimizer = nullptr;
-            options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT;
-            options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
-            options.preprocessorOptions.logger = m_logger.get();
-            options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder();
-
-            auto spirv = compilerSet->compileToSPIRV(source.get(), options);
+            core::smart_refctd_ptr<asset::IShader> source = asset::IAsset::castDown<asset::IShader>(assets[0]);
 
-            video::ILogicalDevice::SShaderCreationParameters params{};
-            params.cpushader = spirv.get();
-            shader = m_device->createShader(params);
+            shader = m_device->compileShader({source.get()});
         }
 
         if (!shader)
diff --git a/12_Mortons/app_resources/common.hlsl b/73_Mortons/app_resources/common.hlsl
similarity index 100%
rename from 12_Mortons/app_resources/common.hlsl
rename to 73_Mortons/app_resources/common.hlsl
diff --git a/12_Mortons/app_resources/test.comp.hlsl b/73_Mortons/app_resources/test.comp.hlsl
similarity index 96%
rename from 12_Mortons/app_resources/test.comp.hlsl
rename to 73_Mortons/app_resources/test.comp.hlsl
index 243983d5a..d1010aeb0 100644
--- a/12_Mortons/app_resources/test.comp.hlsl
+++ b/73_Mortons/app_resources/test.comp.hlsl
@@ -8,6 +8,7 @@
 [[vk::binding(1, 0)]] RWStructuredBuffer<TestValues> outputTestValues;
 
 [numthreads(256, 1, 1)]
+[shader("compute")]
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
     if (invocationID.x == 0)
diff --git a/12_Mortons/app_resources/testCommon.hlsl b/73_Mortons/app_resources/testCommon.hlsl
similarity index 100%
rename from 12_Mortons/app_resources/testCommon.hlsl
rename to 73_Mortons/app_resources/testCommon.hlsl
diff --git a/12_Mortons/config.json.template b/73_Mortons/config.json.template
similarity index 100%
rename from 12_Mortons/config.json.template
rename to 73_Mortons/config.json.template
diff --git a/12_Mortons/main.cpp b/73_Mortons/main.cpp
similarity index 89%
rename from 12_Mortons/main.cpp
rename to 73_Mortons/main.cpp
index a05e61842..6034e3469 100644
--- a/12_Mortons/main.cpp
+++ b/73_Mortons/main.cpp
@@ -7,7 +7,7 @@
 #include <assert.h>
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
 
 #include "app_resources/common.hlsl"
 #include "CTester.h"
@@ -17,12 +17,13 @@ using namespace nbl::hlsl;
 using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::video;
+using namespace nbl::examples;
 using namespace nbl::application_templates;
 
-class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication
+class MortonTest final : public MonoDeviceApplication, public BuiltinResourcesApplication
 {
     using device_base_t = MonoDeviceApplication;
-    using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
 public:
     MortonTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
         IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {
diff --git a/12_Mortons/pipeline.groovy b/73_Mortons/pipeline.groovy
similarity index 100%
rename from 12_Mortons/pipeline.groovy
rename to 73_Mortons/pipeline.groovy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eff5154dc..b85577144 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,7 +87,7 @@ if(NBL_BUILD_EXAMPLES)
 
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
-	add_subdirectory(12_Mortons EXCLUDE_FROM_ALL)
+	add_subdirectory(73_Mortons EXCLUDE_FROM_ALL)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 7f8dd73473e47d3ea02537ed042859b913855f13 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 28 Nov 2025 20:26:58 +0700
Subject: [PATCH 19/57] Global variable of hlsl to use
 NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR

---
 .../app_resources/common.hlsl                 |  6 +--
 27_MPMCScheduler/app_resources/common.hlsl    |  6 +--
 62_CAD/shaders/geotexture/common.hlsl         |  2 +-
 62_CAD/shaders/globals.hlsl                   | 38 +++++++++----------
 .../app_resources/benchmark/common.hlsl       |  8 ++--
 .../app_resources/common.hlsl                 |  2 +-
 66_HLSLBxDFTests/app_resources/tests.hlsl     |  8 ++--
 67_RayQueryGeometry/app_resources/common.hlsl |  2 +-
 8 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/07_StagingAndMultipleQueues/app_resources/common.hlsl b/07_StagingAndMultipleQueues/app_resources/common.hlsl
index 259d5069d..de15810c9 100644
--- a/07_StagingAndMultipleQueues/app_resources/common.hlsl
+++ b/07_StagingAndMultipleQueues/app_resources/common.hlsl
@@ -1,8 +1,8 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeX = 16;
-NBL_CONSTEXPR uint32_t WorkgroupSizeY = 16;
-NBL_CONSTEXPR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSizeX = 16;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSizeY = 16;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;
 
 static const uint32_t FRAMES_IN_FLIGHT = 3u;
 
diff --git a/27_MPMCScheduler/app_resources/common.hlsl b/27_MPMCScheduler/app_resources/common.hlsl
index 2fb8971ad..2783f13a2 100644
--- a/27_MPMCScheduler/app_resources/common.hlsl
+++ b/27_MPMCScheduler/app_resources/common.hlsl
@@ -1,8 +1,8 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeX = 8;
-NBL_CONSTEXPR uint32_t WorkgroupSizeY = 8;
-NBL_CONSTEXPR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSizeX = 8;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSizeY = 8;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSize = WorkgroupSizeX*WorkgroupSizeY;
 
 struct PushConstants
 {
diff --git a/62_CAD/shaders/geotexture/common.hlsl b/62_CAD/shaders/geotexture/common.hlsl
index 691cd3d3b..f2053e003 100644
--- a/62_CAD/shaders/geotexture/common.hlsl
+++ b/62_CAD/shaders/geotexture/common.hlsl
@@ -4,7 +4,7 @@
 #include "../globals.hlsl"
 
 // Handle multiple geo textures, separate set, array of texture? index allocator? or multiple sets?
-NBL_CONSTEXPR uint32_t MaxGeoTextures = 256; 
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t MaxGeoTextures = 256; 
 
 // GeoTexture Oriented Bounding Box
 struct GeoTextureOBB
diff --git a/62_CAD/shaders/globals.hlsl b/62_CAD/shaders/globals.hlsl
index 5c3681910..7c2b7e893 100644
--- a/62_CAD/shaders/globals.hlsl
+++ b/62_CAD/shaders/globals.hlsl
@@ -352,8 +352,8 @@ static_assert(offsetof(CurveBox, curveMax[0]) == 56u);
 static_assert(sizeof(CurveBox) == 80u);
 #endif
 
-NBL_CONSTEXPR uint32_t InvalidRigidSegmentIndex = 0xffffffff;
-NBL_CONSTEXPR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>::infinity;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidRigidSegmentIndex = 0xffffffff;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float InvalidStyleStretchValue = nbl::hlsl::numeric_limits<float>::infinity;
 
 
 // TODO[Przemek]: we will need something similar to LineStyles but related to heigh shading settings which is user customizable (like  stipple patterns) and requires upper_bound to figure out the color based on height value.
@@ -547,27 +547,27 @@ inline bool operator==(const DTMSettings& lhs, const DTMSettings& rhs)
 }
 #endif
 
-NBL_CONSTEXPR uint32_t ImagesBindingArraySize = 128;
-NBL_CONSTEXPR uint32_t MainObjectIdxBits = 24u; // It will be packed next to alpha in a texture
-NBL_CONSTEXPR uint32_t AlphaBits = 32u - MainObjectIdxBits;
-NBL_CONSTEXPR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
-NBL_CONSTEXPR uint32_t InvalidStyleIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
-NBL_CONSTEXPR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
-NBL_CONSTEXPR uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t ImagesBindingArraySize = 128;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t MainObjectIdxBits = 24u; // It will be packed next to alpha in a texture
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t AlphaBits = 32u - MainObjectIdxBits;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t MaxIndexableMainObjects = (1u << MainObjectIdxBits) - 1u;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidStyleIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidDTMSettingsIdx = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidMainObjectIdx = MaxIndexableMainObjects;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidCustomProjectionIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidCustomClipRectIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t InvalidTextureIndex = nbl::hlsl::numeric_limits<uint32_t>::max;
 
 // Hatches
-NBL_CONSTEXPR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;
-NBL_CONSTEXPR MajorAxis SelectedMinorAxis = MajorAxis::MAJOR_X; //(MajorAxis) (1 - (uint32_t) SelectedMajorAxis);
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR MajorAxis SelectedMajorAxis = MajorAxis::MAJOR_Y;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR MajorAxis SelectedMinorAxis = MajorAxis::MAJOR_X; //(MajorAxis) (1 - (uint32_t) SelectedMajorAxis);
 
 // Text or MSDF Hatches
-NBL_CONSTEXPR float MSDFPixelRange = 4.0f;
-NBL_CONSTEXPR float MSDFPixelRangeHalf = MSDFPixelRange / 2.0f;
-NBL_CONSTEXPR float MSDFSize = 64.0f; 
-NBL_CONSTEXPR uint32_t MSDFMips = 4; 
-NBL_CONSTEXPR float HatchFillMSDFSceenSpaceSize = 8.0; 
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float MSDFPixelRange = 4.0f;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float MSDFPixelRangeHalf = MSDFPixelRange / 2.0f;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float MSDFSize = 64.0f; 
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t MSDFMips = 4; 
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float HatchFillMSDFSceenSpaceSize = 8.0; 
 
 inline bool isInvalidGridDtmHeightValue(float value)
 {
diff --git a/64_EmulatedFloatTest/app_resources/benchmark/common.hlsl b/64_EmulatedFloatTest/app_resources/benchmark/common.hlsl
index 98875c42f..7f6d1dec1 100644
--- a/64_EmulatedFloatTest/app_resources/benchmark/common.hlsl
+++ b/64_EmulatedFloatTest/app_resources/benchmark/common.hlsl
@@ -4,10 +4,10 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 
-NBL_CONSTEXPR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_X = 128u;
-NBL_CONSTEXPR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y = 1u;
-NBL_CONSTEXPR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z = 1u;
-NBL_CONSTEXPR uint32_t BENCHMARK_WORKGROUP_COUNT = 1024u;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_X = 128u;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y = 1u;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z = 1u;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 1024u;
 
 enum EF64_BENCHMARK_MODE
 {
diff --git a/64_EmulatedFloatTest/app_resources/common.hlsl b/64_EmulatedFloatTest/app_resources/common.hlsl
index aea1ce94d..0e8762c5a 100644
--- a/64_EmulatedFloatTest/app_resources/common.hlsl
+++ b/64_EmulatedFloatTest/app_resources/common.hlsl
@@ -8,7 +8,7 @@
 #include <nbl/builtin/hlsl/portable/vector_t.hlsl>
 #include <nbl/builtin/hlsl/portable/matrix_t.hlsl>
 
-NBL_CONSTEXPR uint32_t WORKGROUP_SIZE = 1;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WORKGROUP_SIZE = 1;
 
 using namespace nbl;
 using namespace hlsl;
diff --git a/66_HLSLBxDFTests/app_resources/tests.hlsl b/66_HLSLBxDFTests/app_resources/tests.hlsl
index 256ed3ce9..6f67c359f 100644
--- a/66_HLSLBxDFTests/app_resources/tests.hlsl
+++ b/66_HLSLBxDFTests/app_resources/tests.hlsl
@@ -356,13 +356,13 @@ struct is_microfacet_bsdf : bool_constant<
 > {};
 
 template<class T>
-NBL_CONSTEXPR bool is_basic_brdf_v = is_basic_brdf<T>::value;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR bool is_basic_brdf_v = is_basic_brdf<T>::value;
 template<class T>
-NBL_CONSTEXPR bool is_microfacet_brdf_v = is_microfacet_brdf<T>::value;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR bool is_microfacet_brdf_v = is_microfacet_brdf<T>::value;
 template<class T>
-NBL_CONSTEXPR bool is_basic_bsdf_v = is_basic_bsdf<T>::value;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR bool is_basic_bsdf_v = is_basic_bsdf<T>::value;
 template<class T>
-NBL_CONSTEXPR bool is_microfacet_bsdf_v = is_microfacet_bsdf<T>::value;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR bool is_microfacet_bsdf_v = is_microfacet_bsdf<T>::value;
 
 
 template<class BxDF, bool aniso = false>
diff --git a/67_RayQueryGeometry/app_resources/common.hlsl b/67_RayQueryGeometry/app_resources/common.hlsl
index 68a353adc..ecac0f59d 100644
--- a/67_RayQueryGeometry/app_resources/common.hlsl
+++ b/67_RayQueryGeometry/app_resources/common.hlsl
@@ -3,7 +3,7 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
-NBL_CONSTEXPR uint32_t WorkgroupSize = 16;
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t WorkgroupSize = 16;
 
 enum NormalType : uint32_t
 {

From 43b8634502fd09e1405bf9a07f55bca21d613823 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 1 Dec 2025 17:58:40 +0700
Subject: [PATCH 20/57] Add test for operator-

---
 73_Mortons/CTester.h                     | 2 ++
 73_Mortons/app_resources/common.hlsl     | 1 +
 73_Mortons/app_resources/testCommon.hlsl | 5 ++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/73_Mortons/CTester.h b/73_Mortons/CTester.h
index c47e94376..fa29f3c9c 100644
--- a/73_Mortons/CTester.h
+++ b/73_Mortons/CTester.h
@@ -42,6 +42,7 @@ class CTester final : public ITester
                 expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
                 expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
                 expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
+                expected.emulatedUnaryMinus = _static_cast<emulated_int64_t>(-generatedA);
                 expected.emulatedLess = uint32_t(generatedA < generatedB);
                 expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
                 expected.emulatedGreater = uint32_t(generatedA > generatedB);
@@ -273,6 +274,7 @@ class CTester final : public ITester
         verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
         verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
         verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
+        verifyTestValue("emulatedUnaryMinus", expectedTestValues.emulatedUnaryMinus, testValues.emulatedUnaryMinus, testType);
 
         // Morton Plus
         verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType);
diff --git a/73_Mortons/app_resources/common.hlsl b/73_Mortons/app_resources/common.hlsl
index b058ad821..18cdc058f 100644
--- a/73_Mortons/app_resources/common.hlsl
+++ b/73_Mortons/app_resources/common.hlsl
@@ -61,6 +61,7 @@ struct TestValues
 	emulated_uint64_t emulatedNot;
 	emulated_uint64_t emulatedPlus;
 	emulated_uint64_t emulatedMinus;
+	emulated_int64_t emulatedUnaryMinus;
 	// These are bools but stored as uint because you can't store bools, causes a SPIR-V issue
 	uint32_t emulatedLess;
 	uint32_t emulatedLessEqual;
diff --git a/73_Mortons/app_resources/testCommon.hlsl b/73_Mortons/app_resources/testCommon.hlsl
index 9ff9a4fa8..4ca2b859d 100644
--- a/73_Mortons/app_resources/testCommon.hlsl
+++ b/73_Mortons/app_resources/testCommon.hlsl
@@ -4,6 +4,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 {
 	emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
 	emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
+	emulated_int64_t signedEmulatedA = _static_cast<emulated_int64_t>(input.generatedA);
 
 	// Emulated int tests
 	output.emulatedAnd = emulatedA & emulatedB;
@@ -24,7 +25,9 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
 
 	arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
-	output.emulatedSignedRightShifted = signedRightShift(_static_cast<emulated_int64_t>(emulatedA), input.shift);
+	output.emulatedSignedRightShifted = signedRightShift(signedEmulatedA, input.shift);
+
+	output.emulatedUnaryMinus = signedEmulatedA.operator-();
 
 	// Morton tests
 	uint64_t2 Vec2A = { input.coordX, input.coordY };

From ba6641f6de9e107c923d3999cd7f52a8774797b7 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 1 Dec 2025 18:20:31 +0700
Subject: [PATCH 21/57] Use 1, 1, 1 workgroup dimension

---
 73_Mortons/app_resources/test.comp.hlsl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/73_Mortons/app_resources/test.comp.hlsl b/73_Mortons/app_resources/test.comp.hlsl
index d1010aeb0..60cdf94b1 100644
--- a/73_Mortons/app_resources/test.comp.hlsl
+++ b/73_Mortons/app_resources/test.comp.hlsl
@@ -3,14 +3,15 @@
 //// For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "testCommon.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 
 [[vk::binding(0, 0)]] RWStructuredBuffer<InputTestValues> inputTestValues;
 [[vk::binding(1, 0)]] RWStructuredBuffer<TestValues> outputTestValues;
 
-[numthreads(256, 1, 1)]
+[numthreads(1, 1, 1)]
 [shader("compute")]
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
-    if (invocationID.x == 0)
-        fillTestValues(inputTestValues[0], outputTestValues[0]);
+    uint32_t testID = glsl::gl_GlobalInvocationID().x;
+    fillTestValues(inputTestValues[testID], outputTestValues[testID]);
 }

From e830c3423b723e4efd9dbc2d2a981098e6830a56 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 1 Dec 2025 18:27:44 +0700
Subject: [PATCH 22/57] Enable previously failed test because of bug in glm

---
 22_CppCompat/CIntrinsicsTester.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h
index fa35d1a68..f014bd1cb 100644
--- a/22_CppCompat/CIntrinsicsTester.h
+++ b/22_CppCompat/CIntrinsicsTester.h
@@ -250,9 +250,8 @@ class CIntrinsicsTester final : public ITester
         verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType);
         verifyTestValue("addCarryResult", expectedTestValues.addCarry.result, testValues.addCarry.result, testType);
         verifyTestValue("addCarryCarry", expectedTestValues.addCarry.carry, testValues.addCarry.carry, testType);
-        // Disabled: current glm implementation is wrong
-        //verifyTestValue("subBorrowResult", expectedTestValues.subBorrow.result, testValues.subBorrow.result, testType);
-        //verifyTestValue("subBorrowBorrow", expectedTestValues.subBorrow.borrow, testValues.subBorrow.borrow, testType);
+        verifyTestValue("subBorrowResult", expectedTestValues.subBorrow.result, testValues.subBorrow.result, testType);
+        verifyTestValue("subBorrowBorrow", expectedTestValues.subBorrow.borrow, testValues.subBorrow.borrow, testType);
 
         verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType);
         verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType);
@@ -277,9 +276,8 @@ class CIntrinsicsTester final : public ITester
         verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType);
         verifyTestVector3dValue("addCarryVecResult", expectedTestValues.addCarryVec.result, testValues.addCarryVec.result, testType);
         verifyTestVector3dValue("addCarryVecCarry", expectedTestValues.addCarryVec.carry, testValues.addCarryVec.carry, testType);
-        // Disabled: current glm implementation is wrong
-        //verifyTestVector3dValue("subBorrowVecResult", expectedTestValues.subBorrowVec.result, testValues.subBorrowVec.result, testType);
-        //verifyTestVector3dValue("subBorrowVecBorrow", expectedTestValues.subBorrowVec.borrow, testValues.subBorrowVec.borrow, testType);
+        verifyTestVector3dValue("subBorrowVecResult", expectedTestValues.subBorrowVec.result, testValues.subBorrowVec.result, testType);
+        verifyTestVector3dValue("subBorrowVecBorrow", expectedTestValues.subBorrowVec.borrow, testValues.subBorrowVec.borrow, testType);
 
         verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType);
         verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType);

From f18160276e78f860f64c45111c874e3351b44ffb Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 3 Dec 2025 23:24:18 +0300
Subject: [PATCH 23/57] New example, copy of 61_UI, updated a lot, visualizer,
 still not "solid angle", rest should be shader work

---
 72_SolidAngleVisualizer/CMakeLists.txt        |   20 +
 72_SolidAngleVisualizer/README.md             |    0
 .../hlsl/SolidAngleVis.frag.hlsl              |  175 +++
 .../app_resources/hlsl/common.hlsl            |   14 +
 72_SolidAngleVisualizer/config.json.template  |   28 +
 72_SolidAngleVisualizer/include/common.hpp    |   20 +
 72_SolidAngleVisualizer/include/transform.hpp |  172 +++
 72_SolidAngleVisualizer/main.cpp              | 1105 +++++++++++++++++
 72_SolidAngleVisualizer/pipeline.groovy       |   50 +
 72_SolidAngleVisualizer/src/transform.cpp     |    0
 CMakeLists.txt                                |    1 +
 11 files changed, 1585 insertions(+)
 create mode 100644 72_SolidAngleVisualizer/CMakeLists.txt
 create mode 100644 72_SolidAngleVisualizer/README.md
 create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
 create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
 create mode 100644 72_SolidAngleVisualizer/config.json.template
 create mode 100644 72_SolidAngleVisualizer/include/common.hpp
 create mode 100644 72_SolidAngleVisualizer/include/transform.hpp
 create mode 100644 72_SolidAngleVisualizer/main.cpp
 create mode 100644 72_SolidAngleVisualizer/pipeline.groovy
 create mode 100644 72_SolidAngleVisualizer/src/transform.cpp

diff --git a/72_SolidAngleVisualizer/CMakeLists.txt b/72_SolidAngleVisualizer/CMakeLists.txt
new file mode 100644
index 000000000..5d0021f61
--- /dev/null
+++ b/72_SolidAngleVisualizer/CMakeLists.txt
@@ -0,0 +1,20 @@
+if(NBL_BUILD_IMGUI)
+	set(NBL_EXTRA_SOURCES
+		"${CMAKE_CURRENT_SOURCE_DIR}/src/transform.cpp"
+	)
+
+	set(NBL_INCLUDE_SERACH_DIRECTORIES
+		"${CMAKE_CURRENT_SOURCE_DIR}/include"
+	)
+
+	list(APPEND NBL_LIBRARIES 
+		imtestengine
+		imguizmo
+		"${NBL_EXT_IMGUI_UI_LIB}"
+	)
+	
+	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
+	nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}")
+	# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
+	# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
+endif()
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/README.md b/72_SolidAngleVisualizer/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
new file mode 100644
index 000000000..d783a5b37
--- /dev/null
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -0,0 +1,175 @@
+#pragma wave shader_stage(fragment)
+
+#include "common.hlsl"
+
+#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+
+using namespace nbl::hlsl;
+using namespace ext::FullScreenTriangle;
+
+[[vk::push_constant]] struct PushConstants pc;
+
+static const float CIRCLE_RADIUS = 0.45f;
+
+// --- Geometry Utils ---
+
+// Adjacency of edges to faces
+static const int2 edgeToFaces[12] = { 
+    {4,2}, {3,4}, {2,5}, {5,3}, 
+    {2,0}, {0,3}, {1,2}, {3,1}, 
+    {0,4}, {5,0}, {4,1}, {1,5} 
+};
+
+static const float3 localNormals[6] = {
+    float3(0, 0, -1), // Face 0 (Z-)
+    float3(0, 0, 1),  // Face 1 (Z+)
+    float3(-1, 0, 0), // Face 2 (X-)
+    float3(1, 0, 0),  // Face 3 (X+)
+    float3(0, -1, 0), // Face 4 (Y-)
+    float3(0, 1, 0)   // Face 5 (Y+)
+};
+
+static float3 corners[8];
+static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), 
+                            float3(0,0,0), float3(0,0,0), float3(0,0,0) };
+static float2 projCorners[8];
+
+
+// Converts UV into centered, aspect-corrected NDC circle space
+float2 toCircleSpace(float2 uv)
+{
+    float aspect = pc.viewport.z / pc.viewport.w;
+    float2 centered = uv - 0.5f;
+    centered.x *= aspect;
+    return centered;
+}
+
+// Distance to a 2D line segment
+float sdSegment(float2 p, float2 a, float2 b)
+{
+    float2 pa = p - a;
+    float2 ba = b - a;
+    float h = clamp(dot(pa, ba) / dot(ba, ba), 0.0f, 1.0f);
+    return length(pa - ba * h);
+}
+
+// TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection)
+float2 project(float3 p) 
+{
+    return normalize(p).xy;
+}
+
+void computeCubeGeo()
+{
+    for (int i = 0; i < 8; i++)
+    {
+        float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f;
+        float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
+        
+        corners[i] = worldPos;
+        
+        faceCenters[i/4]      += worldPos / 4.0f; 
+        faceCenters[2+i%2]    += worldPos / 4.0f; 
+        faceCenters[4+(i/2)%2] += worldPos / 4.0f; 
+
+        float3 viewPos = worldPos; 
+        projCorners[i] = project(viewPos);
+    }
+}
+
+int getVisibilityCount(int2 faces, float3 cameraPos)
+{
+    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
+    float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
+    float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
+    
+    float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; 
+    float3 viewVec_f2 = faceCenters[faces.y] - cameraPos;
+
+    // Face is visible if its outward normal points towards the origin (camera).
+    bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f;
+    bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f;
+
+    // Determine Line Style:
+    bool isSilhouette = visible1 != visible2; // One face visible, the other hidden
+    bool isInner = visible1 && visible2;      // Both faces visible
+    
+    int visibilityCount = 0;
+    if (isSilhouette) 
+    {
+        visibilityCount = 1;
+    }
+    else if (isInner)
+    {
+        visibilityCount = 2;
+    }
+
+    return visibilityCount;
+}
+
+void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth)
+{
+    if (visibilityCount > 0)
+    {
+        float3 A = corners[a];
+        float3 B = corners[b];
+
+        float avgDepth = (length(A) + length(B)) * 0.5f;
+        float referenceDepth = 3.0f;
+        float depthScale = referenceDepth / avgDepth;
+
+        float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f;
+        float intensity = (visibilityCount == 1) ? 1.0f : 0.5f;
+        float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red
+        
+        float width = min(baseWidth * depthScale, 0.03f); 
+        
+        float dist = sdSegment(p, projCorners[a], projCorners[b]);
+        
+        float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
+        
+        color += edgeColor * alpha * intensity;
+    }
+}
+
+void drawRing(float2 p, inout float4 color, float aaWidth)
+{
+    float positionLength = length(p);
+
+    // Mask to cut off drawing outside the circle
+    // float circleMask = 1.0f - smoothstep(CIRCLE_RADIUS, CIRCLE_RADIUS + aaWidth, positionLength);
+    // color *= circleMask;
+    
+    // Add a white background circle ring
+    float ringWidth = 0.005f;
+    float ringDistance = abs(positionLength - CIRCLE_RADIUS);
+    float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
+    
+    // Ring color is now white
+    color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); 
+}
+
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
+{
+    float3 cameraPos = float3(0, 0, 0); // Camera at origin
+    float2 p = toCircleSpace(vx.uv);
+    float4 color = float4(0, 0, 0, 0);
+
+    computeCubeGeo();
+    
+    float aaWidth = max(fwidth(p.x), fwidth(p.y)); 
+
+    for (int j = 0; j < 12; j++)
+    {
+        int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
+        int b = a + (4 >> (j / 4));
+
+        int2 faces = edgeToFaces[j];
+        int visibilityCount = getVisibilityCount(faces, cameraPos);
+        drawLine(p, a, b, visibilityCount, color, aaWidth);
+    }
+
+    drawRing(p, color, aaWidth);
+
+    return color;
+}
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
new file mode 100644
index 000000000..80368d08f
--- /dev/null
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
@@ -0,0 +1,14 @@
+#ifndef _SOLID_ANGLE_VIS_COMMON_HLSL_
+#define _SOLID_ANGLE_VIS_COMMON_HLSL_
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+
+
+struct PushConstants
+{
+	nbl::hlsl::float32_t3x4 modelMatrix;
+	nbl::hlsl::float32_t4 viewport;
+};
+
+
+#endif // _SOLID_ANGLE_VIS_COMMON_HLSL_
diff --git a/72_SolidAngleVisualizer/config.json.template b/72_SolidAngleVisualizer/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/72_SolidAngleVisualizer/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/include/common.hpp b/72_SolidAngleVisualizer/include/common.hpp
new file mode 100644
index 000000000..2e8e985dd
--- /dev/null
+++ b/72_SolidAngleVisualizer/include/common.hpp
@@ -0,0 +1,20 @@
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+
+
+#include "nbl/examples/examples.hpp"
+
+// the example's headers
+#include "transform.hpp"
+#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
new file mode 100644
index 000000000..002a9d215
--- /dev/null
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -0,0 +1,172 @@
+#ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+
+
+#include "nbl/ui/ICursorControl.h"
+
+#include "nbl/ext/ImGui/ImGui.h"
+
+#include "imgui/imgui_internal.h"
+#include "imguizmo/ImGuizmo.h"
+
+
+struct TransformRequestParams
+{
+	float camDistance = 8.f;
+	uint8_t sceneTexDescIx = ~0;
+	bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false;
+};
+
+struct TransformReturnInfo
+{
+	nbl::hlsl::uint16_t2 sceneResolution = { 2048,1024 };
+	bool isGizmoWindowHovered;
+	bool isGizmoBeingUsed;
+};
+
+TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params)
+{
+	static ImGuizmo::OPERATION mCurrentGizmoOperation(ImGuizmo::TRANSLATE);
+	static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL);
+	static bool useSnap = false;
+	static float snap[3] = { 1.f, 1.f, 1.f };
+	static float bounds[] = { -0.5f, -0.5f, -0.5f, 0.5f, 0.5f, 0.5f };
+	static float boundsSnap[] = { 0.1f, 0.1f, 0.1f };
+	static bool boundSizing = false;
+	static bool boundSizingSnap = false;
+
+	if (params.editTransformDecomposition)
+	{
+		if (ImGui::IsKeyPressed(ImGuiKey_T))
+			mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
+		if (ImGui::IsKeyPressed(ImGuiKey_R))
+			mCurrentGizmoOperation = ImGuizmo::ROTATE;
+		if (ImGui::IsKeyPressed(ImGuiKey_S))
+			mCurrentGizmoOperation = ImGuizmo::SCALE;
+		if (ImGui::RadioButton("Translate", mCurrentGizmoOperation == ImGuizmo::TRANSLATE))
+			mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
+		ImGui::SameLine();
+		if (ImGui::RadioButton("Rotate", mCurrentGizmoOperation == ImGuizmo::ROTATE))
+			mCurrentGizmoOperation = ImGuizmo::ROTATE;
+		ImGui::SameLine();
+		if (ImGui::RadioButton("Scale", mCurrentGizmoOperation == ImGuizmo::SCALE))
+			mCurrentGizmoOperation = ImGuizmo::SCALE;
+		if (ImGui::RadioButton("Universal", mCurrentGizmoOperation == ImGuizmo::UNIVERSAL))
+			mCurrentGizmoOperation = ImGuizmo::UNIVERSAL;
+		float matrixTranslation[3], matrixRotation[3], matrixScale[3];
+		ImGuizmo::DecomposeMatrixToComponents(matrix, matrixTranslation, matrixRotation, matrixScale);
+		ImGui::InputFloat3("Tr", matrixTranslation);
+		ImGui::InputFloat3("Rt", matrixRotation);
+		ImGui::InputFloat3("Sc", matrixScale);
+		ImGuizmo::RecomposeMatrixFromComponents(matrixTranslation, matrixRotation, matrixScale, matrix);
+
+		if (mCurrentGizmoOperation != ImGuizmo::SCALE)
+		{
+			if (ImGui::RadioButton("Local", mCurrentGizmoMode == ImGuizmo::LOCAL))
+				mCurrentGizmoMode = ImGuizmo::LOCAL;
+			ImGui::SameLine();
+			if (ImGui::RadioButton("World", mCurrentGizmoMode == ImGuizmo::WORLD))
+				mCurrentGizmoMode = ImGuizmo::WORLD;
+		}
+		if (ImGui::IsKeyPressed(ImGuiKey_S) && ImGui::IsKeyPressed(ImGuiKey_LeftShift))
+			useSnap = !useSnap;
+		ImGui::Checkbox("##UseSnap", &useSnap);
+		ImGui::SameLine();
+
+		switch (mCurrentGizmoOperation)
+		{
+		case ImGuizmo::TRANSLATE:
+			ImGui::InputFloat3("Snap", &snap[0]);
+			break;
+		case ImGuizmo::ROTATE:
+			ImGui::InputFloat("Angle Snap", &snap[0]);
+			break;
+		case ImGuizmo::SCALE:
+			ImGui::InputFloat("Scale Snap", &snap[0]);
+			break;
+		}
+		ImGui::Checkbox("Bound Sizing", &boundSizing);
+		if (boundSizing)
+		{
+			ImGui::PushID(3);
+			ImGui::Checkbox("##BoundSizing", &boundSizingSnap);
+			ImGui::SameLine();
+			ImGui::InputFloat3("Snap", boundsSnap);
+			ImGui::PopID();
+		}
+	}
+
+	ImGuiIO& io = ImGui::GetIO();
+	float viewManipulateRight = io.DisplaySize.x;
+	float viewManipulateTop = 0;
+	static ImGuiWindowFlags gizmoWindowFlags = 0;
+
+	/*
+		for the "useWindow" case we just render to a gui area, 
+		otherwise to fake full screen transparent window
+
+		note that for both cases we make sure gizmo being 
+		rendered is aligned to our texture scene using 
+        imgui  "cursor" screen positions
+	*/
+// TODO: this shouldn't be handled here I think
+	SImResourceInfo info;
+	info.textureID = params.sceneTexDescIx;
+	info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER;
+
+	TransformReturnInfo retval;
+	if (params.useWindow)
+	{
+		ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing);
+		ImGui::SetNextWindowPos(ImVec2(400, 20), ImGuiCond_Appearing);
+		ImGui::PushStyleColor(ImGuiCol_WindowBg, (ImVec4)ImColor(0.35f, 0.3f, 0.3f));
+		ImGui::Begin("Gizmo", 0, gizmoWindowFlags);
+		ImGuizmo::SetDrawlist();
+
+		ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
+		ImVec2 windowPos = ImGui::GetWindowPos();
+		ImVec2 cursorPos = ImGui::GetCursorScreenPos();
+
+		ImGui::Image(info, contentRegionSize);
+		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
+		retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y};
+		retval.isGizmoWindowHovered = ImGui::IsWindowHovered();
+
+		viewManipulateRight = cursorPos.x + contentRegionSize.x;
+		viewManipulateTop = cursorPos.y;
+
+		ImGuiWindow* window = ImGui::GetCurrentWindow();
+		gizmoWindowFlags = (ImGui::IsWindowHovered() && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0);
+	}
+	else
+	{
+		ImGui::SetNextWindowPos(ImVec2(0, 0));
+		ImGui::SetNextWindowSize(io.DisplaySize);
+		ImGui::PushStyleColor(ImGuiCol_WindowBg, ImVec4(0, 0, 0, 0)); // fully transparent fake window
+		ImGui::Begin("FullScreenWindow", nullptr, ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoScrollWithMouse | ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoBringToFrontOnFocus | ImGuiWindowFlags_NoBackground | ImGuiWindowFlags_NoInputs);
+
+		ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
+		ImVec2 cursorPos = ImGui::GetCursorScreenPos();
+
+		ImGui::Image(info, contentRegionSize);
+		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
+		retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y};
+		retval.isGizmoWindowHovered = ImGui::IsWindowHovered();
+
+		viewManipulateRight = cursorPos.x + contentRegionSize.x;
+		viewManipulateTop = cursorPos.y;
+	}
+
+	ImGuizmo::Manipulate(cameraView, cameraProjection, mCurrentGizmoOperation, mCurrentGizmoMode, matrix, NULL, useSnap ? &snap[0] : NULL, boundSizing ? bounds : NULL, boundSizingSnap ? boundsSnap : NULL);
+	retval.isGizmoBeingUsed = ImGuizmo::IsOver() || (ImGuizmo::IsUsing() && ImGui::IsMouseDown(ImGuiMouseButton_Left));
+
+	if(params.enableViewManipulate)
+		ImGuizmo::ViewManipulate(cameraView, params.camDistance, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010);
+
+	ImGui::End();
+	ImGui::PopStyleColor();
+
+	return retval;
+}
+
+#endif // __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
new file mode 100644
index 000000000..b6d723e70
--- /dev/null
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -0,0 +1,1105 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+#include "common.hpp"
+#include "app_resources/hlsl/common.hlsl"
+
+#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
+
+/*
+Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window.
+
+Written with Nabla's UI extension and got integrated with ImGuizmo to handle scene's object translations.
+*/
+class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinResourcesApplication
+{
+	using device_base_t = MonoWindowApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+	inline static std::string SolidAngleVisShaderPath = "app_resources/hlsl/SolidAngleVis.frag.hlsl";
+public:
+	inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD)
+		: IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD),
+		device_base_t({ 2048,1024 }, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {
+	}
+
+	inline bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i,1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		const uint32_t addtionalBufferOwnershipFamilies[] = { getGraphicsQueue()->getFamilyIndex() };
+		m_scene = CGeometryCreatorScene::create(
+			{
+				.transferQueue = getTransferUpQueue(),
+				.utilities = m_utils.get(),
+				.logger = m_logger.get(),
+				.addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies
+			},
+			CSimpleDebugRenderer::DefaultPolygonGeometryPatch
+		);
+
+		// for the scene drawing pass
+		{
+			IGPURenderpass::SCreationParams params = {};
+			const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = {
+				{{
+					{
+						.format = sceneRenderDepthFormat,
+						.samples = IGPUImage::ESCF_1_BIT,
+						.mayAlias = false
+					},
+				/*.loadOp =*/ {IGPURenderpass::LOAD_OP::CLEAR},
+				/*.storeOp =*/ {IGPURenderpass::STORE_OP::STORE},
+				/*.initialLayout =*/ {IGPUImage::LAYOUT::UNDEFINED},
+				/*.finalLayout =*/ {IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}
+			}},
+			IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd
+			};
+			params.depthStencilAttachments = depthAttachments;
+			const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = {
+				{{
+					{
+						.format = finalSceneRenderFormat,
+						.samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
+						.mayAlias = false
+					},
+				/*.loadOp =*/ IGPURenderpass::LOAD_OP::CLEAR,
+				/*.storeOp =*/ IGPURenderpass::STORE_OP::STORE,
+				/*.initialLayout =*/ IGPUImage::LAYOUT::UNDEFINED,
+				/*.finalLayout =*/ IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read
+			}},
+			IGPURenderpass::SCreationParams::ColorAttachmentsEnd
+			};
+			params.colorAttachments = colorAttachments;
+			IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = {
+				{},
+				IGPURenderpass::SCreationParams::SubpassesEnd
+			};
+			subpasses[0].depthStencilAttachment = { {.render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}} };
+			subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} };
+			params.subpasses = subpasses;
+
+			const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+				// wipe-transition of Color to ATTACHMENT_OPTIMAL and depth
+				{
+					.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+					.dstSubpass = 0,
+					.memoryBarrier = {
+					// last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later
+					// while color is sampled by ImGUI
+					.srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT,
+					// don't want any writes to be available, as we are clearing both attachments
+					.srcAccessMask = ACCESS_FLAGS::NONE,
+					// destination needs to wait as early as possible
+					// TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h`
+					.dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					// because depth and color get cleared first no read mask
+					.dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+				// leave view offsets and flags default
+			},
+			{
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier = {
+					// last place where the color can get modified, depth is implicitly earlier
+					.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					// only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else
+					.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT,
+					// the ImGUI will sample the color, then next frame we overwrite both attachments
+					.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT,
+					// but we only care about the availability-visibility chain between renderpass and imgui 
+					.dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT
+				}
+				// leave view offsets and flags default
+			},
+			IGPURenderpass::SCreationParams::DependenciesEnd
+			};
+			params.dependencies = dependencies;
+			auto solidAngleRenderpassParams = params;
+			m_mainRenderpass = m_device->createRenderpass(std::move(params));
+			if (!m_mainRenderpass)
+				return logFail("Failed to create Main Renderpass!");
+
+			m_solidAngleRenderpass = m_device->createRenderpass(std::move(solidAngleRenderpassParams));
+			if (!m_solidAngleRenderpass)
+				return logFail("Failed to create Solid Angle Renderpass!");
+
+		}
+
+		const auto& geometries = m_scene->getInitParams().geometries;
+		m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, { &geometries.front().get(),geometries.size() });
+		// special case
+		{
+			const auto& pipelines = m_renderer->getInitParams().pipelines;
+			auto ix = 0u;
+			for (const auto& name : m_scene->getInitParams().geometryNames)
+			{
+				if (name == "Cone")
+					m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone];
+				ix++;
+			}
+		}
+		// we'll only display one thing at a time
+		m_renderer->m_instances.resize(1);
+
+		// Create graphics pipeline
+		{
+			auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "") -> smart_refctd_ptr<IShader>
+				{
+					IAssetLoader::SAssetLoadParams lp = {};
+					lp.workingDirectory = localInputCWD;
+					auto assetBundle = m_assetMgr->getAsset(pathToShader, lp);
+					const auto assets = assetBundle.getContents();
+					if (assets.empty())
+					{
+						m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					auto source = smart_refctd_ptr_static_cast<IShader>(assets[0]);
+					// The down-cast should not fail!
+					assert(source);
+
+					auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
+					CHLSLCompiler::SOptions options = {};
+					options.stage = IShader::E_SHADER_STAGE::ESS_FRAGMENT;
+					options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+					options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+					ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+					auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+					options.spirvOptimizer = opt.get();
+#endif
+					options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+					options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+					options.preprocessorOptions.logger = m_logger.get();
+					options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
+
+					core::vector<IShaderCompiler::SMacroDefinition> defines;
+					if (!defineMacro.empty())
+						defines.push_back({ defineMacro, "" });
+
+					options.preprocessorOptions.extraDefines = defines;
+
+					source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+
+					auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr });
+					if (!shader)
+					{
+						m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader);
+						std::exit(-1);
+					}
+
+					return shader;
+				};
+
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
+			if (!fsTriProtoPPln)
+				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
+
+			// Load Fragment Shader
+			auto fragmentShader = loadAndCompileHLSLShader(SolidAngleVisShaderPath);
+			if (!fragmentShader)
+				return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
+
+			const IGPUPipelineBase::SShaderSpecInfo fragSpec = {
+				.shader = fragmentShader.get(),
+				.entryPoint = "main"
+			};
+
+			const asset::SPushConstantRange ranges[] = { {
+				.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT,
+				.offset = 0,
+				.size = sizeof(PushConstants)
+			} };
+
+			auto visualizationLayout = m_device->createPipelineLayout(
+				ranges,
+				nullptr,
+				nullptr,
+				nullptr,
+				nullptr
+			);
+			m_visualizationPipeline = fsTriProtoPPln.createPipeline(fragSpec, visualizationLayout.get(), m_solidAngleRenderpass.get());
+			if (!m_visualizationPipeline)
+				return logFail("Could not create Graphics Pipeline!");
+
+		}
+
+		// Create ImGUI
+		{
+			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+			ext::imgui::UI::SCreationParameters params = {};
+			params.resources.texturesInfo = { .setIx = 0u,.bindingIx = TexturesImGUIBindingIndex };
+			params.resources.samplersInfo = { .setIx = 0u,.bindingIx = 1u };
+			params.utilities = m_utils;
+			params.transfer = getTransferUpQueue();
+			params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures);
+			params.assetManager = make_smart_refctd_ptr<IAssetManager>(smart_refctd_ptr(m_system));
+			params.renderpass = smart_refctd_ptr<IGPURenderpass>(scRes->getRenderpass());
+			params.subpassIx = 0u;
+			params.pipelineCache = nullptr;
+			interface.imGUI = ext::imgui::UI::create(std::move(params));
+			if (!interface.imGUI)
+				return logFail("Failed to create `nbl::ext::imgui::UI` class");
+		}
+
+		// create rest of User Interface
+		{
+			auto* imgui = interface.imGUI.get();
+			// create the suballocated descriptor set
+			{
+				// note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources
+				const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u);
+				auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, { &layout,1 });
+				auto ds = pool->createDescriptorSet(smart_refctd_ptr<const IGPUDescriptorSetLayout>(layout));
+				interface.subAllocDS = make_smart_refctd_ptr<SubAllocatedDescriptorSet>(std::move(ds));
+				if (!interface.subAllocDS)
+					return logFail("Failed to create the descriptor set");
+				// make sure Texture Atlas slot is taken for eternity
+				{
+					auto dummy = SubAllocatedDescriptorSet::invalid_value;
+					interface.subAllocDS->multi_allocate(0, 1, &dummy);
+					assert(dummy == ext::imgui::UI::FontAtlasTexId);
+				}
+				// write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout
+				IGPUDescriptorSet::SDescriptorInfo info = {};
+				info.desc = smart_refctd_ptr<nbl::video::IGPUImageView>(interface.imGUI->getFontAtlasView());
+				info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL;
+				const IGPUDescriptorSet::SWriteDescriptorSet write = {
+					.dstSet = interface.subAllocDS->getDescriptorSet(),
+					.binding = TexturesImGUIBindingIndex,
+					.arrayElement = ext::imgui::UI::FontAtlasTexId,
+					.count = 1,
+					.info = &info
+				};
+				if (!m_device->updateDescriptorSets({ &write,1 }, {}))
+					return logFail("Failed to write the descriptor set");
+			}
+			imgui->registerListener([this]() {interface(); });
+		}
+
+		interface.camera.mapKeysToWASD();
+
+		onAppInitializedFinish();
+		return true;
+	}
+
+	//
+	virtual inline bool onAppTerminated()
+	{
+		SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId;
+		IGPUDescriptorSet::SDropDescriptorSet dummy[1];
+		interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx);
+		return device_base_t::onAppTerminated();
+	}
+
+	inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override
+	{
+		// CPU events
+		update(nextPresentationTimestamp);
+
+		const auto& virtualWindowRes = interface.transformReturnInfo.sceneResolution;
+		// TODO: check main frame buffer too
+		if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualWindowRes[1])
+			recreateFramebuffer(virtualWindowRes);
+
+		//
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		auto* const cb = m_cmdBufs.data()[resourceIx].get();
+		cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		// clear to black for both things
+		const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
+		if (m_solidAngleViewFramebuffer)
+		{
+			cb->beginDebugMarker("Draw Circle View Frame");
+			{
+				const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f };
+				const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
+				{
+					.framebuffer = m_solidAngleViewFramebuffer.get(),
+					.colorClearValues = &clearValue,
+					.depthStencilClearValues = &farValue,
+					.renderArea = {
+						.offset = {0,0},
+						.extent = {virtualWindowRes[0],virtualWindowRes[1]}
+					}
+				};
+				beginRenderpass(cb, renderpassInfo);
+			}
+			// draw scene
+			{
+				PushConstants pc{
+					.modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)),
+					.viewport = { 0.f,0.f,static_cast<float>(virtualWindowRes[0]),static_cast<float>(virtualWindowRes[1]) }
+				};
+				auto pipeline = m_visualizationPipeline;
+				cb->bindGraphicsPipeline(pipeline.get());
+				cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
+				//cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 3, 1, &ds);
+				ext::FullScreenTriangle::recordDrawCall(cb);
+			}
+			cb->endRenderPass();
+			cb->endDebugMarker();
+		}
+		// draw main view
+		if (m_mainViewFramebuffer)
+		{
+			cb->beginDebugMarker("Main Scene Frame");
+			{
+				const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f };
+				const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
+				{
+					.framebuffer = m_mainViewFramebuffer.get(),
+					.colorClearValues = &clearValue,
+					.depthStencilClearValues = &farValue,
+					.renderArea = {
+						.offset = {0,0},
+						.extent = {virtualWindowRes[0],virtualWindowRes[1]}
+					}
+				};
+				beginRenderpass(cb, renderpassInfo);
+			}
+			// draw scene
+			{
+				float32_t3x4 viewMatrix;
+				float32_t4x4 viewProjMatrix;
+				// TODO: get rid of legacy matrices
+				{
+					const auto& camera = interface.camera;
+					memcpy(&viewMatrix, camera.getViewMatrix().pointer(), sizeof(viewMatrix));
+					memcpy(&viewProjMatrix, camera.getConcatenatedMatrix().pointer(), sizeof(viewProjMatrix));
+				}
+				const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix);
+
+				// tear down scene every frame
+				auto& instance = m_renderer->m_instances[0];
+				auto transposed = hlsl::transpose(interface.m_OBBModelMatrix);
+				memcpy(&instance.world, &transposed, sizeof(instance.world));
+				instance.packedGeo = m_renderer->getGeometries().data();// +interface.gcIndex;
+				m_renderer->render(cb, viewParams); // draw the cube/OBB
+
+
+				// TODO: a better way to get identity matrix
+				float32_t3x4 origin = {
+					0.2f,0.0f,0.0f,0.0f,
+					0.0f,0.2f,0.0f,0.0f,
+					0.0f,0.0f,0.2f,0.0f
+				};
+				memcpy(&instance.world, &origin, sizeof(instance.world));
+				instance.packedGeo = m_renderer->getGeometries().data() + 3; // sphere
+				m_renderer->render(cb, viewParams);
+			}
+			cb->endRenderPass();
+			cb->endDebugMarker();
+		}
+		{
+			cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame");
+			{
+				auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
+				const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
+				{
+					.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex),
+					.colorClearValues = &clearValue,
+					.depthStencilClearValues = nullptr,
+					.renderArea = {
+						.offset = {0,0},
+						.extent = {m_window->getWidth(),m_window->getHeight()}
+					}
+				};
+				beginRenderpass(cb, renderpassInfo);
+			}
+			// draw ImGUI
+			{
+				auto* imgui = interface.imGUI.get();
+				auto* pipeline = imgui->getPipeline();
+				cb->bindGraphicsPipeline(pipeline);
+				// note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx
+				const auto* ds = interface.subAllocDS->getDescriptorSet();
+				cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds);
+				// a timepoint in the future to release streaming resources for geometry
+				const ISemaphore::SWaitInfo drawFinished = { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1u };
+				if (!imgui->render(cb, drawFinished))
+				{
+					m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR);
+					return {};
+				}
+			}
+			cb->endRenderPass();
+			cb->endDebugMarker();
+		}
+		cb->end();
+
+		IQueue::SSubmitInfo::SSemaphoreInfo retval =
+		{
+			.semaphore = m_semaphore.get(),
+			.value = ++m_realFrameIx,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS
+		};
+		const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+		{
+			{.cmdbuf = cb }
+		};
+		const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = {
+			{
+				.semaphore = device_base_t::getCurrentAcquire().semaphore,
+				.value = device_base_t::getCurrentAcquire().acquireCount,
+				.stageMask = PIPELINE_STAGE_FLAGS::NONE
+			}
+		};
+		const IQueue::SSubmitInfo infos[] =
+		{
+			{
+				.waitSemaphores = acquired,
+				.commandBuffers = commandBuffers,
+				.signalSemaphores = {&retval,1}
+			}
+		};
+
+		if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS)
+		{
+			retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal
+			m_realFrameIx--;
+		}
+
+
+		m_window->setCaption("[Nabla Engine] UI App Test Demo");
+		return retval;
+	}
+
+protected:
+	const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override
+	{
+		// Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present
+		const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = {
+			// don't want any writes to be available, we'll clear, only thing to worry about is the layout transition
+			{
+				.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.dstSubpass = 0,
+				.memoryBarrier = {
+					.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway 
+					.srcAccessMask = ACCESS_FLAGS::NONE,
+					// layout transition needs to finish before the color write
+					.dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+					.dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				}
+			// leave view offsets and flags default
+			},
+			// want layout transition to begin after all color output is done
+			{
+				.srcSubpass = 0,
+				.dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External,
+				.memoryBarrier = {
+				// last place where the color can get modified, depth is implicitly earlier
+				.srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT,
+				// only write ops, reads can't be made available
+				.srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT
+				// spec says nothing is needed when presentation is the destination
+			}
+			// leave view offsets and flags default
+		},
+		IGPURenderpass::SCreationParams::DependenciesEnd
+		};
+		return dependencies;
+	}
+
+private:
+	inline void update(const std::chrono::microseconds nextPresentationTimestamp)
+	{
+		auto& camera = interface.camera;
+		camera.setMoveSpeed(interface.moveSpeed);
+		camera.setRotateSpeed(interface.rotateSpeed);
+
+
+		m_inputSystem->getDefaultMouse(&mouse);
+		m_inputSystem->getDefaultKeyboard(&keyboard);
+
+		struct
+		{
+			std::vector<SMouseEvent> mouse{};
+			std::vector<SKeyboardEvent> keyboard{};
+		} uiEvents;
+
+		// TODO: should be a member really
+		static std::chrono::microseconds previousEventTimestamp{};
+
+		// I think begin/end should always be called on camera, just events shouldn't be fed, why?
+		// If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to
+		// `perActionDt` becoming obnoxiously large the first time the even processing resumes due to
+		// `timeDiff` being computed since `lastVirtualUpTimeStamp` 
+		camera.beginInputProcessing(nextPresentationTimestamp);
+		{
+			mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void
+				{
+					if (interface.move)
+						camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						uiEvents.mouse.emplace_back(e);
+
+						//if (e.type == nbl::ui::SMouseEvent::EET_SCROLL && m_renderer)
+						//{
+						//	interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll));
+						//	interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1);
+						//}
+					}
+				},
+				m_logger.get()
+			);
+			keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
+				{
+					//if (interface.move)
+						camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
+
+					for (const auto& e : events) // here capture
+					{
+						if (e.timeStamp < previousEventTimestamp)
+							continue;
+
+						previousEventTimestamp = e.timeStamp;
+						uiEvents.keyboard.emplace_back(e);
+					}
+				},
+				m_logger.get()
+			);
+		}
+		camera.endInputProcessing(nextPresentationTimestamp);
+
+		const auto cursorPosition = m_window->getCursorControl()->getPosition();
+
+		ext::imgui::UI::SUpdateParameters params =
+		{
+			.mousePosition = float32_t2(cursorPosition.x,cursorPosition.y) - float32_t2(m_window->getX(),m_window->getY()),
+			.displaySize = {m_window->getWidth(),m_window->getHeight()},
+			.mouseEvents = uiEvents.mouse,
+			.keyboardEvents = uiEvents.keyboard
+		};
+
+		//interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex];
+		interface.imGUI->update(params);
+	}
+
+	void recreateFramebuffer(const uint16_t2 resolution)
+	{
+		auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr<IGPUImageView>
+			{
+				auto image = m_device->createImage({ {
+					.type = IGPUImage::ET_2D,
+					.samples = IGPUImage::ESCF_1_BIT,
+					.format = format,
+					.extent = {resolution.x,resolution.y,1},
+					.mipLevels = 1,
+					.arrayLayers = 1,
+					.usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT
+				} });
+				if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid())
+					return nullptr;
+				IGPUImageView::SCreationParams params = {
+					.image = std::move(image),
+					.viewType = IGPUImageView::ET_2D,
+					.format = format
+				};
+				params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT;
+				return m_device->createImageView(std::move(params));
+			};
+
+		smart_refctd_ptr<IGPUImageView> solidAngleView;
+		smart_refctd_ptr<IGPUImageView> mainView;
+		// detect window minimization
+		if (resolution.x < 0x4000 && resolution.y < 0x4000)
+		{
+			solidAngleView = createImageAndView(finalSceneRenderFormat);
+			auto solidAngleDepthView = createImageAndView(sceneRenderDepthFormat);
+			m_solidAngleViewFramebuffer = m_device->createFramebuffer({ {
+				.renderpass = m_solidAngleRenderpass,
+				.depthStencilAttachments = &solidAngleDepthView.get(),
+				.colorAttachments = &solidAngleView.get(),
+				.width = resolution.x,
+				.height = resolution.y
+			} });
+
+			mainView = createImageAndView(finalSceneRenderFormat);
+			auto mainDepthView = createImageAndView(sceneRenderDepthFormat);
+			m_mainViewFramebuffer = m_device->createFramebuffer({ {
+					.renderpass = m_mainRenderpass,
+					.depthStencilAttachments = &mainDepthView.get(),
+					.colorAttachments = &mainView.get(),
+					.width = resolution.x,
+					.height = resolution.y
+				} });
+
+		}
+		else
+		{
+			m_solidAngleViewFramebuffer = nullptr;
+			m_mainViewFramebuffer = nullptr;
+		}
+
+		// release previous slot and its image
+		interface.subAllocDS->multi_deallocate(0, static_cast<int>(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx });
+		//
+		if (solidAngleView)
+		{
+			interface.subAllocDS->multi_allocate(0, static_cast<int>(CInterface::Count), interface.renderColorViewDescIndices);
+			// update descriptor set
+			IGPUDescriptorSet::SDescriptorInfo infos[static_cast<int>(CInterface::Count)] = {};
+			infos[0].desc = solidAngleView;
+			infos[0].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
+			infos[1].desc = mainView;
+			infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
+			const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast<int>(CInterface::Count)] = {
+				{.dstSet = interface.subAllocDS->getDescriptorSet(),
+				.binding = TexturesImGUIBindingIndex,
+				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_SOLID_ANGLE_VIEW)],
+				.count = 1,
+				.info = &infos[static_cast<int>(CInterface::ERV_MAIN_VIEW)]
+				},
+				{
+				.dstSet = interface.subAllocDS->getDescriptorSet(),
+				.binding = TexturesImGUIBindingIndex,
+				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_MAIN_VIEW)],
+				.count = 1,
+				.info = &infos[1]
+				}
+			};
+			m_device->updateDescriptorSets({ write, static_cast<int>(CInterface::Count) }, {});
+		}
+		interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW];
+	}
+
+	inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info)
+	{
+		cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE);
+		cb->setScissor(0, 1, &info.renderArea);
+		const SViewport viewport = {
+			.x = 0,
+			.y = 0,
+			.width = static_cast<float>(info.renderArea.extent.width),
+			.height = static_cast<float>(info.renderArea.extent.height)
+		};
+		cb->setViewport(0u, 1u, &viewport);
+	}
+
+	// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
+	constexpr static inline uint32_t MaxFramesInFlight = 3u;
+	constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT;
+	constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB;
+	constexpr static inline auto TexturesImGUIBindingIndex = 0u;
+	// we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes
+	constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight;
+
+	//
+	smart_refctd_ptr<CGeometryCreatorScene> m_scene;
+	smart_refctd_ptr<IGPURenderpass> m_solidAngleRenderpass;
+	smart_refctd_ptr<IGPURenderpass> m_mainRenderpass;
+	smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
+	smart_refctd_ptr<IGPUFramebuffer> m_solidAngleViewFramebuffer;
+	smart_refctd_ptr<IGPUFramebuffer> m_mainViewFramebuffer;
+	smart_refctd_ptr<video::IGPUGraphicsPipeline> m_visualizationPipeline;
+	//
+	smart_refctd_ptr<ISemaphore> m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
+	//
+	InputSystem::ChannelReader<IMouseEventChannel> mouse;
+	InputSystem::ChannelReader<IKeyboardEventChannel> keyboard;
+	// UI stuff
+	struct CInterface
+	{
+		void cameraToHome()
+		{
+			core::vectorSIMDf cameraPosition(-3.0f, 3.0f, 6.0f);
+			core::vectorSIMDf cameraTarget(0.f, 0.f, 6.f);
+			const static core::vectorSIMDf up(0.f, 1.f, 0.f);
+
+			camera.setPosition(cameraPosition);
+			camera.setTarget(cameraTarget);
+			camera.setBackupUpVector(up);
+
+			camera.recomputeViewMatrix();
+		}
+
+		void operator()()
+		{
+			ImGuiIO& io = ImGui::GetIO();
+
+			// TODO: why is this a lambda and not just an assignment in a scope ?
+			camera.setProjectionMatrix([&]()
+				{
+					matrix4SIMD projection;
+
+					if (isPerspective)
+						if (isLH)
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+						else
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+					else
+					{
+						float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x;
+
+						if (isLH)
+							projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar);
+						else
+							projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar);
+					}
+
+					return projection;
+				}());
+
+			ImGuizmo::SetOrthographic(false);
+			ImGuizmo::BeginFrame();
+
+			ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
+			ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing);
+
+			// create a window and insert the inspector
+			ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing);
+			ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing);
+			ImGui::Begin("Editor");
+
+			//if (ImGui::RadioButton("Full view", !transformParams.useWindow))
+			//	transformParams.useWindow = false;
+
+			//ImGui::SameLine();
+
+			//if (ImGui::RadioButton("Window", transformParams.useWindow))
+			//	transformParams.useWindow = true;
+
+			ImGui::Text("Camera");
+			bool viewDirty = false;
+
+			if (ImGui::RadioButton("LH", isLH))
+				isLH = true;
+
+			ImGui::SameLine();
+
+			if (ImGui::RadioButton("RH", !isLH))
+				isLH = false;
+
+			if (ImGui::RadioButton("Perspective", isPerspective))
+				isPerspective = true;
+
+			ImGui::SameLine();
+
+			if (ImGui::RadioButton("Orthographic", !isPerspective))
+				isPerspective = false;
+
+			ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate);
+			//ImGui::Checkbox("Enable camera movement", &move);
+			ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f);
+			ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f);
+
+			// ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case
+
+			if (isPerspective)
+				ImGui::SliderFloat("Fov", &fov, 20.f, 150.f);
+			else
+				ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20);
+
+			ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
+			ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
+
+			viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f);
+
+			if (viewDirty || firstFrame)
+			{
+				cameraToHome();
+			}
+			firstFrame = false;
+
+			ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
+			if (ImGuizmo::IsUsing())
+			{
+				ImGui::Text("Using gizmo");
+			}
+			else
+			{
+				ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : "");
+				ImGui::SameLine();
+				ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : "");
+				ImGui::SameLine();
+				ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : "");
+				ImGui::SameLine();
+				ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? "Over scale gizmo" : "");
+			}
+			ImGui::Separator();
+
+			/*
+			* ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout
+			* and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection
+
+			- VIEW:
+
+				ImGuizmo
+
+				|     X[0]          Y[0]          Z[0]         0.0f |
+				|     X[1]          Y[1]          Z[1]         0.0f |
+				|     X[2]          Y[2]          Z[2]         0.0f |
+				| -Dot(X, eye)  -Dot(Y, eye)  -Dot(Z, eye)     1.0f |
+
+				Nabla
+
+				|     X[0]         X[1]           X[2]     -Dot(X, eye)  |
+				|     Y[0]         Y[1]           Y[2]     -Dot(Y, eye)  |
+				|     Z[0]         Z[1]           Z[2]     -Dot(Z, eye)  |
+
+				<ImGuizmo View Matrix> = transpose(nbl::core::matrix4SIMD(<Nabla View Matrix>))
+
+			- PERSPECTIVE [PROJECTION CASE]:
+
+				ImGuizmo
+
+				|      (temp / temp2)                 (0.0)                       (0.0)                   (0.0)  |
+				|          (0.0)                  (temp / temp3)                  (0.0)                   (0.0)  |
+				| ((right + left) / temp2)   ((top + bottom) / temp3)    ((-zfar - znear) / temp4)       (-1.0f) |
+				|          (0.0)                      (0.0)               ((-temp * zfar) / temp4)        (0.0)  |
+
+				Nabla
+
+				|            w                        (0.0)                       (0.0)                   (0.0)               |
+				|          (0.0)                       -h                         (0.0)                   (0.0)               |
+				|          (0.0)                      (0.0)               (-zFar/(zFar-zNear))     (-zNear*zFar/(zFar-zNear)) |
+				|          (0.0)                      (0.0)                      (-1.0)                   (0.0)               |
+
+				<ImGuizmo Projection Matrix> = transpose(<Nabla Projection Matrix>)
+
+			*
+			* the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object,
+			* note it also modifies input view matrix but projection matrix is immutable
+			*/
+
+			if (ImGui::IsKeyPressed(ImGuiKey_Home))
+			{
+				cameraToHome();
+			}
+
+			if (ImGui::IsKeyPressed(ImGuiKey_End))
+			{
+				m_OBBModelMatrix = {
+					1.0f, 0.0f, 0.0f, 0.0f,
+					0.0f, 1.0f, 0.0f, 0.0f,
+					0.0f, 0.0f, 1.0f, 0.0f,
+					0.0f, 0.0f, 12.0f, 1.0f
+				};
+			}
+
+			static struct
+			{
+				float32_t4x4 view, projection, model;
+			} imguizmoM16InOut;
+
+			ImGuizmo::SetID(0u);
+
+			// TODO: camera will return hlsl::float32_tMxN 
+			auto view = *reinterpret_cast<const float32_t3x4*>(camera.getViewMatrix().pointer());
+			imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view));
+
+			// TODO: camera will return hlsl::float32_tMxN 
+			imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast<const float32_t4x4*>(camera.getProjectionMatrix().pointer()));
+			imguizmoM16InOut.model = m_OBBModelMatrix;
+
+			{
+				if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates
+					imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/	
+
+				transformParams.editTransformDecomposition = true;
+				transformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams);
+
+				// TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used
+				move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || transformReturnInfo.isGizmoWindowHovered) && (!transformReturnInfo.isGizmoBeingUsed);
+			}
+
+			// to Nabla + update camera & model matrices
+			// TODO: make it more nicely, extract:
+			// - Position by computing inverse of the view matrix and grabbing its translation
+			// - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position
+			// But then set the view matrix this way anyway, because up-vector may not be compatible
+			//const auto& view = camera.getViewMatrix();
+			//const_cast<core::matrix3x4SIMD&>(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok)
+			m_OBBModelMatrix = imguizmoM16InOut.model;
+
+			// object meta display
+			//{
+			//	ImGui::Begin("Object");
+			//	ImGui::Text("type: \"%s\"", objectName.data());
+			//	ImGui::End();
+			//}
+
+			// solid angle view window
+			{
+				ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing);
+				ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing);
+				static bool isOpen = true;
+				ImGui::Begin("Solid angle view", &isOpen, 0);
+
+				ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
+				ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize);
+				ImGui::End();
+			}
+
+			// view matrices editor
+			{
+				ImGui::Begin("Matrices");
+
+				auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true)
+					{
+						ImGui::Text(topText);
+						if (ImGui::BeginTable(tableName, columns))
+						{
+							for (int y = 0; y < rows; ++y)
+							{
+								ImGui::TableNextRow();
+								for (int x = 0; x < columns; ++x)
+								{
+									ImGui::TableSetColumnIndex(x);
+									ImGui::Text("%.3f", *(pointer + (y * columns) + x));
+								}
+							}
+							ImGui::EndTable();
+						}
+
+						if (withSeparator)
+							ImGui::Separator();
+					};
+
+				addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]);
+				addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, camera.getViewMatrix().pointer());
+				addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, camera.getProjectionMatrix().pointer(), false);
+
+				ImGui::End();
+			}
+
+			// Nabla Imgui backend MDI buffer info
+			// To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time,
+			// so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer.
+			{
+				auto* streaminingBuffer = imGUI->getStreamingBuffer();
+
+				const size_t total = streaminingBuffer->get_total_size();			// total memory range size for which allocation can be requested
+				const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size();		// max total free bloock memory size we can still allocate from total memory available
+				const size_t consumedMemory = total - freeSize;			// memory currently consumed by streaming buffer
+
+				float freePercentage = 100.0f * (float)(freeSize) / (float)total;
+				float allocatedPercentage = (float)(consumedMemory) / (float)total;
+
+				ImVec2 barSize = ImVec2(400, 30);
+				float windowPadding = 10.0f;
+				float verticalPadding = ImGui::GetStyle().FramePadding.y;
+
+				ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always);
+				ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar);
+
+				ImGui::Text("Total Allocated Size: %zu bytes", total);
+				ImGui::Text("In use: %zu bytes", consumedMemory);
+				ImGui::Text("Buffer Usage:");
+
+				ImGui::SetCursorPosX(windowPadding);
+
+				if (freePercentage > 70.0f)
+					ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f));  // Green
+				else if (freePercentage > 30.0f)
+					ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f));  // Yellow
+				else
+					ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f));  // Red
+
+				ImGui::ProgressBar(allocatedPercentage, barSize, "");
+
+				ImGui::PopStyleColor();
+
+				ImDrawList* drawList = ImGui::GetWindowDrawList();
+
+				ImVec2 progressBarPos = ImGui::GetItemRectMin();
+				ImVec2 progressBarSize = ImGui::GetItemRectSize();
+
+				const char* text = "%.2f%% free";
+				char textBuffer[64];
+				snprintf(textBuffer, sizeof(textBuffer), text, freePercentage);
+
+				ImVec2 textSize = ImGui::CalcTextSize(textBuffer);
+				ImVec2 textPos = ImVec2
+				(
+					progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f,
+					progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f
+				);
+
+				ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg);
+				drawList->AddRectFilled
+				(
+					ImVec2(textPos.x - 5, textPos.y - 2),
+					ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2),
+					ImGui::GetColorU32(bgColor)
+				);
+
+				ImGui::SetCursorScreenPos(textPos);
+				ImGui::Text("%s", textBuffer);
+
+				ImGui::Dummy(ImVec2(0.0f, verticalPadding));
+
+				ImGui::End();
+			}
+			ImGui::End();
+		}
+
+		smart_refctd_ptr<ext::imgui::UI> imGUI;
+
+		// descriptor set
+		smart_refctd_ptr<SubAllocatedDescriptorSet> subAllocDS;
+		enum E_RENDER_VIEWS : uint8_t
+		{
+			ERV_MAIN_VIEW,
+			ERV_SOLID_ANGLE_VIEW,
+			Count
+		};
+		SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = { SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value };
+		//
+		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+		// mutables
+		float32_t4x4 m_OBBModelMatrix{
+			1.0f, 0.0f, 0.0f, 0.0f,
+			0.0f, 1.0f, 0.0f, 0.0f,
+			0.0f, 0.0f, 1.0f, 0.0f,
+			0.0f, 0.0f, 12.0f, 1.0f
+		};
+
+		//std::string_view objectName;
+		TransformRequestParams transformParams;
+		TransformReturnInfo transformReturnInfo;
+
+		float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
+		float viewWidth = 10.f;
+		float camYAngle = 90.f / 180.f * 3.14159f;
+		float camXAngle = 0.f / 180.f * 3.14159f;
+		//uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed
+		bool isPerspective = true, isLH = true, flipGizmoY = true, move = true;
+		bool firstFrame = true;
+	} interface;
+};
+
+NBL_MAIN_FUNC(SolidAngleVisualizer)
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/pipeline.groovy b/72_SolidAngleVisualizer/pipeline.groovy
new file mode 100644
index 000000000..7b7c9702a
--- /dev/null
+++ b/72_SolidAngleVisualizer/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CUIBuilder extends IBuilder
+{
+	public CUIBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CUIBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/src/transform.cpp b/72_SolidAngleVisualizer/src/transform.cpp
new file mode 100644
index 000000000..e69de29bb
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 574925e97..fddafdac1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES)
 
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
+	add_subdirectory(72_SolidAngleVisualizer)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 4969227114e9fa0775d65ca6ddc960d381da92a3 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Sun, 16 Nov 2025 00:23:29 -0300
Subject: [PATCH 24/57] Work on cooperative binary search

---
 72_CooperativeBinarySearch/CMakeLists.txt     |  24 ++
 .../app_resources/binarySearch.comp.hlsl      |  20 ++
 .../app_resources/common.h                    |  19 ++
 .../app_resources/present.frag.hlsl           |  19 ++
 .../config.json.template                      |  28 +++
 .../include/nbl/this_example/common.hpp       |  11 +
 72_CooperativeBinarySearch/main.cpp           | 232 ++++++++++++++++++
 72_CooperativeBinarySearch/pipeline.groovy    |  50 ++++
 CMakeLists.txt                                |   1 +
 9 files changed, 404 insertions(+)
 create mode 100644 72_CooperativeBinarySearch/CMakeLists.txt
 create mode 100644 72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
 create mode 100644 72_CooperativeBinarySearch/app_resources/common.h
 create mode 100644 72_CooperativeBinarySearch/app_resources/present.frag.hlsl
 create mode 100644 72_CooperativeBinarySearch/config.json.template
 create mode 100644 72_CooperativeBinarySearch/include/nbl/this_example/common.hpp
 create mode 100644 72_CooperativeBinarySearch/main.cpp
 create mode 100644 72_CooperativeBinarySearch/pipeline.groovy

diff --git a/72_CooperativeBinarySearch/CMakeLists.txt b/72_CooperativeBinarySearch/CMakeLists.txt
new file mode 100644
index 000000000..b7e52875d
--- /dev/null
+++ b/72_CooperativeBinarySearch/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources") 
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
new file mode 100644
index 000000000..f44a35b21
--- /dev/null
+++ b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
@@ -0,0 +1,20 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma wave shader_stage(compute)
+
+#include "common.h"
+using namespace nbl::hlsl;
+
+[[vk::push_constant]] ConstantBuffer<PushConstants> Constants;
+[[vk::binding(0)]] StructuredBuffer<uint> Histogram;
+[[vk::binding(1)]] RWStructuredBuffer<uint> Output;
+
+static const uint32_t GroupsharedSize = 256;
+
+[numthreads(256, 1, 1)]
+void main(const uint3 thread : SV_DispatchThreadID, const uint3 groupThread : SV_GroupThreadID, const uint3 group : SV_GroupID)
+{
+
+}
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/app_resources/common.h b/72_CooperativeBinarySearch/app_resources/common.h
new file mode 100644
index 000000000..4a3cacaa4
--- /dev/null
+++ b/72_CooperativeBinarySearch/app_resources/common.h
@@ -0,0 +1,19 @@
+#ifndef _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
+#define _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+
+using namespace nbl::hlsl;
+namespace nbl {
+namespace hlsl {
+
+struct PushConstants
+{
+	uint32_t EntityCount;
+};
+
+};
+};
+
+#endif // _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
diff --git a/72_CooperativeBinarySearch/app_resources/present.frag.hlsl b/72_CooperativeBinarySearch/app_resources/present.frag.hlsl
new file mode 100644
index 000000000..22695657c
--- /dev/null
+++ b/72_CooperativeBinarySearch/app_resources/present.frag.hlsl
@@ -0,0 +1,19 @@
+// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#pragma wave shader_stage(fragment)
+
+// vertex shader is provided by the fullScreenTriangle extension
+#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+using namespace nbl::hlsl;
+using namespace ext::FullScreenTriangle;
+
+// binding 0 set 0
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
+[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
+
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
+{
+    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
+}
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/config.json.template b/72_CooperativeBinarySearch/config.json.template
new file mode 100644
index 000000000..24adf54fb
--- /dev/null
+++ b/72_CooperativeBinarySearch/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan",
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release",
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp b/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp
new file mode 100644
index 000000000..3745ca512
--- /dev/null
+++ b/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp
@@ -0,0 +1,11 @@
+#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
+
+#include "nbl/examples/examples.hpp"
+
+// example's own headers
+#include "nbl/ui/ICursorControl.h" // TODO: why not in nabla.h ?
+#include "nbl/ext/ImGui/ImGui.h"
+#include "imgui/imgui_internal.h"
+
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp
new file mode 100644
index 000000000..fda1a63c1
--- /dev/null
+++ b/72_CooperativeBinarySearch/main.cpp
@@ -0,0 +1,232 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/examples/examples.hpp"
+#include "nbl/system/IApplicationFramework.h"
+#include "app_resources/common.h"
+
+#include <iostream>
+#include <cstdio>
+#include <assert.h>
+
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+//using namespace glm;
+
+void cpu_tests();
+
+class CooperativeBinarySearch final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+    using device_base_t = application_templates::MonoDeviceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
+public:
+    CooperativeBinarySearch(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+        IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+    {
+        // Remember to call the base class initialization!
+        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+            return false;
+        if (!asset_base_t::onAppInitialized(std::move(system)))
+            return false;
+
+        m_queue = m_device->getQueue(0, 0);
+        m_commandPool = m_device->createCommandPool(m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        m_commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &m_cmdbuf,1 }, smart_refctd_ptr(m_logger));
+
+        smart_refctd_ptr<IShader> shader;
+        {
+            IAssetLoader::SAssetLoadParams lp = {};
+            lp.logger = m_logger.get();
+            lp.workingDirectory = ""; // virtual root
+            auto assetBundle = m_assetMgr->getAsset("app_resources/binarySearch.comp.hlsl", lp);
+            const auto assets = assetBundle.getContents();
+            if (assets.empty())
+                return logFail("Could not load shader!");
+
+            auto source = IAsset::castDown<IShader>(assets[0]);
+            // The down-cast should not fail!
+            assert(source);
+
+            // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
+            shader = m_device->compileShader({ source.get() });
+            if (!shader)
+                return logFail("Creation of a GPU Shader to from CPU Shader source failed!");
+        }
+
+		const uint32_t bindingCount = 2u;
+		IGPUDescriptorSetLayout::SBinding bindings[bindingCount] = {};
+		bindings[0].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(0)]] StructuredBuffer<uint> Histogram;
+		bindings[1].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(1)]] RWStructuredBuffer<uint> Output;
+        
+        for(int i = 0; i < bindingCount; ++i)
+        {
+            bindings[i].stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+            bindings[i].count = 1;
+            bindings[i].binding = i;
+        }
+		m_descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
+        {
+		    SPushConstantRange pcRange = {};
+		    pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+		    pcRange.offset = 0u;
+		    pcRange.size = 2 * sizeof(uint32_t);
+            auto layout = m_device->createPipelineLayout({ &pcRange,1 }, smart_refctd_ptr(m_descriptorSetLayout));
+            IGPUComputePipeline::SCreationParams params = {};
+            params.layout = layout.get();
+            params.shader.shader = shader.get();
+            params.shader.entryPoint = "main";
+            if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
+                return logFail("Failed to create compute pipeline!\n");
+        }
+
+        for (uint32_t i = 0; i < bindingCount; i++)
+        {
+            m_buffers[i] = m_device->createBuffer(IGPUBuffer::SCreationParams {
+                {.size = 500000, .usage = 
+                    IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | 
+                    IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT,
+                }
+            });
+
+            auto reqs = m_buffers[i]->getMemoryReqs();
+            reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits();
+            m_device->allocate(reqs, m_buffers[i].get());
+        }
+
+		smart_refctd_ptr<IDescriptorPool> descriptorPool = nullptr;
+		{
+            IDescriptorPool::SCreateInfo createInfo = {};
+            createInfo.maxSets = 1;
+            createInfo.maxDescriptorCount[static_cast<uint32_t>(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1;
+            descriptorPool = m_device->createDescriptorPool(std::move(createInfo));
+        }
+
+        m_descriptorSet = descriptorPool->createDescriptorSet(smart_refctd_ptr(m_descriptorSetLayout));
+
+        IGPUDescriptorSet::SDescriptorInfo descriptorInfos[bindingCount] = {};
+        IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSets[bindingCount] = {};
+        
+        for(int i = 0; i < bindingCount; ++i)
+        {
+            writeDescriptorSets[i].info = &descriptorInfos[i];
+            writeDescriptorSets[i].dstSet = m_descriptorSet.get();
+            writeDescriptorSets[i].binding = i;
+            writeDescriptorSets[i].count = bindings[i].count;
+
+			descriptorInfos[i].desc = m_buffers[i];
+			descriptorInfos[i].info.buffer.size = ~0ull;
+        }
+
+        m_device->updateDescriptorSets(bindingCount, writeDescriptorSets, 0u, nullptr);
+       
+        // In contrast to fences, we just need one semaphore to rule all dispatches
+        return true;
+    }
+
+    void onAppTerminated_impl() override
+    {
+        m_device->waitIdle();
+    }
+
+    void workLoopBody() override
+    {
+        cpu_tests();
+
+        constexpr auto StartedValue = 0;
+
+        smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(StartedValue);
+
+        m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+        m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+
+        IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t layoutBufferBarrier[1] = { {
+            .barrier = {
+                .dep = {
+                    .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
+                    .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT,
+                    .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+                    .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+                }
+            },
+			// whole buffer because we transferred the contents into it
+			.range = {.offset = 0,.size = m_buffers[1]->getCreationParams().size,.buffer = m_buffers[1]}
+        } };
+
+        const IGPUCommandBuffer::SPipelineBarrierDependencyInfo depInfo = { .bufBarriers = layoutBufferBarrier };
+        m_cmdbuf->pipelineBarrier(EDF_NONE, depInfo);
+        
+
+        const uint32_t pushConstants[2] = { 1920, 1080 };
+        const IGPUDescriptorSet* set = m_descriptorSet.get();
+        m_cmdbuf->bindComputePipeline(m_pipeline.get());
+        m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set);
+        m_cmdbuf->dispatch(240, 135, 1u);
+
+		layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT);
+        m_cmdbuf->pipelineBarrier(EDF_NONE,depInfo);
+        
+        m_cmdbuf->end();
+
+        {
+            constexpr auto FinishedValue = 69;
+            IQueue::SSubmitInfo submitInfos[1] = {};
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} };
+            submitInfos[0].commandBuffers = cmdbufs;
+            const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = progress.get(),.value = FinishedValue,.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} };
+            submitInfos[0].signalSemaphores = signals;
+            m_api->startCapture();
+            m_queue->submit(submitInfos);
+            m_api->endCapture();
+            const ISemaphore::SWaitInfo waitInfos[] = { {
+                    .semaphore = progress.get(),
+                    .value = FinishedValue
+                } };
+            m_device->blockForSemaphores(waitInfos);
+        }
+
+		auto mem = m_buffers[1]->getBoundMemory();
+		assert(mem.memory->isMappable());
+		auto* ptr = mem.memory->map({ .offset = 0, .length = mem.memory->getAllocationSize() });
+        printf("readback ptr %p\n", ptr);
+
+        m_keepRunning = false;
+    }
+
+    bool keepRunning() override
+    {
+        return m_keepRunning;
+    }
+
+
+private:
+    smart_refctd_ptr<IGPUComputePipeline> m_pipeline = nullptr;
+    smart_refctd_ptr<IGPUDescriptorSetLayout> m_descriptorSetLayout;
+    smart_refctd_ptr<IGPUDescriptorSet> m_descriptorSet;
+
+    smart_refctd_ptr<IGPUBuffer> m_buffers[2];
+    smart_refctd_ptr<IGPUCommandBuffer> m_cmdbuf = nullptr;
+    IQueue* m_queue;
+    smart_refctd_ptr<IGPUCommandPool> m_commandPool;
+    uint64_t m_iteration = 0;
+    constexpr static inline uint64_t MaxIterations = 200;
+
+    bool m_keepRunning = true;
+};
+
+NBL_MAIN_FUNC(CooperativeBinarySearch)
+
+void cpu_tests()
+{
+}
diff --git a/72_CooperativeBinarySearch/pipeline.groovy b/72_CooperativeBinarySearch/pipeline.groovy
new file mode 100644
index 000000000..eb20d0c5a
--- /dev/null
+++ b/72_CooperativeBinarySearch/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CComputeShaderPathTracerBuilder extends IBuilder
+{
+	public CComputeShaderPathTracerBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CComputeShaderPathTracerBuilder(_agent, _info)
+}
+
+return this
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8ce94f93..39f3275ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,6 +87,7 @@ if(NBL_BUILD_EXAMPLES)
 
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
+	add_subdirectory(72_CooperativeBinarySearch)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From e54642803cd47e47adfe9a20318ca8c634c86643 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Wed, 3 Dec 2025 18:32:45 -0300
Subject: [PATCH 25/57] Patch things for cooperative binary search test

---
 .../app_resources/binarySearch.comp.hlsl      |  103 +-
 72_CooperativeBinarySearch/main.cpp           |   28 +-
 72_CooperativeBinarySearch/testCaseData.h     | 1192 +++++++++++++++++
 3 files changed, 1316 insertions(+), 7 deletions(-)
 create mode 100644 72_CooperativeBinarySearch/testCaseData.h

diff --git a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
index f44a35b21..05c0d8464 100644
--- a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
+++ b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
@@ -5,16 +5,115 @@
 #pragma wave shader_stage(compute)
 
 #include "common.h"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl"
 using namespace nbl::hlsl;
 
-[[vk::push_constant]] ConstantBuffer<PushConstants> Constants;
+[[vk::push_constant]] PushConstants Constants;
 [[vk::binding(0)]] StructuredBuffer<uint> Histogram;
 [[vk::binding(1)]] RWStructuredBuffer<uint> Output;
 
 static const uint32_t GroupsharedSize = 256;
 
+uint getNextPowerOfTwo(uint number) {
+	return 2 << firstbithigh(number - 1);
+}
+
+uint getLaneWithFirstBitSet(bool condition) {
+	uint4 ballot = WaveActiveBallot(condition);
+	if (all(ballot == 0)) {
+		return WaveGetLaneCount();
+	}
+	return nbl::hlsl::glsl::subgroupBallotFindLSB(ballot);
+}
+
+// findValue must be the same across the entire wave
+// Could use something like WaveReadFirstLane to be fully sure
+uint binarySearchLowerBoundFindValue(uint findValue, StructuredBuffer<uint> searchBuffer, uint searchBufferSize) {
+	uint lane = WaveGetLaneIndex();
+	
+	uint left = 0;
+	uint right = searchBufferSize - 1;
+
+	uint32_t range = getNextPowerOfTwo(right - left);
+	// do pivots as long as we can't coalesced load
+	while (range > WaveGetLaneCount())
+	{
+		// there must be at least 1 gap between subsequent pivots 
+		const uint32_t step = range / WaveGetLaneCount(); 
+		const uint32_t halfStep = step >> 1;
+		const uint32_t pivotOffset = lane * step+halfStep;
+		const uint32_t pivotIndex = left + pivotOffset;
+
+		uint4 notGreaterPivots = WaveActiveBallot(pivotIndex < right && !(findValue < searchBuffer[pivotIndex]));
+		uint partition = nbl::hlsl::glsl::subgroupBallotBitCount(notGreaterPivots);
+		// only move left if needed
+		if (partition != 0)
+			left += partition * step - halfStep;
+		// if we go into final half partition, the range becomes less too
+		range = partition != WaveGetLaneCount() ? step : halfStep;
+	}
+
+	uint threadSearchIndex = left + lane;
+	bool laneValid = threadSearchIndex < searchBufferSize;
+	uint histAtIndex = laneValid ? searchBuffer[threadSearchIndex] : -1;
+	uint firstLaneGreaterThan = getLaneWithFirstBitSet(histAtIndex > findValue);
+
+	return left + firstLaneGreaterThan - 1;
+}
+
+groupshared uint shared_groupSearchBufferMinIndex;
+groupshared uint shared_groupSearchBufferMaxIndex;
+groupshared uint shared_groupSearchValues[GroupsharedSize];
+
+// Binary search using the entire workgroup, making it log32 or log64 (every iteration, the possible set of 
+// values is divided by the number of lanes in a wave)
+uint binarySearchLowerBoundCooperative(uint groupIndex, uint groupThread, StructuredBuffer<uint> searchBuffer, uint searchBufferSize) {
+	uint minSearchValue = groupIndex.x * GroupsharedSize;
+	uint maxSearchValue = ((groupIndex.x + 1) * GroupsharedSize) - 1;
+
+	// On each workgroup, two subgroups do the search
+	// - One searches for the minimum, the other searches for the maximum
+	// - Store the minimum and maximum on groupshared memory, then do a barrier
+	uint wave = groupThread / WaveGetLaneCount();
+	if (wave < 2) {
+		uint search = wave == 0 ? minSearchValue : maxSearchValue;
+		uint searchResult = binarySearchLowerBoundFindValue(search, searchBuffer, searchBufferSize);
+		if (WaveIsFirstLane()) {
+			if (wave == 0) shared_groupSearchBufferMinIndex = searchResult;
+			else shared_groupSearchBufferMaxIndex = searchResult;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	// Since every instance has at least one triangle, we know that having workgroup values 
+	// for each value in the range of minimum to maximum will suffice.
+
+	// Write every value in the range to groupshared memory and barrier.
+	uint idx = shared_groupSearchBufferMinIndex + groupThread.x;
+	if (idx <= shared_groupSearchBufferMaxIndex) {
+		shared_groupSearchValues[groupThread.x] = searchBuffer[idx];
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	uint maxValueIndex = shared_groupSearchBufferMaxIndex - shared_groupSearchBufferMinIndex;
+
+	uint searchValue = minSearchValue + groupThread;
+	uint currentSearchValueIndex = 0;
+	uint laneValue = shared_groupSearchBufferMaxIndex;
+	while (currentSearchValueIndex <= maxValueIndex) {
+		uint curValue = shared_groupSearchValues[currentSearchValueIndex];
+		if (curValue > searchValue) {
+			laneValue = shared_groupSearchBufferMinIndex + currentSearchValueIndex - 1;
+			break;
+		}
+		currentSearchValueIndex ++;
+	}
+
+	return laneValue;
+}
+
 [numthreads(256, 1, 1)]
 void main(const uint3 thread : SV_DispatchThreadID, const uint3 groupThread : SV_GroupThreadID, const uint3 group : SV_GroupID)
 {
-
+    Output[thread.x] = binarySearchLowerBoundCooperative(group.x, groupThread.x, Histogram, Constants.EntityCount);
 }
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp
index fda1a63c1..e2611dea7 100644
--- a/72_CooperativeBinarySearch/main.cpp
+++ b/72_CooperativeBinarySearch/main.cpp
@@ -22,6 +22,11 @@ using namespace nbl::examples;
 
 //using namespace glm;
 
+static constexpr uint32_t TestCaseIndices[] = {
+#include "testCaseData.h"
+};
+
+
 void cpu_tests();
 
 class CooperativeBinarySearch final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
@@ -101,14 +106,19 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
 
             auto reqs = m_buffers[i]->getMemoryReqs();
             reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits();
-            m_device->allocate(reqs, m_buffers[i].get());
+
+            m_allocations[i] = m_device->allocate(reqs, m_buffers[i].get());
+            
+            auto allocationType = i == 0 ? IDeviceMemoryAllocation::EMCAF_WRITE : IDeviceMemoryAllocation::EMCAF_READ;
+            auto mapResult = m_allocations[i].memory->map({ 0ull,m_allocations[i].memory->getAllocationSize() }, allocationType);
+            assert(mapResult);
         }
 
 		smart_refctd_ptr<IDescriptorPool> descriptorPool = nullptr;
 		{
             IDescriptorPool::SCreateInfo createInfo = {};
             createInfo.maxSets = 1;
-            createInfo.maxDescriptorCount[static_cast<uint32_t>(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1;
+            createInfo.maxDescriptorCount[static_cast<uint32_t>(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = bindingCount;
             descriptorPool = m_device->createDescriptorPool(std::move(createInfo));
         }
 
@@ -130,6 +140,14 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
 
         m_device->updateDescriptorSets(bindingCount, writeDescriptorSets, 0u, nullptr);
        
+        // Write test data to the m_buffers[0]
+        auto outPtr = m_allocations[0].memory->getMappedPointer();
+        assert(outPtr);
+        memcpy(
+            reinterpret_cast<void*>(outPtr), 
+            reinterpret_cast<const void*>(&TestCaseIndices[0]), 
+            sizeof(TestCaseIndices));
+
         // In contrast to fences, we just need one semaphore to rule all dispatches
         return true;
     }
@@ -196,9 +214,8 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
             m_device->blockForSemaphores(waitInfos);
         }
 
-		auto mem = m_buffers[1]->getBoundMemory();
-		assert(mem.memory->isMappable());
-		auto* ptr = mem.memory->map({ .offset = 0, .length = mem.memory->getAllocationSize() });
+        auto ptr = m_allocations[1].memory->getMappedPointer();
+        assert(ptr);
         printf("readback ptr %p\n", ptr);
 
         m_keepRunning = false;
@@ -216,6 +233,7 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
     smart_refctd_ptr<IGPUDescriptorSet> m_descriptorSet;
 
     smart_refctd_ptr<IGPUBuffer> m_buffers[2];
+	nbl::video::IDeviceMemoryAllocator::SAllocation m_allocations[2] = {};
     smart_refctd_ptr<IGPUCommandBuffer> m_cmdbuf = nullptr;
     IQueue* m_queue;
     smart_refctd_ptr<IGPUCommandPool> m_commandPool;
diff --git a/72_CooperativeBinarySearch/testCaseData.h b/72_CooperativeBinarySearch/testCaseData.h
new file mode 100644
index 000000000..16153780e
--- /dev/null
+++ b/72_CooperativeBinarySearch/testCaseData.h
@@ -0,0 +1,1192 @@
+0,
+298,
+554,
+582,
+912,
+1074,
+1076,
+1078,
+1170,
+1188,
+2140,
+2414,
+2736,
+2738,
+3980,
+4800,
+5898,
+5900,
+6936,
+8106,
+8152,
+8650,
+8844,
+8930,
+9504,
+10244,
+10826,
+10828,
+11126,
+11430,
+12206,
+13764,
+14010,
+15302,
+15624,
+15656,
+16414,
+16494,
+17368,
+17432,
+18312,
+18948,
+19376,
+19818,
+20146,
+20604,
+21240,
+22446,
+23482,
+24914,
+25042,
+25538,
+26764,
+27564,
+27566,
+28472,
+29450,
+30202,
+31474,
+32160,
+32676,
+33792,
+33794,
+34704,
+36540,
+37456,
+37950,
+38364,
+39274,
+40442,
+40518,
+41412,
+41590,
+41950,
+42022,
+42714,
+43464,
+43790,
+43792,
+44876,
+44878,
+46188,
+46572,
+47352,
+47650,
+48242,
+49856,
+49858,
+50506,
+50968,
+50970,
+51152,
+51154,
+52870,
+52884,
+53332,
+53334,
+53904,
+53964,
+53966,
+53968,
+53970,
+53972,
+53974,
+53976,
+53978,
+53980,
+54514,
+54516,
+54518,
+54520,
+54762,
+55866,
+56462,
+56478,
+56480,
+56482,
+57510,
+57568,
+57570,
+57572,
+57846,
+57848,
+58760,
+59408,
+59438,
+60198,
+60200,
+60202,
+60204,
+60284,
+60938,
+61274,
+61720,
+62296,
+63116,
+63378,
+63380,
+63382,
+63384,
+63386,
+63388,
+63904,
+64572,
+65142,
+65144,
+65146,
+65554,
+65738,
+66052,
+67016,
+67424,
+67566,
+68270,
+68272,
+68610,
+69240,
+69870,
+70988,
+72622,
+73258,
+73260,
+73580,
+74524,
+74880,
+74958,
+74960,
+74962,
+75114,
+75116,
+75622,
+77144,
+77798,
+77800,
+78314,
+79566,
+79568,
+79570,
+79572,
+79850,
+79852,
+81576,
+81684,
+81686,
+82492,
+82494,
+82496,
+82498,
+83990,
+84860,
+84988,
+84990,
+85138,
+85772,
+86120,
+86122,
+86564,
+87402,
+87404,
+87602,
+88676,
+88714,
+88780,
+89560,
+89732,
+90786,
+91128,
+91130,
+91272,
+91522,
+91804,
+92588,
+92590,
+92834,
+93268,
+93736,
+94448,
+94704,
+94706,
+95074,
+95076,
+96706,
+97040,
+97770,
+98000,
+98676,
+99968,
+100074,
+100318,
+100602,
+100914,
+101020,
+101872,
+101878,
+103078,
+104246,
+104266,
+105436,
+106332,
+106954,
+107856,
+108954,
+110320,
+110780,
+111588,
+111882,
+112502,
+112676,
+113496,
+114070,
+115204,
+115422,
+115424,
+115858,
+116420,
+117426,
+118504,
+118870,
+119296,
+119618,
+119650,
+120408,
+120488,
+121362,
+121426,
+122306,
+122942,
+123370,
+123812,
+124140,
+124598,
+125234,
+126440,
+127476,
+128908,
+129036,
+129532,
+130758,
+131558,
+131560,
+132466,
+133444,
+134196,
+135468,
+136154,
+136670,
+137786,
+137788,
+138698,
+140534,
+140832,
+141608,
+142422,
+143220,
+143468,
+143714,
+144504,
+145078,
+145670,
+146224,
+146874,
+147726,
+148692,
+149536,
+151032,
+151126,
+153382,
+154128,
+155190,
+155212,
+156324,
+156484,
+156526,
+157026,
+158242,
+158446,
+158448,
+158594,
+159256,
+160350,
+160444,
+161040,
+161624,
+162418,
+162524,
+162768,
+163052,
+163364,
+163470,
+164322,
+164328,
+165528,
+166696,
+166716,
+167886,
+168782,
+169404,
+170306,
+171404,
+172770,
+173230,
+174038,
+174332,
+174952,
+175126,
+175946,
+176520,
+177654,
+177872,
+177874,
+178308,
+178870,
+179876,
+180954,
+181320,
+181746,
+182160,
+183070,
+184238,
+184314,
+185208,
+185386,
+185746,
+185818,
+186510,
+187260,
+187586,
+187588,
+188672,
+188674,
+189984,
+190368,
+191148,
+191446,
+192038,
+193652,
+193654,
+194302,
+194764,
+194766,
+194948,
+194950,
+196666,
+196680,
+197128,
+197130,
+197700,
+198048,
+198824,
+199638,
+200436,
+200684,
+200930,
+201720,
+202294,
+202886,
+203440,
+204090,
+204942,
+205908,
+206752,
+208248,
+208342,
+210598,
+211344,
+212406,
+212428,
+213540,
+213700,
+213742,
+214242,
+215458,
+215662,
+215664,
+215810,
+216472,
+217566,
+217660,
+218256,
+218316,
+218318,
+218320,
+218322,
+218324,
+218326,
+218328,
+218330,
+218332,
+218866,
+218868,
+218870,
+218872,
+219114,
+220218,
+220814,
+220830,
+220832,
+220834,
+221862,
+221920,
+221922,
+221924,
+222198,
+222200,
+223112,
+223760,
+223790,
+224550,
+224552,
+224554,
+224556,
+225140,
+225794,
+226130,
+226576,
+227152,
+227972,
+228234,
+228236,
+228238,
+228240,
+228242,
+228244,
+228760,
+229428,
+229998,
+230000,
+230002,
+230410,
+230594,
+230908,
+231872,
+232280,
+232422,
+233126,
+233128,
+233466,
+234096,
+234726,
+235844,
+237478,
+238114,
+238116,
+238512,
+239256,
+239812,
+240660,
+241950,
+243244,
+243366,
+244346,
+244412,
+244710,
+245202,
+246504,
+246728,
+246988,
+247592,
+248630,
+249562,
+250962,
+251964,
+252562,
+253140,
+253412,
+254672,
+255276,
+256084,
+256160,
+256378,
+257104,
+257602,
+257776,
+258240,
+258556,
+258614,
+259208,
+260496,
+261202,
+261398,
+262284,
+262610,
+262976,
+263578,
+264622,
+265558,
+266692,
+266756,
+268110,
+268994,
+269158,
+269718,
+270388,
+270768,
+271098,
+271786,
+272398,
+272996,
+273140,
+273612,
+274226,
+274660,
+275070,
+275416,
+275634,
+275680,
+276088,
+276408,
+276410,
+276852,
+277690,
+277692,
+277890,
+278964,
+279002,
+279068,
+279848,
+280020,
+281074,
+281416,
+281418,
+281560,
+281810,
+282092,
+282876,
+282878,
+283122,
+283556,
+284024,
+284736,
+284992,
+284994,
+285362,
+285364,
+286994,
+287328,
+288058,
+288288,
+288964,
+289708,
+289746,
+290266,
+291136,
+292152,
+292740,
+292834,
+293708,
+293768,
+293936,
+294846,
+295028,
+295040,
+295130,
+295372,
+296154,
+296736,
+297250,
+297606,
+298068,
+298310,
+299420,
+300362,
+301176,
+301502,
+301878,
+302702,
+303576,
+303896,
+305170,
+305928,
+306070,
+306150,
+307094,
+307450,
+307528,
+307530,
+307532,
+307684,
+307686,
+308192,
+309714,
+310368,
+310370,
+310884,
+312136,
+312138,
+312140,
+312142,
+312420,
+312422,
+314146,
+314254,
+314256,
+315062,
+315064,
+315066,
+315068,
+316560,
+317430,
+317558,
+317560,
+317708,
+318342,
+319182,
+319992,
+320612,
+320956,
+321068,
+321076,
+322784,
+322914,
+323106,
+324036,
+324708,
+326092,
+326994,
+327332,
+328080,
+328444,
+329022,
+329256,
+330454,
+331304,
+331610,
+332432,
+332440,
+333298,
+334300,
+334478,
+334622,
+335370,
+335818,
+336456,
+336618,
+337930,
+338932,
+339158,
+339258,
+339746,
+340226,
+340254,
+340256,
+340988,
+341638,
+342674,
+343168,
+343440,
+344024,
+344026,
+344106,
+345118,
+346124,
+347350,
+348560,
+348878,
+349066,
+350192,
+350840,
+351388,
+353610,
+354562,
+355208,
+356084,
+356966,
+358222,
+359304,
+359470,
+360054,
+360710,
+360920,
+361896,
+362930,
+362962,
+363128,
+363234,
+363272,
+363284,
+363456,
+363732,
+364418,
+364926,
+365096,
+365170,
+365920,
+366796,
+367838,
+368232,
+368940,
+369508,
+369530,
+370886,
+371156,
+371348,
+372384,
+372680,
+372690,
+373252,
+373676,
+374168,
+374424,
+374452,
+374782,
+374944,
+374946,
+374948,
+375040,
+375058,
+376010,
+376284,
+376606,
+376608,
+377850,
+378670,
+379768,
+379770,
+380806,
+381976,
+382022,
+382520,
+382714,
+382800,
+383374,
+384114,
+384696,
+384698,
+384996,
+385300,
+386076,
+387634,
+387880,
+388796,
+389290,
+389302,
+389314,
+389338,
+389406,
+389434,
+389470,
+389840,
+389952,
+390908,
+391076,
+391188,
+392118,
+392458,
+392472,
+392622,
+392766,
+393448,
+394586,
+394816,
+394824,
+395486,
+396218,
+396880,
+396910,
+397066,
+397076,
+397124,
+397678,
+398050,
+399160,
+400080,
+401696,
+401762,
+402400,
+402500,
+402512,
+403152,
+404038,
+404444,
+404648,
+404740,
+405322,
+406252,
+407076,
+408252,
+408634,
+409354,
+410112,
+411138,
+411672,
+411880,
+412232,
+412926,
+412956,
+413864,
+414624,
+415770,
+415978,
+417234,
+417256,
+417264,
+418562,
+418812,
+418824,
+418836,
+418860,
+418928,
+418956,
+418992,
+419362,
+419474,
+420430,
+420598,
+420710,
+421640,
+421980,
+421994,
+422144,
+422288,
+422970,
+424108,
+424338,
+424346,
+425008,
+425740,
+426402,
+426432,
+426588,
+426598,
+426646,
+427200,
+427572,
+428682,
+429602,
+430346,
+430412,
+431050,
+431150,
+431162,
+431802,
+432688,
+433094,
+433298,
+433390,
+433972,
+434902,
+435726,
+436902,
+437284,
+438004,
+438762,
+439788,
+440322,
+440530,
+440882,
+441576,
+441606,
+442514,
+443274,
+444420,
+444628,
+445884,
+445906,
+445914,
+447212,
+447462,
+448464,
+448690,
+448790,
+449278,
+449758,
+449786,
+449788,
+450520,
+451170,
+452206,
+452700,
+452972,
+453556,
+453558,
+453638,
+454650,
+455656,
+456882,
+458092,
+458410,
+458598,
+459724,
+460372,
+460920,
+463142,
+464094,
+464740,
+465616,
+466498,
+467754,
+468836,
+469002,
+469586,
+470180,
+471468,
+472174,
+472370,
+473256,
+473582,
+473948,
+474550,
+475594,
+476530,
+477664,
+477728,
+479082,
+479966,
+480130,
+480690,
+481360,
+481740,
+482070,
+482758,
+483370,
+483968,
+484112,
+484584,
+485198,
+485632,
+486042,
+486388,
+486606,
+486652,
+487060,
+488676,
+489420,
+489976,
+490824,
+492114,
+493408,
+493530,
+494510,
+494576,
+494874,
+495366,
+496668,
+496892,
+497152,
+497756,
+498794,
+499726,
+501126,
+502128,
+502726,
+503304,
+503576,
+504836,
+505440,
+506248,
+506324,
+506542,
+507268,
+507766,
+507940,
+508404,
+508720,
+509514,
+510170,
+510380,
+511356,
+512390,
+512422,
+512588,
+512694,
+512732,
+512744,
+512916,
+513192,
+513878,
+514386,
+514556,
+514630,
+515380,
+516256,
+517298,
+517692,
+518400,
+518968,
+518990,
+520346,
+520616,
+520808,
+521844,
+522140,
+522150,
+522712,
+523136,
+523628,
+524468,
+525278,
+525898,
+526242,
+526354,
+526362,
+528070,
+528200,
+528392,
+529322,
+529994,
+531378,
+532280,
+532618,
+533366,
+533730,
+534308,
+534542,
+535740,
+536590,
+536896,
+537718,
+537726,
+538584,
+539586,
+539764,
+539908,
+540656,
+541104,
+541742,
+541904,
+543216,
+543612,
+543650,
+544170,
+545040,
+546056,
+546644,
+546738,
+547612,
+547672,
+547840,
+548750,
+548932,
+548944,
+549034,
+549276,
+550058,
+550640,
+551154,
+551510,
+551972,
+552214,
+553324,
+554266,
+555080,
+555406,
+555782,
+556606,
+557480,
+557800,
+559074,
+559832,
+559974,
+550468,
+551276,
+552568,
+552866,
+553798,
+554120,
+554294,
+555554,
+556448,
+556874,
+557328,
+557680,
+558532,
+559844,
+560774,
+561050,
+561458,
+562684,
+563910,
+564026,
+564542,
+565294,
+565434,
+566278,
+567580,
+568006,
+568328,
+569626,
+570350,
+570998,
+572812,
+573008,
+573500,
+573828,
+573840,
+573842,
+574798,
+576066,
+576774,
+577182,
+577184,
+577522,
+577524,
+578734,
+579854,
+579856,
+581128,
+581278,
+582296,
+583496,
+583944,
+584160,
+584844,
+584954,
+584968,
+585486,
+586592,
+586594,
+587158,
+587320,
+588006,
+589012,
+590302,
+590366,
+590444,
+590944,
+581786,
+582234,
+582920,
+582922,
+564780,
+565486,
+565684,
+566570,
+566896,
+567262,
+567864,
+568958,
+570268,
+570844,
+572014,
+573368,
+574252,
+574416,
+574976,
+575646,
+576026,
+576356,
+577044,
+577046,
+577644,
+577788,
+578260,
+578874,
+579308,
+579718,
+580288,
+580942,
+581534,
+581536,
+576350,
+576352
\ No newline at end of file

From 5886b3024d761b087232da0e52aef4877481ef36 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Wed, 3 Dec 2025 20:51:18 -0300
Subject: [PATCH 26/57] Fix test

---
 72_CooperativeBinarySearch/main.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp
index e2611dea7..828adf34f 100644
--- a/72_CooperativeBinarySearch/main.cpp
+++ b/72_CooperativeBinarySearch/main.cpp
@@ -85,7 +85,7 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
 		    SPushConstantRange pcRange = {};
 		    pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
 		    pcRange.offset = 0u;
-		    pcRange.size = 2 * sizeof(uint32_t);
+		    pcRange.size = sizeof(nbl::hlsl::PushConstants);
             auto layout = m_device->createPipelineLayout({ &pcRange,1 }, smart_refctd_ptr(m_descriptorSetLayout));
             IGPUComputePipeline::SCreationParams params = {};
             params.layout = layout.get();
@@ -186,11 +186,18 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
         m_cmdbuf->pipelineBarrier(EDF_NONE, depInfo);
         
 
-        const uint32_t pushConstants[2] = { 1920, 1080 };
         const IGPUDescriptorSet* set = m_descriptorSet.get();
+        const uint32_t numIndices = sizeof(TestCaseIndices) / sizeof(TestCaseIndices[0]);
+        const uint32_t lastValue = TestCaseIndices[numIndices - 1];
+        const uint32_t totalValues = lastValue + 100;
+        nbl::hlsl::PushConstants coopBinarySearchPC = {
+            .EntityCount = numIndices,
+        };
+
         m_cmdbuf->bindComputePipeline(m_pipeline.get());
         m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set);
-        m_cmdbuf->dispatch(240, 135, 1u);
+        m_cmdbuf->pushConstants(m_pipeline->getLayout(), nbl::hlsl::ShaderStage::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::PushConstants), &coopBinarySearchPC);
+        m_cmdbuf->dispatch((totalValues + 255u) / 256u, 1u, 1u);
 
 		layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT);
         m_cmdbuf->pipelineBarrier(EDF_NONE,depInfo);
@@ -216,7 +223,14 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
 
         auto ptr = m_allocations[1].memory->getMappedPointer();
         assert(ptr);
-        printf("readback ptr %p\n", ptr);
+
+        uint32_t* valuesPtr = reinterpret_cast<uint32_t*>(ptr);
+        for (uint32_t i = 0; i < totalValues; i++) {
+            uint32_t value = valuesPtr[i];
+            const uint32_t* binarySearchResult = std::upper_bound(TestCaseIndices, TestCaseIndices + numIndices, i);
+            uint32_t lowerBoundIndex = std::distance(TestCaseIndices, binarySearchResult) - 1;
+            assert(value == lowerBoundIndex);
+        }
 
         m_keepRunning = false;
     }

From 795066393d9b7918991800b4dda5b482cc9085b3 Mon Sep 17 00:00:00 2001
From: deprilula28 <deprilula28@gmail.com>
Date: Wed, 3 Dec 2025 20:58:08 -0300
Subject: [PATCH 27/57] Remove unecessary leftover file

---
 72_CooperativeBinarySearch/pipeline.groovy | 50 ----------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 72_CooperativeBinarySearch/pipeline.groovy

diff --git a/72_CooperativeBinarySearch/pipeline.groovy b/72_CooperativeBinarySearch/pipeline.groovy
deleted file mode 100644
index eb20d0c5a..000000000
--- a/72_CooperativeBinarySearch/pipeline.groovy
+++ /dev/null
@@ -1,50 +0,0 @@
-import org.DevshGraphicsProgramming.Agent
-import org.DevshGraphicsProgramming.BuilderInfo
-import org.DevshGraphicsProgramming.IBuilder
-
-class CComputeShaderPathTracerBuilder extends IBuilder
-{
-	public CComputeShaderPathTracerBuilder(Agent _agent, _info)
-	{
-		super(_agent, _info)
-	}
-	
-	@Override
-	public boolean prepare(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-  	public boolean build(Map axisMapping)
-	{
-		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
-		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
-		
-		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
-		def nameOfConfig = getNameOfConfig(config)
-		
-		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
-		
-		return true
-	}
-	
-	@Override
-  	public boolean test(Map axisMapping)
-	{
-		return true
-	}
-	
-	@Override
-	public boolean install(Map axisMapping)
-	{
-		return true
-	}
-}
-
-def create(Agent _agent, _info)
-{
-	return new CComputeShaderPathTracerBuilder(_agent, _info)
-}
-
-return this

From eb7d4fe788fb5e88b8b475c979586e050e202b00 Mon Sep 17 00:00:00 2001
From: Przemog1 <minikers21@gmail.com>
Date: Fri, 5 Dec 2025 12:58:59 +0100
Subject: [PATCH 28/57] Removed forced -O3 optimization

---
 05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt | 1 -
 07_StagingAndMultipleQueues/CMakeLists.txt           | 1 -
 10_CountingSort/CMakeLists.txt                       | 1 -
 11_FFT/CMakeLists.txt                                | 1 -
 24_ColorSpaceTest/CMakeLists.txt                     | 1 -
 62_CAD/CMakeLists.txt                                | 1 -
 64_EmulatedFloatTest/CMakeLists.txt                  | 1 -
 67_RayQueryGeometry/CMakeLists.txt                   | 1 -
 70_FLIPFluids/CMakeLists.txt                         | 1 -
 71_RayTracingPipeline/CMakeLists.txt                 | 1 -
 10 files changed, 10 deletions(-)

diff --git a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
index a342ac3d5..55ebaf41d 100644
--- a/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
+++ b/05_StreamingAndBufferDeviceAddressApp/CMakeLists.txt
@@ -44,7 +44,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/07_StagingAndMultipleQueues/CMakeLists.txt b/07_StagingAndMultipleQueues/CMakeLists.txt
index 19515454d..fe063be7c 100644
--- a/07_StagingAndMultipleQueues/CMakeLists.txt
+++ b/07_StagingAndMultipleQueues/CMakeLists.txt
@@ -44,7 +44,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/10_CountingSort/CMakeLists.txt b/10_CountingSort/CMakeLists.txt
index 3acc73022..14bde428d 100644
--- a/10_CountingSort/CMakeLists.txt
+++ b/10_CountingSort/CMakeLists.txt
@@ -66,7 +66,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/11_FFT/CMakeLists.txt b/11_FFT/CMakeLists.txt
index 9a2ee5a21..ca9fe8428 100644
--- a/11_FFT/CMakeLists.txt
+++ b/11_FFT/CMakeLists.txt
@@ -44,7 +44,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/24_ColorSpaceTest/CMakeLists.txt b/24_ColorSpaceTest/CMakeLists.txt
index a2c5e752b..a2feb2cb8 100644
--- a/24_ColorSpaceTest/CMakeLists.txt
+++ b/24_ColorSpaceTest/CMakeLists.txt
@@ -55,7 +55,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/62_CAD/CMakeLists.txt b/62_CAD/CMakeLists.txt
index dd181ff87..0928d3b61 100644
--- a/62_CAD/CMakeLists.txt
+++ b/62_CAD/CMakeLists.txt
@@ -107,7 +107,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/64_EmulatedFloatTest/CMakeLists.txt b/64_EmulatedFloatTest/CMakeLists.txt
index 6470cdc74..af46da896 100644
--- a/64_EmulatedFloatTest/CMakeLists.txt
+++ b/64_EmulatedFloatTest/CMakeLists.txt
@@ -56,7 +56,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/67_RayQueryGeometry/CMakeLists.txt b/67_RayQueryGeometry/CMakeLists.txt
index 503c5a31a..1fdfc03ce 100644
--- a/67_RayQueryGeometry/CMakeLists.txt
+++ b/67_RayQueryGeometry/CMakeLists.txt
@@ -48,7 +48,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/70_FLIPFluids/CMakeLists.txt b/70_FLIPFluids/CMakeLists.txt
index 19a561f78..842492167 100644
--- a/70_FLIPFluids/CMakeLists.txt
+++ b/70_FLIPFluids/CMakeLists.txt
@@ -95,7 +95,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 
diff --git a/71_RayTracingPipeline/CMakeLists.txt b/71_RayTracingPipeline/CMakeLists.txt
index 5c853040e..d7bb13671 100644
--- a/71_RayTracingPipeline/CMakeLists.txt
+++ b/71_RayTracingPipeline/CMakeLists.txt
@@ -110,7 +110,6 @@ string(CONFIGURE "${JSON}" JSON)
 
 set(COMPILE_OPTIONS
     -I "${CMAKE_CURRENT_SOURCE_DIR}"
-    -O3
     -T lib_${SM}
 )
 

From e35e61dbb9b8ea91bbc42540cf58e9e92548dd27 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 5 Dec 2025 22:26:15 +0700
Subject: [PATCH 29/57] Example 73 to 15 and fix compile error

---
 {73_Mortons => 14_Mortons}/CMakeLists.txt     |   0
 14_Mortons/CTester.h                          | 405 ++++++++++++++++++
 {73_Mortons => 14_Mortons}/ITester.h          |   0
 .../app_resources/common.hlsl                 |  41 ++
 .../app_resources/test.comp.hlsl              |   0
 14_Mortons/app_resources/testCommon.hlsl      | 253 +++++++++++
 .../config.json.template                      |   0
 {73_Mortons => 14_Mortons}/main.cpp           |   0
 {73_Mortons => 14_Mortons}/pipeline.groovy    |   0
 73_Mortons/CTester.h                          |  84 ++--
 73_Mortons/app_resources/testCommon.hlsl      |  89 ++--
 CMakeLists.txt                                |   2 +-
 12 files changed, 793 insertions(+), 81 deletions(-)
 rename {73_Mortons => 14_Mortons}/CMakeLists.txt (100%)
 create mode 100644 14_Mortons/CTester.h
 rename {73_Mortons => 14_Mortons}/ITester.h (100%)
 rename {73_Mortons => 14_Mortons}/app_resources/common.hlsl (90%)
 rename {73_Mortons => 14_Mortons}/app_resources/test.comp.hlsl (100%)
 create mode 100644 14_Mortons/app_resources/testCommon.hlsl
 rename {73_Mortons => 14_Mortons}/config.json.template (100%)
 rename {73_Mortons => 14_Mortons}/main.cpp (100%)
 rename {73_Mortons => 14_Mortons}/pipeline.groovy (100%)

diff --git a/73_Mortons/CMakeLists.txt b/14_Mortons/CMakeLists.txt
similarity index 100%
rename from 73_Mortons/CMakeLists.txt
rename to 14_Mortons/CMakeLists.txt
diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
new file mode 100644
index 000000000..4c8b4276e
--- /dev/null
+++ b/14_Mortons/CTester.h
@@ -0,0 +1,405 @@
+#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
+#define _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
+
+#include <nabla.h>
+#include "app_resources/testCommon.hlsl"
+#include "ITester.h"
+
+using namespace nbl;
+
+class CTester final : public ITester
+{
+public:
+    void performTests()
+    {
+        std::random_device rd;
+        std::mt19937 mt(rd());
+
+        std::uniform_int_distribution<uint16_t> shortDistribution(uint16_t(0), std::numeric_limits<uint16_t>::max());
+        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
+        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
+
+        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
+        for (int i = 0; i < Iterations; ++i)
+        {
+            // Set input thest values that will be used in both CPU and GPU tests
+            InputTestValues testInput;
+            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
+            TestValues expected;
+
+            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
+            testInput.shift = generatedShift;
+            {
+                uint64_t generatedA = longDistribution(mt);
+                uint64_t generatedB = longDistribution(mt);
+
+                testInput.generatedA = generatedA;
+                testInput.generatedB = generatedB;
+
+                expected.emulatedAnd = _static_cast<emulated_uint64_t>(generatedA & generatedB);
+                expected.emulatedOr = _static_cast<emulated_uint64_t>(generatedA | generatedB);
+                expected.emulatedXor = _static_cast<emulated_uint64_t>(generatedA ^ generatedB);
+                expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
+                expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
+                expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
+                expected.emulatedUnaryMinus = _static_cast<emulated_int64_t>(-generatedA);
+                expected.emulatedLess = uint32_t(generatedA < generatedB);
+                expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
+                expected.emulatedGreater = uint32_t(generatedA > generatedB);
+                expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB);
+
+                expected.emulatedLeftShifted = _static_cast<emulated_uint64_t>(generatedA << generatedShift);
+                expected.emulatedUnsignedRightShifted = _static_cast<emulated_uint64_t>(generatedA >> generatedShift);
+                expected.emulatedSignedRightShifted = _static_cast<emulated_int64_t>(static_cast<int64_t>(generatedA) >> generatedShift);
+            }
+            {
+                testInput.coordX = longDistribution(mt);
+                testInput.coordY = longDistribution(mt);
+                testInput.coordZ = longDistribution(mt);
+                testInput.coordW = longDistribution(mt);
+
+                uint64_t2 Vec2A = { testInput.coordX, testInput.coordY };
+                uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW };
+
+                uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 );
+                uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 );
+                uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2);
+                uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2);
+                uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2);
+                uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2);
+
+                uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ };
+                uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW };
+
+                uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3);
+                uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3);
+                uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3);
+                uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3);
+                uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3);
+                uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3);
+
+                uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW };
+                uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX };
+
+                uint16_t4 Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4);
+                uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4);
+                uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4);
+                uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4);
+                uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4);
+                uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4);
+
+                // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them
+                // so their highest bits are all 0s or 1s depending on the sign of the number they encode
+
+                int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
+                int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
+                int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
+                int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
+                int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
+                int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
+
+                int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
+                int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
+                int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
+                int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
+                int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
+                int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
+
+                int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
+                int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
+                int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
+                int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
+                int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
+                int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
+
+                // Plus
+                expected.mortonPlus_small_2 = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2ASmall + Vec2BSmall);
+                expected.mortonPlus_medium_2 = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2AMedium + Vec2BMedium);
+                expected.mortonPlus_full_2 = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2AFull + Vec2BFull);
+                expected.mortonPlus_emulated_2 = createMortonFromU64Vec<false, fullBits_2, 2, emulated_uint64_t>(Vec2AFull + Vec2BFull);
+
+                expected.mortonPlus_small_3 = createMortonFromU64Vec<false, smallBits_3, 3>(Vec3ASmall + Vec3BSmall);
+                expected.mortonPlus_medium_3 = createMortonFromU64Vec<false, mediumBits_3, 3>(Vec3AMedium + Vec3BMedium);
+                expected.mortonPlus_full_3 = createMortonFromU64Vec<false, fullBits_3, 3>(Vec3AFull + Vec3BFull);
+                expected.mortonPlus_emulated_3 = createMortonFromU64Vec<false, fullBits_3, 3, emulated_uint64_t>(Vec3AFull + Vec3BFull);
+
+                expected.mortonPlus_small_4 = createMortonFromU64Vec<false, smallBits_4, 4>(Vec4ASmall + Vec4BSmall);
+                expected.mortonPlus_medium_4 = createMortonFromU64Vec<false, mediumBits_4, 4>(Vec4AMedium + Vec4BMedium);
+                expected.mortonPlus_full_4 = createMortonFromU64Vec<false, fullBits_4, 4>(Vec4AFull + Vec4BFull);
+                expected.mortonPlus_emulated_4 = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4AFull + Vec4BFull);
+
+                // Minus
+                expected.mortonMinus_small_2 = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2ASmall - Vec2BSmall);
+                expected.mortonMinus_medium_2 = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2AMedium - Vec2BMedium);
+                expected.mortonMinus_full_2 = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2AFull - Vec2BFull);
+                expected.mortonMinus_emulated_2 = createMortonFromU64Vec<false, fullBits_2, 2, emulated_uint64_t>(Vec2AFull - Vec2BFull);
+
+                expected.mortonMinus_small_3 = createMortonFromU64Vec<false, smallBits_3, 3>(Vec3ASmall - Vec3BSmall);
+                expected.mortonMinus_medium_3 = createMortonFromU64Vec<false, mediumBits_3, 3>(Vec3AMedium - Vec3BMedium);
+                expected.mortonMinus_full_3 = createMortonFromU64Vec<false, fullBits_3, 3>(Vec3AFull - Vec3BFull);
+                expected.mortonMinus_emulated_3 = createMortonFromU64Vec<false, fullBits_3, 3, emulated_uint64_t>(Vec3AFull - Vec3BFull);
+
+                expected.mortonMinus_small_4 = createMortonFromU64Vec<false, smallBits_4, 4>(Vec4ASmall - Vec4BSmall);
+                expected.mortonMinus_medium_4 = createMortonFromU64Vec<false, mediumBits_4, 4>(Vec4AMedium - Vec4BMedium);
+                expected.mortonMinus_full_4 = createMortonFromU64Vec<false, fullBits_4, 4>(Vec4AFull - Vec4BFull);
+                expected.mortonMinus_emulated_4 = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4AFull - Vec4BFull);
+
+                // Coordinate-wise equality
+                expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall));
+                expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium));
+                expected.mortonEqual_full_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
+                expected.mortonEqual_emulated_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
+
+                expected.mortonEqual_small_3 = uint32_t3(glm::equal(Vec3ASmall, Vec3BSmall));
+                expected.mortonEqual_medium_3 = uint32_t3(glm::equal(Vec3AMedium, Vec3BMedium));
+                expected.mortonEqual_full_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
+                expected.mortonEqual_emulated_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
+
+                expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall));
+                expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium));
+                expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull));
+
+                // Coordinate-wise unsigned inequality (just testing with less)
+                expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall));
+                expected.mortonUnsignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2AMedium, Vec2BMedium));
+                expected.mortonUnsignedLess_full_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
+                expected.mortonUnsignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
+
+                expected.mortonUnsignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASmall, Vec3BSmall));
+                expected.mortonUnsignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3AMedium, Vec3BMedium));
+                expected.mortonUnsignedLess_full_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
+                expected.mortonUnsignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
+
+                expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall));
+                expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium));
+                expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
+
+                // Coordinate-wise signed inequality
+                expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall));
+                expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium));
+                expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
+
+                expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall));
+                expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium));
+                expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
+
+                expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall));
+                expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium));
+                expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
+
+                uint16_t castedShift = uint16_t(generatedShift);
+                // Left-shift
+                expected.mortonLeftShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
+                expected.mortonLeftShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
+                expected.mortonLeftShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+                expected.mortonLeftShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+
+                expected.mortonLeftShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
+                expected.mortonLeftShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
+                expected.mortonLeftShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+                expected.mortonLeftShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+
+                expected.mortonLeftShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
+                expected.mortonLeftShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
+                expected.mortonLeftShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+                expected.mortonLeftShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+
+                // Unsigned right-shift
+                expected.mortonUnsignedRightShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
+                expected.mortonUnsignedRightShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
+                expected.mortonUnsignedRightShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
+                expected.mortonUnsignedRightShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2));
+
+                expected.mortonUnsignedRightShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall >> uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
+                expected.mortonUnsignedRightShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
+                expected.mortonUnsignedRightShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
+                expected.mortonUnsignedRightShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3));
+
+                expected.mortonUnsignedRightShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
+                expected.mortonUnsignedRightShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
+                expected.mortonUnsignedRightShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
+                expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4));
+            
+                // Signed right-shift
+                expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
+                expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
+                expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
+
+                expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
+                expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
+                expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
+
+                expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
+                expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
+                expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
+            }
+
+            performCpuTests(testInput, expected);
+            performGpuTests(testInput, expected);
+        }
+        m_logger->log("FIRST TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
+    }
+
+private:
+    inline static constexpr int Iterations = 100u;
+
+    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues cpuTestValues;
+
+        fillTestValues(commonTestInputValues, cpuTestValues);
+        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
+
+    }
+
+    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues gpuTestValues;
+        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
+        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
+    }
+
+    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
+    {
+        verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
+        verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
+        verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
+        verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType);
+        verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType);
+        verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType);
+        verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType);
+        verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType);
+        verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType);
+        verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType);
+        verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
+        verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
+        verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
+        verifyTestValue("emulatedUnaryMinus", expectedTestValues.emulatedUnaryMinus, testValues.emulatedUnaryMinus, testType);
+
+        // // Morton Plus
+        verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType);
+        verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType);
+        verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType);
+        verifyTestValue("mortonPlus_emulated_2", expectedTestValues.mortonPlus_emulated_2, testValues.mortonPlus_emulated_2, testType);
+        
+        verifyTestValue("mortonPlus_small_3", expectedTestValues.mortonPlus_small_3, testValues.mortonPlus_small_3, testType);
+        verifyTestValue("mortonPlus_medium_3", expectedTestValues.mortonPlus_medium_3, testValues.mortonPlus_medium_3, testType);
+        verifyTestValue("mortonPlus_full_3", expectedTestValues.mortonPlus_full_3, testValues.mortonPlus_full_3, testType);
+        verifyTestValue("mortonPlus_emulated_3", expectedTestValues.mortonPlus_emulated_3, testValues.mortonPlus_emulated_3, testType);
+        
+        verifyTestValue("mortonPlus_small_4", expectedTestValues.mortonPlus_small_4, testValues.mortonPlus_small_4, testType);
+        verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType);
+        verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType);
+        verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType);
+
+        // // Morton Minus
+        verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType);
+        verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType);
+        verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType);
+        verifyTestValue("mortonMinus_emulated_2", expectedTestValues.mortonMinus_emulated_2, testValues.mortonMinus_emulated_2, testType);
+        
+        verifyTestValue("mortonMinus_small_3", expectedTestValues.mortonMinus_small_3, testValues.mortonMinus_small_3, testType);
+        verifyTestValue("mortonMinus_medium_3", expectedTestValues.mortonMinus_medium_3, testValues.mortonMinus_medium_3, testType);
+        verifyTestValue("mortonMinus_full_3", expectedTestValues.mortonMinus_full_3, testValues.mortonMinus_full_3, testType);
+        verifyTestValue("mortonMinus_emulated_3", expectedTestValues.mortonMinus_emulated_3, testValues.mortonMinus_emulated_3, testType);
+        
+        verifyTestValue("mortonMinus_small_4", expectedTestValues.mortonMinus_small_4, testValues.mortonMinus_small_4, testType);
+        verifyTestValue("mortonMinus_medium_4", expectedTestValues.mortonMinus_medium_4, testValues.mortonMinus_medium_4, testType);
+        verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType);
+        verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType);
+        
+        // // Morton coordinate-wise equality
+        verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType);
+        verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType);
+        verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType);
+        verifyTestValue("mortonEqual_emulated_2", expectedTestValues.mortonEqual_emulated_2, testValues.mortonEqual_emulated_2, testType);
+        
+        verifyTestValue("mortonEqual_small_3", expectedTestValues.mortonEqual_small_3, testValues.mortonEqual_small_3, testType);
+        verifyTestValue("mortonEqual_medium_3", expectedTestValues.mortonEqual_medium_3, testValues.mortonEqual_medium_3, testType);
+        verifyTestValue("mortonEqual_full_3", expectedTestValues.mortonEqual_full_3, testValues.mortonEqual_full_3, testType);
+        verifyTestValue("mortonEqual_emulated_3", expectedTestValues.mortonEqual_emulated_3, testValues.mortonEqual_emulated_3, testType);
+        
+        verifyTestValue("mortonEqual_small_4", expectedTestValues.mortonEqual_small_4, testValues.mortonEqual_small_4, testType);
+        verifyTestValue("mortonEqual_medium_4", expectedTestValues.mortonEqual_medium_4, testValues.mortonEqual_medium_4, testType);
+        verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType);
+        verifyTestValue("mortonEqual_emulated_4", expectedTestValues.mortonEqual_emulated_4, testValues.mortonEqual_emulated_4, testType);
+        
+        // // Morton coordinate-wise unsigned inequality
+        verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType);
+        verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType);
+        verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType);
+        verifyTestValue("mortonUnsignedLess_emulated_2", expectedTestValues.mortonUnsignedLess_emulated_2, testValues.mortonUnsignedLess_emulated_2, testType);
+        
+        verifyTestValue("mortonUnsignedLess_small_3", expectedTestValues.mortonUnsignedLess_small_3, testValues.mortonUnsignedLess_small_3, testType);
+        verifyTestValue("mortonUnsignedLess_medium_3", expectedTestValues.mortonUnsignedLess_medium_3, testValues.mortonUnsignedLess_medium_3, testType);
+        verifyTestValue("mortonUnsignedLess_full_3", expectedTestValues.mortonUnsignedLess_full_3, testValues.mortonUnsignedLess_full_3, testType);
+        verifyTestValue("mortonUnsignedLess_emulated_3", expectedTestValues.mortonUnsignedLess_emulated_3, testValues.mortonUnsignedLess_emulated_3, testType);
+        
+        verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType);
+        verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType);
+        verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType);
+        
+        // // Morton coordinate-wise signed inequality
+        verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
+        verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
+        verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
+        verifyTestValue("mortonSignedLess_emulated_2", expectedTestValues.mortonSignedLess_emulated_2, testValues.mortonSignedLess_emulated_2, testType);
+        
+        verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType);
+        verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType);
+        verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType);
+        verifyTestValue("mortonSignedLess_emulated_3", expectedTestValues.mortonSignedLess_emulated_3, testValues.mortonSignedLess_emulated_3, testType);
+        
+        verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType);
+        verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
+        verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
+        verifyTestValue("mortonSignedLess_emulated_4", expectedTestValues.mortonSignedLess_emulated_4, testValues.mortonSignedLess_emulated_4, testType);
+        
+        // // Morton left-shift
+        verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
+        verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType);
+        verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType);
+        verifyTestValue("mortonLeftShift_emulated_2", expectedTestValues.mortonLeftShift_emulated_2, testValues.mortonLeftShift_emulated_2, testType);
+        
+        verifyTestValue("mortonLeftShift_small_3", expectedTestValues.mortonLeftShift_small_3, testValues.mortonLeftShift_small_3, testType);
+        verifyTestValue("mortonLeftShift_medium_3", expectedTestValues.mortonLeftShift_medium_3, testValues.mortonLeftShift_medium_3, testType);
+        verifyTestValue("mortonLeftShift_full_3", expectedTestValues.mortonLeftShift_full_3, testValues.mortonLeftShift_full_3, testType);
+        verifyTestValue("mortonLeftShift_emulated_3", expectedTestValues.mortonLeftShift_emulated_3, testValues.mortonLeftShift_emulated_3, testType);
+        
+        verifyTestValue("mortonLeftShift_small_4", expectedTestValues.mortonLeftShift_small_4, testValues.mortonLeftShift_small_4, testType);
+        verifyTestValue("mortonLeftShift_medium_4", expectedTestValues.mortonLeftShift_medium_4, testValues.mortonLeftShift_medium_4, testType);
+        verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType);
+        verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType);
+        
+        // // Morton unsigned right-shift
+        verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_2", expectedTestValues.mortonUnsignedRightShift_emulated_2, testValues.mortonUnsignedRightShift_emulated_2, testType);
+        
+        verifyTestValue("mortonUnsignedRightShift_small_3", expectedTestValues.mortonUnsignedRightShift_small_3, testValues.mortonUnsignedRightShift_small_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_3", expectedTestValues.mortonUnsignedRightShift_medium_3, testValues.mortonUnsignedRightShift_medium_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_3", expectedTestValues.mortonUnsignedRightShift_full_3, testValues.mortonUnsignedRightShift_full_3, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_3", expectedTestValues.mortonUnsignedRightShift_emulated_3, testValues.mortonUnsignedRightShift_emulated_3, testType);
+        
+        verifyTestValue("mortonUnsignedRightShift_small_4", expectedTestValues.mortonUnsignedRightShift_small_4, testValues.mortonUnsignedRightShift_small_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_medium_4", expectedTestValues.mortonUnsignedRightShift_medium_4, testValues.mortonUnsignedRightShift_medium_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType);
+        verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType);
+        
+        // // Morton signed right-shift
+        verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType);
+        verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType);
+        verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType);
+        
+        verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType);
+        verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType);
+        verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType);
+        
+        verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType);
+        verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType);
+        verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType);
+    }
+};
+
+#endif
\ No newline at end of file
diff --git a/73_Mortons/ITester.h b/14_Mortons/ITester.h
similarity index 100%
rename from 73_Mortons/ITester.h
rename to 14_Mortons/ITester.h
diff --git a/73_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
similarity index 90%
rename from 73_Mortons/app_resources/common.hlsl
rename to 14_Mortons/app_resources/common.hlsl
index 18cdc058f..237e3260e 100644
--- a/73_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -19,6 +19,10 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4;
 NBL_CONSTEXPR uint16_t mediumBits_4 = 8;
 NBL_CONSTEXPR uint16_t fullBits_4 = 16;
 
+template <typename T, uint16_t Bits>
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR T bitMask = (uint64_t(1) << Bits) - 1;
+
+
 #ifndef __HLSL_VERSION
 
 constexpr uint64_t smallBitsMask_2 = (uint64_t(1) << smallBits_2) - 1;
@@ -36,6 +40,42 @@ constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1;
 #endif
 
 using namespace nbl::hlsl;
+template <typename T, bool Signed, uint16_t Bits>
+T createAnyBitIntegerFromU64(uint64_t val)
+{
+  if(Signed && (_static_cast<int64_t>(val) < 0))
+  {
+    // fill excess bit with one
+    return T(val) | ~bitMask<T, Bits>;
+  } else
+  {
+    return T(val) & bitMask<T, Bits>;
+    
+  }
+}
+
+template <typename T, bool Signed, uint16_t Bits, uint16_t D>
+vector<T, D> createAnyBitIntegerVecFromU64Vec(vector<uint64_t, D> val)
+{
+  array_get<portable_vector_t<T, D>, T> getter;
+  array_set<portable_vector_t<T, D>, T> setter;
+	vector<T, D> output;
+  NBL_UNROLL
+	for (uint16_t i = 0; i < D; i++)
+	{
+		setter(output, i, createAnyBitIntegerFromU64<T, Signed, Bits>(getter(val, i)));
+	}
+	return output;
+}
+
+template <bool Signed, uint16_t Bits, uint16_t D, typename _uint64_t = uint64_t>
+morton::code<Signed, Bits, D, _uint64_t> createMortonFromU64Vec(const vector<uint64_t, D> vec)
+{
+	using morton_code_t = morton::code<Signed, Bits, D, _uint64_t>;
+	using decode_component_t = typename morton_code_t::decode_component_t;
+	return morton_code_t::create(createAnyBitIntegerVecFromU64Vec<decode_component_t, Signed, Bits, D>(vec));
+}
+
 struct InputTestValues
 {
 	// Both tests
@@ -203,6 +243,7 @@ struct TestValues
 	morton::code<true, fullBits_4, 4>					  mortonSignedRightShift_full_4;
 	morton::code<true, fullBits_4, 4, emulated_uint64_t>  mortonSignedRightShift_emulated_4;
 
+	
 	/*
 	void fillSecondTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
 	{
diff --git a/73_Mortons/app_resources/test.comp.hlsl b/14_Mortons/app_resources/test.comp.hlsl
similarity index 100%
rename from 73_Mortons/app_resources/test.comp.hlsl
rename to 14_Mortons/app_resources/test.comp.hlsl
diff --git a/14_Mortons/app_resources/testCommon.hlsl b/14_Mortons/app_resources/testCommon.hlsl
new file mode 100644
index 000000000..dbe6ddbd2
--- /dev/null
+++ b/14_Mortons/app_resources/testCommon.hlsl
@@ -0,0 +1,253 @@
+#include "common.hlsl"
+
+
+void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output)
+{
+	emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
+	emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
+	emulated_int64_t signedEmulatedA = _static_cast<emulated_int64_t>(input.generatedA);
+
+	// Emulated int tests
+	output.emulatedAnd = emulatedA & emulatedB;
+	output.emulatedOr = emulatedA | emulatedB;
+	output.emulatedXor = emulatedA ^ emulatedB;
+	output.emulatedNot = emulatedA.operator~();
+	output.emulatedPlus = emulatedA + emulatedB;
+	output.emulatedMinus = emulatedA - emulatedB;
+	output.emulatedLess = uint32_t(emulatedA < emulatedB);
+	output.emulatedLessEqual = uint32_t(emulatedA <= emulatedB);
+	output.emulatedGreater = uint32_t(emulatedA > emulatedB);
+	output.emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB);
+
+	left_shift_operator<emulated_uint64_t> leftShift;
+	output.emulatedLeftShifted = leftShift(emulatedA, input.shift);
+
+	arithmetic_right_shift_operator<emulated_uint64_t> unsignedRightShift;
+	output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
+
+	arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
+	output.emulatedSignedRightShifted = signedRightShift(signedEmulatedA, input.shift);
+
+	output.emulatedUnaryMinus = signedEmulatedA.operator-();
+
+	// Morton tests
+	uint64_t2 Vec2A = { input.coordX, input.coordY };
+	uint64_t2 Vec2B = { input.coordZ, input.coordW };
+
+	uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
+	uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
+
+	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
+	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
+
+	morton::code<false, smallBits_2, 2> morton_small_2A = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2A);
+	morton::code<false, mediumBits_2, 2> morton_medium_2A = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2A);
+	morton::code<false, fullBits_2, 2> morton_full_2A = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2A);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = createMortonFromU64Vec<false, fullBits_2, 2, emulated_uint64_t>(Vec2A);
+	morton::code<false, smallBits_2, 2> morton_small_2B = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2B);
+	morton::code<false, mediumBits_2, 2> morton_medium_2B = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2B);
+	morton::code<false, fullBits_2, 2> morton_full_2B = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2B);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = createMortonFromU64Vec<false, fullBits_2, 2, emulated_uint64_t>(Vec2B);
+	
+	morton::code<false, smallBits_3, 3> morton_small_3A = createMortonFromU64Vec<false, smallBits_3, 3>(Vec3A);
+	morton::code<false, mediumBits_3, 3> morton_medium_3A = createMortonFromU64Vec<false, mediumBits_3, 3>(Vec3A);
+	morton::code<false, fullBits_3, 3> morton_full_3A = createMortonFromU64Vec<false, fullBits_3, 3>(Vec3A);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = createMortonFromU64Vec<false, fullBits_3, 3, emulated_uint64_t>(Vec3A);
+	morton::code<false, smallBits_3, 3> morton_small_3B = createMortonFromU64Vec<false, smallBits_3, 3>(Vec3B);
+	morton::code<false, mediumBits_3, 3> morton_medium_3B = createMortonFromU64Vec<false, mediumBits_3, 3>(Vec3B);
+	morton::code<false, fullBits_3, 3> morton_full_3B = createMortonFromU64Vec<false, fullBits_3, 3>(Vec3B);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = createMortonFromU64Vec<false, fullBits_3, 3, emulated_uint64_t>(Vec3B);
+	
+	morton::code<false, smallBits_4, 4> morton_small_4A = createMortonFromU64Vec<false, smallBits_4, 4>(Vec4A);
+	morton::code<false, mediumBits_4, 4> morton_medium_4A = createMortonFromU64Vec<false, mediumBits_4, 4>(Vec4A);
+	morton::code<false, fullBits_4, 4> morton_full_4A = createMortonFromU64Vec<false, fullBits_4, 4>(Vec4A);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4A);
+	morton::code<false, smallBits_4, 4> morton_small_4B = createMortonFromU64Vec<false, smallBits_4, 4>(Vec4B);
+	morton::code<false, mediumBits_4, 4> morton_medium_4B = createMortonFromU64Vec<false, mediumBits_4, 4>(Vec4B);
+	morton::code<false, fullBits_4, 4> morton_full_4B = createMortonFromU64Vec<false, fullBits_4, 4>(Vec4B);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4B);
+	
+	morton::code<true, smallBits_2, 2> morton_small_2_signed = createMortonFromU64Vec<true, smallBits_2, 2>(Vec2A);
+	morton::code<true, mediumBits_2, 2> morton_medium_2_signed = createMortonFromU64Vec<true, mediumBits_2, 2>(Vec2A);
+	morton::code<true, fullBits_2, 2> morton_full_2_signed = createMortonFromU64Vec<true, fullBits_2, 2>(Vec2A);
+	morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2_signed = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2A);
+	
+	morton::code<true, smallBits_3, 3> morton_small_3_signed = createMortonFromU64Vec<true, smallBits_3, 3>(Vec3A);
+	morton::code<true, mediumBits_3, 3> morton_medium_3_signed = createMortonFromU64Vec<true, mediumBits_3, 3>(Vec3A);
+	morton::code<true, fullBits_3, 3> morton_full_3_signed = createMortonFromU64Vec<true, fullBits_3, 3>(Vec3A);
+	morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3_signed = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3A);
+	
+	morton::code<true, smallBits_4, 4> morton_small_4_signed = createMortonFromU64Vec<true, smallBits_4, 4>(Vec4A);
+	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = createMortonFromU64Vec<true, mediumBits_4, 4>(Vec4A);
+	morton::code<true, fullBits_4, 4> morton_full_4_signed = createMortonFromU64Vec<true, fullBits_4, 4>(Vec4A);
+	morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A);
+	
+	// Plus
+	output.mortonPlus_small_2 = morton_small_2A + morton_small_2B;
+	output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
+	output.mortonPlus_full_2 = morton_full_2A + morton_full_2B;
+	output.mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B;
+	
+	output.mortonPlus_small_3 = morton_small_3A + morton_small_3B;
+	output.mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B;
+	output.mortonPlus_full_3 = morton_full_3A + morton_full_3B;
+	output.mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B;
+	
+	output.mortonPlus_small_4 = morton_small_4A + morton_small_4B;
+	output.mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B;
+	output.mortonPlus_full_4 = morton_full_4A + morton_full_4B;
+	output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
+	
+	// // Minus
+	output.mortonMinus_small_2 = morton_small_2A - morton_small_2B;
+	output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
+	output.mortonMinus_full_2 = morton_full_2A - morton_full_2B;
+	output.mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B;
+	
+	output.mortonMinus_small_3 = morton_small_3A - morton_small_3B;
+	output.mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B;
+	output.mortonMinus_full_3 = morton_full_3A - morton_full_3B;
+	output.mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B;
+	
+	output.mortonMinus_small_4 = morton_small_4A - morton_small_4B;
+	output.mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B;
+	output.mortonMinus_full_4 = morton_full_4A - morton_full_4B;
+	output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
+	
+	// // Coordinate-wise equality
+	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
+	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
+	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
+	output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
+	
+	output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
+	output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
+	output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
+	output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
+	
+	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
+	// output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
+	
+	// Coordinate-wise unsigned inequality (just testing with less)
+	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
+	output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
+	output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
+	output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
+	
+	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
+	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
+	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
+	// output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
+	
+	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
+	// output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
+	// less(Vec4A, Vec4B);
+	
+	// Coordinate-wise signed inequality
+	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(int16_t2(Vec2B)));
+	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(int16_t2(Vec2B)));
+	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(int32_t2(Vec2B)));
+	// output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(int32_t2(Vec2B))); 
+	
+	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(int16_t3(Vec3B)));
+	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(int16_t3(Vec3B)));
+	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(int32_t3(Vec3B)));
+	output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(int32_t3(Vec3B))); 
+	
+	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(int16_t4(Vec4B)));
+	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(int16_t4(Vec4B)));
+	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(int16_t4(Vec4B)));
+	// output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(int16_t4(Vec4B))); 
+	
+	// // Cast to uint16_t which is what left shift for Mortons expect
+	uint16_t castedShift = uint16_t(input.shift);
+	// // Each left shift clamps to correct bits so the result kinda makes sense
+	// // Left-shift
+	left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
+	output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2);
+	left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
+	output.mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
+	left_shift_operator<morton::code<false, fullBits_2, 2> > leftShiftFull2;
+	output.mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift % fullBits_2);
+	left_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > leftShiftEmulated2;
+	output.mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
+	
+	left_shift_operator<morton::code<false, smallBits_3, 3> > leftShiftSmall3;
+	output.mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift % smallBits_3);
+	left_shift_operator<morton::code<false, mediumBits_3, 3> > leftShiftMedium3;
+	output.mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
+	left_shift_operator<morton::code<false, fullBits_3, 3> > leftShiftFull3;
+	output.mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift % fullBits_3);
+	left_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > leftShiftEmulated3;
+	output.mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
+	
+	left_shift_operator<morton::code<false, smallBits_4, 4> > leftShiftSmall4;
+	output.mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift % smallBits_4);
+	left_shift_operator<morton::code<false, mediumBits_4, 4> > leftShiftMedium4;
+	output.mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
+	left_shift_operator<morton::code<false, fullBits_4, 4> > leftShiftFull4;
+	output.mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift % fullBits_4);
+	left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
+	output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
+	
+	// // Unsigned right-shift
+	arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
+	output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
+	output.mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2> > rightShiftFull2;
+	output.mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift % fullBits_2);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > rightShiftEmulated2;
+	output.mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
+	
+	arithmetic_right_shift_operator<morton::code<false, smallBits_3, 3> > rightShiftSmall3;
+	output.mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift % smallBits_3);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_3, 3> > rightShiftMedium3;
+	output.mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3> > rightShiftFull3;
+	output.mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift % fullBits_3);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > rightShiftEmulated3;
+	output.mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
+	
+	arithmetic_right_shift_operator<morton::code<false, smallBits_4, 4> > rightShiftSmall4;
+	output.mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift % smallBits_4);
+	arithmetic_right_shift_operator<morton::code<false, mediumBits_4, 4> > rightShiftMedium4;
+	output.mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4> > rightShiftFull4;
+	output.mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift % fullBits_4);
+	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
+	output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
+	
+	// // Signed right-shift
+	arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
+	output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
+	output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
+	output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2);
+	
+	arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
+	output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_3, 3> > rightShiftSignedMedium3;
+	output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
+	output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3);
+	
+	arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
+	output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4);
+	arithmetic_right_shift_operator<morton::code<true, mediumBits_4, 4> > rightShiftSignedMedium4;
+	output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4);
+	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
+	output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4);
+
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
+	// output.mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); 
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
+	// output.mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); 
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
+	// output.mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); 
+}
\ No newline at end of file
diff --git a/73_Mortons/config.json.template b/14_Mortons/config.json.template
similarity index 100%
rename from 73_Mortons/config.json.template
rename to 14_Mortons/config.json.template
diff --git a/73_Mortons/main.cpp b/14_Mortons/main.cpp
similarity index 100%
rename from 73_Mortons/main.cpp
rename to 14_Mortons/main.cpp
diff --git a/73_Mortons/pipeline.groovy b/14_Mortons/pipeline.groovy
similarity index 100%
rename from 73_Mortons/pipeline.groovy
rename to 14_Mortons/pipeline.groovy
diff --git a/73_Mortons/CTester.h b/73_Mortons/CTester.h
index fa29f3c9c..b4097dad6 100644
--- a/73_Mortons/CTester.h
+++ b/73_Mortons/CTester.h
@@ -113,37 +113,37 @@ class CTester final : public ITester
                 int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
 
                 // Plus
-                expected.mortonPlus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall + Vec2BSmall);
-                expected.mortonPlus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium + Vec2BMedium);
-                expected.mortonPlus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull + Vec2BFull);
-                expected.mortonPlus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull + Vec2BFull);
-
-                expected.mortonPlus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall + Vec3BSmall);
-                expected.mortonPlus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium + Vec3BMedium);
-                expected.mortonPlus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull + Vec3BFull);
-                expected.mortonPlus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull + Vec3BFull);
-
-                expected.mortonPlus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall + Vec4BSmall);
-                expected.mortonPlus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium + Vec4BMedium);
-                expected.mortonPlus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull + Vec4BFull);
-                expected.mortonPlus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull + Vec4BFull);
-
-                // Minus
-                expected.mortonMinus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall - Vec2BSmall);
-                expected.mortonMinus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium - Vec2BMedium);
-                expected.mortonMinus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull - Vec2BFull);
-                expected.mortonMinus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull - Vec2BFull);
-
-                expected.mortonMinus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall - Vec3BSmall);
-                expected.mortonMinus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium - Vec3BMedium);
-                expected.mortonMinus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull - Vec3BFull);
-                expected.mortonMinus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull - Vec3BFull);
-
-                expected.mortonMinus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall - Vec4BSmall);
-                expected.mortonMinus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium - Vec4BMedium);
-                expected.mortonMinus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull - Vec4BFull);
-                expected.mortonMinus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull - Vec4BFull);
-
+                expected.mortonPlus_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall + Vec2BSmall) & static_cast<uint16_t>(smallBitsMask_2));
+                expected.mortonPlus_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium + Vec2BMedium) & static_cast<uint16_t>(mediumBitsMask_2));
+                expected.mortonPlus_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull + Vec2BFull) & static_cast<uint32_t>(fullBitsMask_2));
+                expected.mortonPlus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull + Vec2BFull) & static_cast<uint32_t>(fullBitsMask_2));
+
+                expected.mortonPlus_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall + Vec3BSmall) & static_cast<uint16_t>(smallBitsMask_3));
+                expected.mortonPlus_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium + Vec3BMedium) & static_cast<uint16_t>(mediumBitsMask_3));
+                expected.mortonPlus_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull + Vec3BFull) & static_cast<uint32_t>(fullBitsMask_3));
+                expected.mortonPlus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull + Vec3BFull) & static_cast<uint32_t>(fullBitsMask_3));
+
+                expected.mortonPlus_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall + Vec4BSmall) & static_cast<uint16_t>(smallBitsMask_4));
+                expected.mortonPlus_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium + Vec4BMedium) & static_cast<uint16_t>(mediumBitsMask_4));
+                expected.mortonPlus_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull + Vec4BFull) & static_cast<uint16_t>(fullBitsMask_4));
+                expected.mortonPlus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull + Vec4BFull) & static_cast<uint16_t>(fullBitsMask_4));
+
+                // // Minus
+                // expected.mortonMinus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall - Vec2BSmall);
+                // expected.mortonMinus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium - Vec2BMedium);
+                // expected.mortonMinus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull - Vec2BFull);
+                // expected.mortonMinus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull - Vec2BFull);
+                //
+                // expected.mortonMinus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall - Vec3BSmall);
+                // expected.mortonMinus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium - Vec3BMedium);
+                // expected.mortonMinus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull - Vec3BFull);
+                // expected.mortonMinus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull - Vec3BFull);
+                //
+                // expected.mortonMinus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall - Vec4BSmall);
+                // expected.mortonMinus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium - Vec4BMedium);
+                // expected.mortonMinus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull - Vec4BFull);
+                // expected.mortonMinus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull - Vec4BFull);
+                //
                 // Coordinate-wise equality
                 expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall));
                 expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium));
@@ -221,17 +221,17 @@ class CTester final : public ITester
                 expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4));
             
                 // Signed right-shift
-                expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
-                expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
-                expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
-
-                expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
-                expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
-                expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
-
-                expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
-                expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
-                expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
+                // expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
+                // expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
+                // expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
+                //
+                // expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
+                // expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
+                // expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
+                //
+                // expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
+                // expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
+                // expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
             }
 
             performCpuTests(testInput, expected);
diff --git a/73_Mortons/app_resources/testCommon.hlsl b/73_Mortons/app_resources/testCommon.hlsl
index 4ca2b859d..93205db62 100644
--- a/73_Mortons/app_resources/testCommon.hlsl
+++ b/73_Mortons/app_resources/testCommon.hlsl
@@ -1,5 +1,17 @@
 #include "common.hlsl"
 
+template <bool Signed, uint16_t Bits, uint16_t Dim, typename _uint64_t = uint64_t>
+morton::code<Signed, Bits, Dim, _uint64_t> createMortonFromAnyVec(vector<conditional_t<Signed, int64_t, uint64_t>, Dim> val)
+{
+	using morton_code_t = morton::code<Signed, Bits, Dim, _uint64_t>;
+	using decode_element_t = typename morton_code_t::decode_component_t ;
+	NBL_IF_CONSTEXPR(Signed)
+	{
+    return morton_code_t::create(_static_cast<vector<decode_element_t, Dim> >(val & ));
+	  
+	}
+}
+
 void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output)
 {
 	emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
@@ -48,44 +60,44 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	int64_t4 Vec4ASigned = int64_t4(Vec4A);
 	int64_t4 Vec4BSigned = int64_t4(Vec4B);
 
-	morton::code<false, smallBits_2, 2> morton_small_2A = morton::code<false, smallBits_2, 2>::create(Vec2A);
-	morton::code<false, mediumBits_2, 2> morton_medium_2A = morton::code<false, mediumBits_2, 2>::create(Vec2A);
-	morton::code<false, fullBits_2, 2> morton_full_2A = morton::code<false, fullBits_2, 2>::create(Vec2A);
-	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2A);
-	morton::code<false, smallBits_2, 2> morton_small_2B = morton::code<false, smallBits_2, 2>::create(Vec2B);
-	morton::code<false, mediumBits_2, 2> morton_medium_2B = morton::code<false, mediumBits_2, 2>::create(Vec2B);
-	morton::code<false, fullBits_2, 2> morton_full_2B = morton::code<false, fullBits_2, 2>::create(Vec2B);
-	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2B);
-
-	morton::code<false, smallBits_3, 3> morton_small_3A = morton::code<false, smallBits_3, 3>::create(Vec3A);
-	morton::code<false, mediumBits_3, 3> morton_medium_3A = morton::code<false, mediumBits_3, 3>::create(Vec3A);
-	morton::code<false, fullBits_3, 3> morton_full_3A = morton::code<false, fullBits_3, 3>::create(Vec3A);
-	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3A);
-	morton::code<false, smallBits_3, 3> morton_small_3B = morton::code<false, smallBits_3, 3>::create(Vec3B);
-	morton::code<false, mediumBits_3, 3> morton_medium_3B = morton::code<false, mediumBits_3, 3>::create(Vec3B);
-	morton::code<false, fullBits_3, 3> morton_full_3B = morton::code<false, fullBits_3, 3>::create(Vec3B);
-	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3B);
-
-	morton::code<false, smallBits_4, 4> morton_small_4A = morton::code<false, smallBits_4, 4>::create(Vec4A);
-	morton::code<false, mediumBits_4, 4> morton_medium_4A = morton::code<false, mediumBits_4, 4>::create(Vec4A);
-	morton::code<false, fullBits_4, 4> morton_full_4A = morton::code<false, fullBits_4, 4>::create(Vec4A);
-	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4A);
-	morton::code<false, smallBits_4, 4> morton_small_4B = morton::code<false, smallBits_4, 4>::create(Vec4B);
-	morton::code<false, mediumBits_4, 4> morton_medium_4B = morton::code<false, mediumBits_4, 4>::create(Vec4B);
-	morton::code<false, fullBits_4, 4> morton_full_4B = morton::code<false, fullBits_4, 4>::create(Vec4B);
-	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4B);
-
-	morton::code<true, smallBits_2, 2> morton_small_2_signed = morton::code<true, smallBits_2, 2>::create(Vec2ASigned);
-	morton::code<true, mediumBits_2, 2> morton_medium_2_signed = morton::code<true, mediumBits_2, 2>::create(Vec2ASigned);
-	morton::code<true, fullBits_2, 2> morton_full_2_signed = morton::code<true, fullBits_2, 2>::create(Vec2ASigned);
-
-	morton::code<true, smallBits_3, 3> morton_small_3_signed = morton::code<true, smallBits_3, 3>::create(Vec3ASigned);
-	morton::code<true, mediumBits_3, 3> morton_medium_3_signed = morton::code<true, mediumBits_3, 3>::create(Vec3ASigned);
-	morton::code<true, fullBits_3, 3> morton_full_3_signed = morton::code<true, fullBits_3, 3>::create(Vec3ASigned);
-
-	morton::code<true, smallBits_4, 4> morton_small_4_signed = morton::code<true, smallBits_4, 4>::create(Vec4ASigned);
-	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = morton::code<true, mediumBits_4, 4>::create(Vec4ASigned);
-	morton::code<true, fullBits_4, 4> morton_full_4_signed = morton::code<true, fullBits_4, 4>::create(Vec4ASigned);
+	morton::code<false, smallBits_2, 2> morton_small_2A = createMortonFromAnyVec<false, smallBits_2, 2>(Vec2A);
+	morton::code<false, mediumBits_2, 2> morton_medium_2A = createMortonFromAnyVec<false, mediumBits_2, 2>(Vec2A);
+	morton::code<false, fullBits_2, 2> morton_full_2A = createMortonFromAnyVec<false, fullBits_2, 2>(Vec2A);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = createMortonFromAnyVec<false, fullBits_2, 2, emulated_uint64_t>(Vec2A);
+	morton::code<false, smallBits_2, 2> morton_small_2B = createMortonFromAnyVec<false, smallBits_2, 2>(Vec2B);
+	morton::code<false, mediumBits_2, 2> morton_medium_2B = createMortonFromAnyVec<false, mediumBits_2, 2>(Vec2B);
+	morton::code<false, fullBits_2, 2> morton_full_2B = createMortonFromAnyVec<false, fullBits_2, 2>(Vec2B);
+	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = createMortonFromAnyVec<false, fullBits_2, 2, emulated_uint64_t>(Vec2B);
+
+	morton::code<false, smallBits_3, 3> morton_small_3A = createMortonFromAnyVec<false, smallBits_3, 3>(Vec3A);
+	morton::code<false, mediumBits_3, 3> morton_medium_3A = createMortonFromAnyVec<false, mediumBits_3, 3>(Vec3A);
+	morton::code<false, fullBits_3, 3> morton_full_3A = createMortonFromAnyVec<false, fullBits_3, 3>(Vec3A);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = createMortonFromAnyVec<false, fullBits_3, 3, emulated_uint64_t>(Vec3A);
+	morton::code<false, smallBits_3, 3> morton_small_3B = createMortonFromAnyVec<false, smallBits_3, 3>(Vec3B);
+	morton::code<false, mediumBits_3, 3> morton_medium_3B = createMortonFromAnyVec<false, mediumBits_3, 3>(Vec3B);
+	morton::code<false, fullBits_3, 3> morton_full_3B = createMortonFromAnyVec<false, fullBits_3, 3>(Vec3B);
+	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = createMortonFromAnyVec<false, fullBits_3, 3, emulated_uint64_t>(Vec3B);
+
+	morton::code<false, smallBits_4, 4> morton_small_4A = createMortonFromAnyVec<false, smallBits_4, 4>(Vec4A);
+	morton::code<false, mediumBits_4, 4> morton_medium_4A = createMortonFromAnyVec<false, mediumBits_4, 4>(Vec4A);
+	morton::code<false, fullBits_4, 4> morton_full_4A = createMortonFromAnyVec<false, fullBits_4, 4>(Vec4A);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = createMortonFromAnyVec<false, fullBits_4, 4, emulated_uint64_t>(Vec4A);
+	morton::code<false, smallBits_4, 4> morton_small_4B = createMortonFromAnyVec<false, smallBits_4, 4>(Vec4B);
+	morton::code<false, mediumBits_4, 4> morton_medium_4B = createMortonFromAnyVec<false, mediumBits_4, 4>(Vec4B);
+	morton::code<false, fullBits_4, 4> morton_full_4B = createMortonFromAnyVec<false, fullBits_4, 4>(Vec4B);
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = createMortonFromAnyVec<false, fullBits_4, 4, emulated_uint64_t>(Vec4B);
+
+	morton::code<true, smallBits_2, 2> morton_small_2_signed = createMortonFromAnyVec<true, smallBits_2, 2>(Vec2ASigned);
+	morton::code<true, mediumBits_2, 2> morton_medium_2_signed = createMortonFromAnyVec<true, mediumBits_2, 2>(Vec2ASigned);
+	morton::code<true, fullBits_2, 2> morton_full_2_signed = createMortonFromAnyVec<true, fullBits_2, 2>(Vec2ASigned);
+
+	morton::code<true, smallBits_3, 3> morton_small_3_signed = createMortonFromAnyVec<true, smallBits_3, 3>(Vec3ASigned);
+	morton::code<true, mediumBits_3, 3> morton_medium_3_signed = createMortonFromAnyVec<true, mediumBits_3, 3>(Vec3ASigned);
+	morton::code<true, fullBits_3, 3> morton_full_3_signed = createMortonFromAnyVec<true, fullBits_3, 3>(Vec3ASigned);
+
+	morton::code<true, smallBits_4, 4> morton_small_4_signed = createMortonFromAnyVec<true, smallBits_4, 4>(Vec4ASigned);
+	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = createMortonFromAnyVec<true, mediumBits_4, 4>(Vec4ASigned);
+	morton::code<true, fullBits_4, 4> morton_full_4_signed = createMortonFromAnyVec<true, fullBits_4, 4>(Vec4ASigned);
 
 	// Plus
 	output.mortonPlus_small_2 = morton_small_2A + morton_small_2B;
@@ -133,6 +145,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
 	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
 	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
 	
 	// Coordinate-wise unsigned inequality (just testing with less)
 	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b85577144..0f3c6bcb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(12_MeshLoaders)
 	#
 	add_subdirectory(13_MaterialCompilerTest)
+	add_subdirectory(14_Mortons EXCLUDE_FROM_ALL)
 
 	# Waiting for a refactor
 	#add_subdirectory(27_PLYSTLDemo)
@@ -87,7 +88,6 @@ if(NBL_BUILD_EXAMPLES)
 
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
-	add_subdirectory(73_Mortons EXCLUDE_FROM_ALL)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 93861bd59f85721993472e3de67f23bec6170363 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 6 Dec 2025 21:02:46 +0300
Subject: [PATCH 30/57] Make camera account for up direction, corrected
 framebuffer resolutions for both views, solid angle shader now outputs
 correct cube vertices correctly

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 157 +++++++++++-------
 72_SolidAngleVisualizer/include/transform.hpp |   2 +-
 72_SolidAngleVisualizer/main.cpp              | 134 ++++++++-------
 .../include/nbl/examples/cameras/CCamera.hpp  |  50 +++---
 4 files changed, 190 insertions(+), 153 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index d783a5b37..2ad766c8a 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -9,7 +9,7 @@ using namespace ext::FullScreenTriangle;
 
 [[vk::push_constant]] struct PushConstants pc;
 
-static const float CIRCLE_RADIUS = 0.45f;
+static const float CIRCLE_RADIUS = 0.75f;
 
 // --- Geometry Utils ---
 
@@ -33,17 +33,23 @@ static float3 corners[8];
 static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), 
                             float3(0,0,0), float3(0,0,0), float3(0,0,0) };
 static float2 projCorners[8];
+static bool   cornerVisible[8];
 
 
 // Converts UV into centered, aspect-corrected NDC circle space
 float2 toCircleSpace(float2 uv)
 {
-    float aspect = pc.viewport.z / pc.viewport.w;
-    float2 centered = uv - 0.5f;
-    centered.x *= aspect;
-    return centered;
+    // Map [0,1] UV to [-1,1]
+    float2 p = uv * 2.0f - 1.0f;
+
+    // Correct aspect ratio
+    float aspect = pc.viewport.z / pc.viewport.w; // width / height
+    p.x *= aspect;
+
+    return p;
 }
 
+
 // Distance to a 2D line segment
 float sdSegment(float2 p, float2 a, float2 b)
 {
@@ -54,9 +60,18 @@ float sdSegment(float2 p, float2 a, float2 b)
 }
 
 // TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection)
-float2 project(float3 p) 
+bool projectToOrthoSphere(float3 p, out float2 uv)
 {
-    return normalize(p).xy;
+    float3 n = normalize(p);   // direction to sphere
+
+    // hemisphere (Z > 0)
+    if (n.z <= 0.0)
+        return false;
+
+    // orthographic projection (drop Z)
+    uv = n.xy;
+
+    return true; // valid
 }
 
 void computeCubeGeo()
@@ -66,71 +81,72 @@ void computeCubeGeo()
         float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f;
         float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
         
-        corners[i] = worldPos;
+        corners[i] = worldPos.xyz;
         
         faceCenters[i/4]      += worldPos / 4.0f; 
         faceCenters[2+i%2]    += worldPos / 4.0f; 
         faceCenters[4+(i/2)%2] += worldPos / 4.0f; 
 
-        float3 viewPos = worldPos; 
-        projCorners[i] = project(viewPos);
+        float3 viewPos = worldPos.xyz; 
+        cornerVisible[i] = projectToOrthoSphere(viewPos, projCorners[i]);
+        projCorners[i] *= CIRCLE_RADIUS; // scale to circle radius
     }
 }
 
-int getVisibilityCount(int2 faces, float3 cameraPos)
-{
-    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
-    float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
-    float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
+// int getVisibilityCount(int2 faces, float3 cameraPos)
+// {
+//     float3x3 rotMatrix = (float3x3)pc.modelMatrix;
+//     float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
+//     float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
     
-    float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; 
-    float3 viewVec_f2 = faceCenters[faces.y] - cameraPos;
+//     float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; 
+//     float3 viewVec_f2 = faceCenters[faces.y] - cameraPos;
 
-    // Face is visible if its outward normal points towards the origin (camera).
-    bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f;
-    bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f;
+//     // Face is visible if its outward normal points towards the origin (camera).
+//     bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f;
+//     bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f;
 
-    // Determine Line Style:
-    bool isSilhouette = visible1 != visible2; // One face visible, the other hidden
-    bool isInner = visible1 && visible2;      // Both faces visible
+//     // Determine Line Style:
+//     bool isSilhouette = visible1 != visible2; // One face visible, the other hidden
+//     bool isInner = visible1 && visible2;      // Both faces visible
     
-    int visibilityCount = 0;
-    if (isSilhouette) 
-    {
-        visibilityCount = 1;
-    }
-    else if (isInner)
-    {
-        visibilityCount = 2;
-    }
-
-    return visibilityCount;
-}
-
-void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth)
-{
-    if (visibilityCount > 0)
-    {
-        float3 A = corners[a];
-        float3 B = corners[b];
-
-        float avgDepth = (length(A) + length(B)) * 0.5f;
-        float referenceDepth = 3.0f;
-        float depthScale = referenceDepth / avgDepth;
-
-        float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f;
-        float intensity = (visibilityCount == 1) ? 1.0f : 0.5f;
-        float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red
+//     int visibilityCount = 0;
+//     if (isSilhouette) 
+//     {
+//         visibilityCount = 1;
+//     }
+//     else if (isInner)
+//     {
+//         visibilityCount = 2;
+//     }
+
+//     return visibilityCount;
+// }
+
+// void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth)
+// {
+//     if (visibilityCount > 0)
+//     {
+//         float3 A = corners[a];
+//         float3 B = corners[b];
+
+//         float avgDepth = (length(A) + length(B)) * 0.5f;
+//         float referenceDepth = 3.0f;
+//         float depthScale = referenceDepth / avgDepth;
+
+//         float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f;
+//         float intensity = (visibilityCount == 1) ? 1.0f : 0.5f;
+//         float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red
         
-        float width = min(baseWidth * depthScale, 0.03f); 
+//         float width = min(baseWidth * depthScale, 0.03f); 
         
-        float dist = sdSegment(p, projCorners[a], projCorners[b]);
+//         float dist = sdSegment(p, projCorners[a], projCorners[b]);
         
-        float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
+//         float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
         
-        color += edgeColor * alpha * intensity;
-    }
-}
+//         color += edgeColor * alpha * intensity;
+//     }
+// }
 
 void drawRing(float2 p, inout float4 color, float aaWidth)
 {
@@ -149,6 +165,12 @@ void drawRing(float2 p, inout float4 color, float aaWidth)
     color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); 
 }
 
+float plotPoint(float2 uv, float2 p, float r)
+{
+    return step(length(uv - p), r);
+}
+
+
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
 {
     float3 cameraPos = float3(0, 0, 0); // Camera at origin
@@ -159,16 +181,25 @@ void drawRing(float2 p, inout float4 color, float aaWidth)
     
     float aaWidth = max(fwidth(p.x), fwidth(p.y)); 
 
-    for (int j = 0; j < 12; j++)
+    float pointMask = 0.0;
+    for (int i=0; i<8; i++)
     {
-        int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
-        int b = a + (4 >> (j / 4));
-
-        int2 faces = edgeToFaces[j];
-        int visibilityCount = getVisibilityCount(faces, cameraPos);
-        drawLine(p, a, b, visibilityCount, color, aaWidth);
+        if (cornerVisible[i])
+            pointMask += plotPoint(p, projCorners[i], 0.015f);
     }
 
+    color += pointMask * float4(1,0,0,1); // red points
+
+    // for (int j = 0; j < 12; j++)
+    // {
+    //     int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
+    //     int b = a + (4 >> (j / 4));
+
+    //     // int2 faces = edgeToFaces[j];
+    //     // int visibilityCount = getVisibilityCount(faces, cameraPos);
+    //     // drawLine(p, a, b, visibilityCount, color, aaWidth);
+    // }
+
     drawRing(p, color, aaWidth);
 
     return color;
diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
index 002a9d215..5061ebd49 100644
--- a/72_SolidAngleVisualizer/include/transform.hpp
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -19,7 +19,7 @@ struct TransformRequestParams
 
 struct TransformReturnInfo
 {
-	nbl::hlsl::uint16_t2 sceneResolution = { 2048,1024 };
+	nbl::hlsl::uint16_t2 sceneResolution = { 0, 0 };
 	bool isGizmoWindowHovered;
 	bool isGizmoBeingUsed;
 };
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index b6d723e70..1025eb067 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -5,7 +5,6 @@
 
 #include "common.hpp"
 #include "app_resources/hlsl/common.hlsl"
-
 #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h"
 
 /*
@@ -319,10 +318,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		// CPU events
 		update(nextPresentationTimestamp);
 
-		const auto& virtualWindowRes = interface.transformReturnInfo.sceneResolution;
-		// TODO: check main frame buffer too
-		if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualWindowRes[1])
-			recreateFramebuffer(virtualWindowRes);
+		{
+			const auto& virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution;
+			const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution;
+			if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] ||
+				!m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1])
+				recreateFramebuffer();
+		}
 
 		//
 		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
@@ -334,6 +336,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
 		if (m_solidAngleViewFramebuffer)
 		{
+			auto creationParams = m_solidAngleViewFramebuffer->getCreationParameters();
 			cb->beginDebugMarker("Draw Circle View Frame");
 			{
 				const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f };
@@ -344,7 +347,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					.depthStencilClearValues = &farValue,
 					.renderArea = {
 						.offset = {0,0},
-						.extent = {virtualWindowRes[0],virtualWindowRes[1]}
+						.extent = {creationParams.width, creationParams.height}
 					}
 				};
 				beginRenderpass(cb, renderpassInfo);
@@ -353,7 +356,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			{
 				PushConstants pc{
 					.modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)),
-					.viewport = { 0.f,0.f,static_cast<float>(virtualWindowRes[0]),static_cast<float>(virtualWindowRes[1]) }
+					.viewport = { 0.f,0.f,static_cast<float>(creationParams.width),static_cast<float>(creationParams.height) }
 				};
 				auto pipeline = m_visualizationPipeline;
 				cb->bindGraphicsPipeline(pipeline.get());
@@ -369,6 +372,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		{
 			cb->beginDebugMarker("Main Scene Frame");
 			{
+				auto creationParams = m_mainViewFramebuffer->getCreationParameters();
 				const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f };
 				const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo =
 				{
@@ -377,7 +381,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					.depthStencilClearValues = &farValue,
 					.renderArea = {
 						.offset = {0,0},
-						.extent = {virtualWindowRes[0],virtualWindowRes[1]}
+						.extent = {creationParams.width, creationParams.height}
 					}
 				};
 				beginRenderpass(cb, renderpassInfo);
@@ -404,12 +408,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 				// TODO: a better way to get identity matrix
 				float32_t3x4 origin = {
-					0.2f,0.0f,0.0f,0.0f,
-					0.0f,0.2f,0.0f,0.0f,
-					0.0f,0.0f,0.2f,0.0f
+					1.0f,0.0f,0.0f,0.0f,
+					0.0f,1.0f,0.0f,0.0f,
+					0.0f,0.0f,1.0f,0.0f
 				};
 				memcpy(&instance.world, &origin, sizeof(instance.world));
-				instance.packedGeo = m_renderer->getGeometries().data() + 3; // sphere
+				instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk
 				m_renderer->render(cb, viewParams);
 			}
 			cb->endRenderPass();
@@ -575,7 +579,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			);
 			keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void
 				{
-					//if (interface.move)
+					if (interface.move)
 						camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl
 
 					for (const auto& e : events) // here capture
@@ -606,9 +610,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		interface.imGUI->update(params);
 	}
 
-	void recreateFramebuffer(const uint16_t2 resolution)
+	void recreateFramebuffer()
 	{
-		auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr<IGPUImageView>
+
+		auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format)->smart_refctd_ptr<IGPUImageView>
 			{
 				auto image = m_device->createImage({ {
 					.type = IGPUImage::ET_2D,
@@ -632,29 +637,32 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 		smart_refctd_ptr<IGPUImageView> solidAngleView;
 		smart_refctd_ptr<IGPUImageView> mainView;
+		const uint16_t2 solidAngleViewRes = interface.solidAngleViewTransformReturnInfo.sceneResolution;
+		const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution;
+
 		// detect window minimization
-		if (resolution.x < 0x4000 && resolution.y < 0x4000)
+		if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 ||
+			mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000)
 		{
-			solidAngleView = createImageAndView(finalSceneRenderFormat);
-			auto solidAngleDepthView = createImageAndView(sceneRenderDepthFormat);
+			solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat);
+			auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat);
 			m_solidAngleViewFramebuffer = m_device->createFramebuffer({ {
 				.renderpass = m_solidAngleRenderpass,
 				.depthStencilAttachments = &solidAngleDepthView.get(),
 				.colorAttachments = &solidAngleView.get(),
-				.width = resolution.x,
-				.height = resolution.y
+				.width = solidAngleViewRes.x,
+				.height = solidAngleViewRes.y
 			} });
 
-			mainView = createImageAndView(finalSceneRenderFormat);
-			auto mainDepthView = createImageAndView(sceneRenderDepthFormat);
+			mainView = createImageAndView(mainViewRes, finalSceneRenderFormat);
+			auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat);
 			m_mainViewFramebuffer = m_device->createFramebuffer({ {
 					.renderpass = m_mainRenderpass,
 					.depthStencilAttachments = &mainDepthView.get(),
 					.colorAttachments = &mainView.get(),
-					.width = resolution.x,
-					.height = resolution.y
+					.width = mainViewRes.x,
+					.height = mainViewRes.y
 				} });
-
 		}
 		else
 		{
@@ -715,6 +723,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 	// we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes
 	constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight;
 
+	constexpr static inline float32_t4x4 OBBModelMatrixDefault
+	{
+		1.0f, 0.0f, 0.0f, 0.0f,
+		0.0f, 1.0f, 0.0f, 0.0f,
+		0.0f, 0.0f, 1.0f, 0.0f,
+		0.0f, 0.0f, 6.0f, 1.0f
+	};
 	//
 	smart_refctd_ptr<CGeometryCreatorScene> m_scene;
 	smart_refctd_ptr<IGPURenderpass> m_solidAngleRenderpass;
@@ -722,7 +737,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 	smart_refctd_ptr<CSimpleDebugRenderer> m_renderer;
 	smart_refctd_ptr<IGPUFramebuffer> m_solidAngleViewFramebuffer;
 	smart_refctd_ptr<IGPUFramebuffer> m_mainViewFramebuffer;
-	smart_refctd_ptr<video::IGPUGraphicsPipeline> m_visualizationPipeline;
+	smart_refctd_ptr<IGPUGraphicsPipeline> m_visualizationPipeline;
 	//
 	smart_refctd_ptr<ISemaphore> m_semaphore;
 	uint64_t m_realFrameIx = 0;
@@ -733,19 +748,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 	// UI stuff
 	struct CInterface
 	{
-		void cameraToHome()
-		{
-			core::vectorSIMDf cameraPosition(-3.0f, 3.0f, 6.0f);
-			core::vectorSIMDf cameraTarget(0.f, 0.f, 6.f);
-			const static core::vectorSIMDf up(0.f, 1.f, 0.f);
-
-			camera.setPosition(cameraPosition);
-			camera.setTarget(cameraTarget);
-			camera.setBackupUpVector(up);
-
-			camera.recomputeViewMatrix();
-		}
-
 		void operator()()
 		{
 			ImGuiIO& io = ImGui::GetIO();
@@ -773,7 +775,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					return projection;
 				}());
 
-			ImGuizmo::SetOrthographic(false);
+			ImGuizmo::SetOrthographic(!isPerspective);
 			ImGuizmo::BeginFrame();
 
 			ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing);
@@ -830,7 +832,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 			if (viewDirty || firstFrame)
 			{
-				cameraToHome();
+				camera.setPosition(cameraIntialPosition);
+				camera.setTarget(cameraInitialTarget);
+				camera.setBackupUpVector(cameraInitialUp);
+				camera.setUpVector(cameraInitialUp);
+
+				camera.recomputeViewMatrix();
 			}
 			firstFrame = false;
 
@@ -895,19 +902,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			* note it also modifies input view matrix but projection matrix is immutable
 			*/
 
-			if (ImGui::IsKeyPressed(ImGuiKey_Home))
-			{
-				cameraToHome();
-			}
+			// No need because camera already has this functionality
+			// if (ImGui::IsKeyPressed(ImGuiKey_Home))
+			// {
+			// 	cameraToHome();
+			// }
 
 			if (ImGui::IsKeyPressed(ImGuiKey_End))
 			{
-				m_OBBModelMatrix = {
-					1.0f, 0.0f, 0.0f, 0.0f,
-					0.0f, 1.0f, 0.0f, 0.0f,
-					0.0f, 0.0f, 1.0f, 0.0f,
-					0.0f, 0.0f, 12.0f, 1.0f
-				};
+				m_OBBModelMatrix = OBBModelMatrixDefault;
 			}
 
 			static struct
@@ -930,10 +933,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/	
 
 				transformParams.editTransformDecomposition = true;
-				transformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams);
+				mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams);
+				// MODEL: Zup -> Yup
+
+				m_OBBModelMatrix = imguizmoM16InOut.model;
 
 				// TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used
-				move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || transformReturnInfo.isGizmoWindowHovered) && (!transformReturnInfo.isGizmoBeingUsed);
+				move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed);
+
 			}
 
 			// to Nabla + update camera & model matrices
@@ -957,9 +964,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing);
 				ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing);
 				static bool isOpen = true;
-				ImGui::Begin("Solid angle view", &isOpen, 0);
+				ImGui::Begin("Projected Solid Angle View", &isOpen, 0);
 
 				ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
+				solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast<uint16_t>(contentRegionSize.x), static_cast<uint16_t>(contentRegionSize.y));
+				solidAngleViewTransformReturnInfo.isGizmoBeingUsed = false; // not used in this view
+				solidAngleViewTransformReturnInfo.isGizmoWindowHovered = false; // not used in this view
 				ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize);
 				ImGui::End();
 			}
@@ -1081,21 +1091,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		//
 		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
 		// mutables
-		float32_t4x4 m_OBBModelMatrix{
-			1.0f, 0.0f, 0.0f, 0.0f,
-			0.0f, 1.0f, 0.0f, 0.0f,
-			0.0f, 0.0f, 1.0f, 0.0f,
-			0.0f, 0.0f, 12.0f, 1.0f
-		};
+		float32_t4x4 m_OBBModelMatrix = OBBModelMatrixDefault;
 
 		//std::string_view objectName;
 		TransformRequestParams transformParams;
-		TransformReturnInfo transformReturnInfo;
+		TransformReturnInfo mainViewTransformReturnInfo;
+		TransformReturnInfo solidAngleViewTransformReturnInfo;
+
+		const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f };
+		const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f };
+		const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f };
 
 		float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f;
 		float viewWidth = 10.f;
-		float camYAngle = 90.f / 180.f * 3.14159f;
-		float camXAngle = 0.f / 180.f * 3.14159f;
 		//uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed
 		bool isPerspective = true, isLH = true, flipGizmoY = true, move = true;
 		bool firstFrame = true;
diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp
index 3b3cd38d8..f35cd341a 100644
--- a/common/include/nbl/examples/cameras/CCamera.hpp
+++ b/common/include/nbl/examples/cameras/CCamera.hpp
@@ -149,38 +149,36 @@ class Camera
 			if(ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && mouseDown) 
 			{
 				nbl::core::vectorSIMDf pos = getPosition();
-				nbl::core::vectorSIMDf localTarget = getTarget() - pos;
-
-				// Get Relative Rotation for localTarget in Radians
-				float relativeRotationX, relativeRotationY;
-				relativeRotationY = atan2(localTarget.X, localTarget.Z);
-				const double z1 = nbl::core::sqrt(localTarget.X*localTarget.X + localTarget.Z*localTarget.Z);
-				relativeRotationX = atan2(z1, localTarget.Y) - nbl::core::PI<float>()/2;
-				
-				constexpr float RotateSpeedScale = 0.003f; 
-				relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f;
-				float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f;
+				nbl::core::vectorSIMDf upVector = getUpVector();
+				nbl::core::vectorSIMDf forward = nbl::core::normalize(getTarget() - pos);
+
+				nbl::core::vectorSIMDf right = nbl::core::normalize(nbl::core::cross(forward, upVector));
+				nbl::core::vectorSIMDf up = nbl::core::normalize(nbl::core::cross(right, forward));
+
+				constexpr float RotateSpeedScale = 0.003f;
+				float pitchDelta = ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f;
+				float yawDelta = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f;
 
 				if (leftHanded)
-					relativeRotationY -= tmpYRot;
-				else
-					relativeRotationY += tmpYRot;
+					yawDelta = -yawDelta;
 
-				const double MaxVerticalAngle = nbl::core::radians<float>(88.0f);
+				// Clamp pitch BEFORE applying rotation
+				const float MaxVerticalAngle = nbl::core::radians<float>(88.0f);
+				float currentPitch = asin(nbl::core::dot(forward, upVector).X);
+				float newPitch = nbl::core::clamp(currentPitch + pitchDelta, -MaxVerticalAngle, MaxVerticalAngle);
+				pitchDelta = newPitch - currentPitch;
 
-				if (relativeRotationX > MaxVerticalAngle*2 && relativeRotationX < 2 * nbl::core::PI<float>()-MaxVerticalAngle)
-					relativeRotationX = 2 * nbl::core::PI<float>()-MaxVerticalAngle;
-				else
-					if (relativeRotationX > MaxVerticalAngle && relativeRotationX < 2 * nbl::core::PI<float>()-MaxVerticalAngle)
-						relativeRotationX = MaxVerticalAngle;
+				// Create rotation quaternions using axis-angle method
+				nbl::core::quaternion pitchRot = nbl::core::quaternion::fromAngleAxis(pitchDelta, right);
+				nbl::core::quaternion yawRot = nbl::core::quaternion::fromAngleAxis(yawDelta, upVector); 
+				nbl::core::quaternion combinedRot = yawRot * pitchRot;
 
-				localTarget.set(0,0, nbl::core::max(1.f, nbl::core::length(pos)[0]), 1.f);
+				// Apply to forward vector
+				forward = nbl::core::normalize(combinedRot.transformVect(forward));
 
-				nbl::core::matrix3x4SIMD mat;
-				mat.setRotation(nbl::core::quaternion(relativeRotationX, relativeRotationY, 0));
-				mat.transformVect(localTarget);
-				
-				setTarget(localTarget + pos);
+				// Set new target
+				float targetDistance = nbl::core::length(getTarget() - pos).X;
+				setTarget(pos + forward * targetDistance);
 			}
 		}
 	}

From adb15edd201e82cbc9ed3526bbfccfc67ccdf4ff Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sun, 7 Dec 2025 00:12:56 +0300
Subject: [PATCH 31/57] sphere arc "cube edge" in solid angle view, more
 reliable resizing of windows

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 218 ++++++++----------
 72_SolidAngleVisualizer/main.cpp              |  24 +-
 2 files changed, 107 insertions(+), 135 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index 2ad766c8a..badf1e4be 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -32,8 +32,7 @@ static const float3 localNormals[6] = {
 static float3 corners[8];
 static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), 
                             float3(0,0,0), float3(0,0,0), float3(0,0,0) };
-static float2 projCorners[8];
-static bool   cornerVisible[8];
+
 
 
 // Converts UV into centered, aspect-corrected NDC circle space
@@ -46,32 +45,7 @@ float2 toCircleSpace(float2 uv)
     float aspect = pc.viewport.z / pc.viewport.w; // width / height
     p.x *= aspect;
 
-    return p;
-}
-
-
-// Distance to a 2D line segment
-float sdSegment(float2 p, float2 a, float2 b)
-{
-    float2 pa = p - a;
-    float2 ba = b - a;
-    float h = clamp(dot(pa, ba) / dot(ba, ba), 0.0f, 1.0f);
-    return length(pa - ba * h);
-}
-
-// TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection)
-bool projectToOrthoSphere(float3 p, out float2 uv)
-{
-    float3 n = normalize(p);   // direction to sphere
-
-    // hemisphere (Z > 0)
-    if (n.z <= 0.0)
-        return false;
-
-    // orthographic projection (drop Z)
-    uv = n.xy;
-
-    return true; // valid
+    return p * CIRCLE_RADIUS;
 }
 
 void computeCubeGeo()
@@ -86,121 +60,121 @@ void computeCubeGeo()
         faceCenters[i/4]      += worldPos / 4.0f; 
         faceCenters[2+i%2]    += worldPos / 4.0f; 
         faceCenters[4+(i/2)%2] += worldPos / 4.0f; 
-
-        float3 viewPos = worldPos.xyz; 
-        cornerVisible[i] = projectToOrthoSphere(viewPos, projCorners[i]);
-        projCorners[i] *= CIRCLE_RADIUS; // scale to circle radius
     }
 }
 
-// int getVisibilityCount(int2 faces, float3 cameraPos)
-// {
-//     float3x3 rotMatrix = (float3x3)pc.modelMatrix;
-//     float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
-//     float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
-    
-//     float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; 
-//     float3 viewVec_f2 = faceCenters[faces.y] - cameraPos;
-
-//     // Face is visible if its outward normal points towards the origin (camera).
-//     bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f;
-//     bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f;
-
-//     // Determine Line Style:
-//     bool isSilhouette = visible1 != visible2; // One face visible, the other hidden
-//     bool isInner = visible1 && visible2;      // Both faces visible
-    
-//     int visibilityCount = 0;
-//     if (isSilhouette) 
-//     {
-//         visibilityCount = 1;
-//     }
-//     else if (isInner)
-//     {
-//         visibilityCount = 2;
-//     }
-
-//     return visibilityCount;
-// }
-
-// void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth)
-// {
-//     if (visibilityCount > 0)
-//     {
-//         float3 A = corners[a];
-//         float3 B = corners[b];
-
-//         float avgDepth = (length(A) + length(B)) * 0.5f;
-//         float referenceDepth = 3.0f;
-//         float depthScale = referenceDepth / avgDepth;
-
-//         float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f;
-//         float intensity = (visibilityCount == 1) ? 1.0f : 0.5f;
-//         float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red
-        
-//         float width = min(baseWidth * depthScale, 0.03f); 
-        
-//         float dist = sdSegment(p, projCorners[a], projCorners[b]);
-        
-//         float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
-        
-//         color += edgeColor * alpha * intensity;
-//     }
-// }
-
-void drawRing(float2 p, inout float4 color, float aaWidth)
+float4 drawRing(float2 p, float aaWidth)
 {
     float positionLength = length(p);
-
-    // Mask to cut off drawing outside the circle
-    // float circleMask = 1.0f - smoothstep(CIRCLE_RADIUS, CIRCLE_RADIUS + aaWidth, positionLength);
-    // color *= circleMask;
     
     // Add a white background circle ring
-    float ringWidth = 0.005f;
+    float ringWidth = 0.01f;
     float ringDistance = abs(positionLength - CIRCLE_RADIUS);
     float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
     
-    // Ring color is now white
-    color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); 
+    return ringAlpha.xxxx; 
 }
 
-float plotPoint(float2 uv, float2 p, float r)
+// Check if a face on the hemisphere is visible from camera at origin
+bool isFaceVisible(float3 faceCenter, float3 faceNormal)
 {
-    return step(length(uv - p), r);
+    // Face is visible if normal points toward camera (at origin)
+    float3 viewVec = -normalize(faceCenter); // Vector from face to camera
+    return dot(faceNormal, viewVec) > 0.0f;
 }
 
+int getEdgeVisibility(int edgeIdx, float3 cameraPos)
+{
+    int2 faces = edgeToFaces[edgeIdx];
+    
+    // Transform normals to world space
+    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
+    float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
+    float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
+    
+    bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1);
+    bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2);
+    
+    // Silhouette: exactly one face visible
+    if (visible1 != visible2) return 1;
+    
+    // Inner edge: both faces visible
+    if (visible1 && visible2) return 2;
+    
+    // Hidden edge: both faces hidden
+    return 0;
+}
+
+// Draw great circle arc in fragment shader
+float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth)
+{
+    if (visibility == 0) return float4(0,0,0,0); // Hidden edge
+    
+    float3 v0 = normalize(corners[edgeVerts.x]);
+    float3 v1 = normalize(corners[edgeVerts.y]);
+    float3 p = normalize(fragPos); // Current point on hemisphere
+    
+    // Great circle plane normal
+    float3 arcNormal = normalize(cross(v0, v1));
+    
+    // Distance to great circle
+    float dist = abs(dot(p, arcNormal));
+    
+    // Check if point is within arc bounds
+    float dotMid = dot(v0, v1);
+    bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid);
+    
+    if (!onArc) return float4(0,0,0,0);
+    
+    // Depth-based width scaling
+    float avgDepth = (length(corners[edgeVerts.x]) + length(corners[edgeVerts.y])) * 0.5f;
+    float depthScale = 3.0f / avgDepth;
+    
+    float baseWidth = (visibility == 1) ? 0.01f : 0.005f;
+    float width = min(baseWidth * depthScale, 0.02f);
+    
+    float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
+    
+    float4 edgeColor = (visibility == 1) ? 
+        float4(0.0f, 0.5f, 1.0f, 1.0f) :  // Silhouette: blue
+        float4(1.0f, 0.0f, 0.0f, 1.0f);   // Inner: red
+    
+    float intensity = (visibility == 1) ? 1.0f : 0.5f;
+    return edgeColor * alpha * intensity;
+}
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
 {
-    float3 cameraPos = float3(0, 0, 0); // Camera at origin
-    float2 p = toCircleSpace(vx.uv);
+    float3 cameraPos = float3(0, 0, 0);
     float4 color = float4(0, 0, 0, 0);
-
-    computeCubeGeo();
+    float2 p = toCircleSpace(vx.uv);
     
-    float aaWidth = max(fwidth(p.x), fwidth(p.y)); 
-
-    float pointMask = 0.0;
-    for (int i=0; i<8; i++)
+    // Convert 2D disk position to 3D hemisphere position
+    // p is in range [-CIRCLE_RADIUS, CIRCLE_RADIUS]
+    float2 normalized = p / CIRCLE_RADIUS; // Now in range [-1, 1]
+    float r2 = dot(normalized, normalized);
+    
+    if (r2 > 1.0f)
+        discard;
+    
+    // Convert UV to 3D position on hemisphere
+    float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
+    
+    computeCubeGeo(); // Your existing function
+    
+    float aaWidth = length(float2(ddx(p.x), ddy(p.y))); 
+    
+    // Draw edges as great circle arcs
+    for (int j = 0; j < 12; j++) 
     {
-        if (cornerVisible[i])
-            pointMask += plotPoint(p, projCorners[i], 0.015f);
+        int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
+        int b = a + (4 >> (j / 4));
+        
+        int visibility = getEdgeVisibility(j, cameraPos);
+        color += drawGreatCircleArc(spherePos, int2(a, b), visibility, aaWidth);
     }
-
-    color += pointMask * float4(1,0,0,1); // red points
-
-    // for (int j = 0; j < 12; j++)
-    // {
-    //     int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
-    //     int b = a + (4 >> (j / 4));
-
-    //     // int2 faces = edgeToFaces[j];
-    //     // int visibilityCount = getVisibilityCount(faces, cameraPos);
-    //     // drawLine(p, a, b, visibilityCount, color, aaWidth);
-    // }
-
-    drawRing(p, color, aaWidth);
-
+    
+    color += drawRing(p, aaWidth);
+    
     return color;
 }
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index 1025eb067..8fb8bf144 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -323,7 +323,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution;
 			if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] ||
 				!m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1])
-				recreateFramebuffer();
+				recreateFramebuffers();
 		}
 
 		//
@@ -402,10 +402,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				auto& instance = m_renderer->m_instances[0];
 				auto transposed = hlsl::transpose(interface.m_OBBModelMatrix);
 				memcpy(&instance.world, &transposed, sizeof(instance.world));
-				instance.packedGeo = m_renderer->getGeometries().data();// +interface.gcIndex;
+				instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex;
 				m_renderer->render(cb, viewParams); // draw the cube/OBB
 
-
 				// TODO: a better way to get identity matrix
 				float32_t3x4 origin = {
 					1.0f,0.0f,0.0f,0.0f,
@@ -536,7 +535,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		camera.setMoveSpeed(interface.moveSpeed);
 		camera.setRotateSpeed(interface.rotateSpeed);
 
-
 		m_inputSystem->getDefaultMouse(&mouse);
 		m_inputSystem->getDefaultKeyboard(&keyboard);
 
@@ -610,7 +608,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		interface.imGUI->update(params);
 	}
 
-	void recreateFramebuffer()
+	void recreateFramebuffers()
 	{
 
 		auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format)->smart_refctd_ptr<IGPUImageView>
@@ -671,30 +669,30 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		}
 
 		// release previous slot and its image
-		interface.subAllocDS->multi_deallocate(0, static_cast<int>(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx });
+		interface.subAllocDS->multi_deallocate(0, static_cast<int>(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1 });
 		//
-		if (solidAngleView)
+		if (solidAngleView && mainView)
 		{
 			interface.subAllocDS->multi_allocate(0, static_cast<int>(CInterface::Count), interface.renderColorViewDescIndices);
 			// update descriptor set
 			IGPUDescriptorSet::SDescriptorInfo infos[static_cast<int>(CInterface::Count)] = {};
-			infos[0].desc = solidAngleView;
+			infos[0].desc = mainView;
 			infos[0].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
-			infos[1].desc = mainView;
+			infos[1].desc = solidAngleView;
 			infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL;
 			const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast<int>(CInterface::Count)] = {
 				{.dstSet = interface.subAllocDS->getDescriptorSet(),
 				.binding = TexturesImGUIBindingIndex,
-				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_SOLID_ANGLE_VIEW)],
+				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_MAIN_VIEW)],
 				.count = 1,
 				.info = &infos[static_cast<int>(CInterface::ERV_MAIN_VIEW)]
 				},
 				{
 				.dstSet = interface.subAllocDS->getDescriptorSet(),
 				.binding = TexturesImGUIBindingIndex,
-				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_MAIN_VIEW)],
+				.arrayElement = interface.renderColorViewDescIndices[static_cast<int>(CInterface::ERV_SOLID_ANGLE_VIEW)],
 				.count = 1,
-				.info = &infos[1]
+				.info = &infos[static_cast<int>(CInterface::ERV_SOLID_ANGLE_VIEW)]
 				}
 			};
 			m_device->updateDescriptorSets({ write, static_cast<int>(CInterface::Count) }, {});
@@ -728,7 +726,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		1.0f, 0.0f, 0.0f, 0.0f,
 		0.0f, 1.0f, 0.0f, 0.0f,
 		0.0f, 0.0f, 1.0f, 0.0f,
-		0.0f, 0.0f, 6.0f, 1.0f
+		0.0f, 0.0f, 3.0f, 1.0f
 	};
 	//
 	smart_refctd_ptr<CGeometryCreatorScene> m_scene;

From 008e2ee154b6cf5ba725752a3f1b4dac5d37ff42 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sun, 7 Dec 2025 00:29:22 +0300
Subject: [PATCH 32/57] Scaling by pressing G to prevent conflict with WASD
 camera movement, also added Q and E for moving up and down

---
 72_SolidAngleVisualizer/include/transform.hpp   | 4 +++-
 common/include/nbl/examples/cameras/CCamera.hpp | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
index 5061ebd49..639c0fa3a 100644
--- a/72_SolidAngleVisualizer/include/transform.hpp
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -35,13 +35,15 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 	static bool boundSizing = false;
 	static bool boundSizingSnap = false;
 
+	ImGui::Text("Press T/R/G to change gizmo mode");
+
 	if (params.editTransformDecomposition)
 	{
 		if (ImGui::IsKeyPressed(ImGuiKey_T))
 			mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
 		if (ImGui::IsKeyPressed(ImGuiKey_R))
 			mCurrentGizmoOperation = ImGuizmo::ROTATE;
-		if (ImGui::IsKeyPressed(ImGuiKey_S))
+		if (ImGui::IsKeyPressed(ImGuiKey_G))
 			mCurrentGizmoOperation = ImGuizmo::SCALE;
 		if (ImGui::RadioButton("Translate", mCurrentGizmoOperation == ImGuizmo::TRANSLATE))
 			mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp
index f35cd341a..e5f077e46 100644
--- a/common/include/nbl/examples/cameras/CCamera.hpp
+++ b/common/include/nbl/examples/cameras/CCamera.hpp
@@ -39,6 +39,8 @@ class Camera
 	enum E_CAMERA_MOVE_KEYS : uint8_t
 	{
 		ECMK_MOVE_FORWARD = 0,
+		ECMK_MOVE_UP,
+		ECMK_MOVE_DOWN,
 		ECMK_MOVE_BACKWARD,
 		ECMK_MOVE_LEFT,
 		ECMK_MOVE_RIGHT,
@@ -47,6 +49,8 @@ class Camera
 
 	inline void mapKeysToWASD()
 	{
+		keysMap[ECMK_MOVE_UP] = nbl::ui::EKC_E;
+		keysMap[ECMK_MOVE_DOWN] = nbl::ui::EKC_Q;
 		keysMap[ECMK_MOVE_FORWARD] = nbl::ui::EKC_W;
 		keysMap[ECMK_MOVE_BACKWARD] = nbl::ui::EKC_S;
 		keysMap[ECMK_MOVE_LEFT] = nbl::ui::EKC_A;
@@ -211,7 +215,7 @@ class Camera
 			assert(timeDiff >= 0);
 
 			// handle camera movement
-			for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT })
+			for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_UP, ECMK_MOVE_DOWN, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT })
 			{
 				const auto code = keysMap[logicalKey];
 
@@ -275,6 +279,9 @@ class Camera
 				up = nbl::core::normalize(backupUpVector);
 			}
 
+			pos += up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_UP] * moveSpeed * MoveSpeedScale;
+			pos -= up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_DOWN] * moveSpeed * MoveSpeedScale;
+
 			nbl::core::vectorSIMDf strafevect = localTarget;
 			if (leftHanded)
 				strafevect = nbl::core::cross(strafevect, up);

From 4290f4ab26360fbf8dac4c45c395fc4a20faf6e3 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sun, 7 Dec 2025 16:33:09 +0300
Subject: [PATCH 33/57] better clipping of arcs behind the hemisphere

---
 .../app_resources/hlsl/SolidAngleVis.frag.hlsl                | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index badf1e4be..c12c007a0 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -114,6 +114,10 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
     float3 v1 = normalize(corners[edgeVerts.y]);
     float3 p = normalize(fragPos); // Current point on hemisphere
     
+    // Skip fragment if not in front of hemisphere or edge if both endpoints are behind horizon
+    if (p.z < 0.0f || (v0.z < 0.0f && v1.z < 0.0f)) 
+        return float4(0,0,0,0);
+    
     // Great circle plane normal
     float3 arcNormal = normalize(cross(v0, v1));
     

From ba068c44c08a777bb6794b3e0f019cbdc3605480 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Mon, 8 Dec 2025 08:47:02 +0300
Subject: [PATCH 34/57] WIP quick push for shader code

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 154 +++++++++++++++---
 1 file changed, 135 insertions(+), 19 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index c12c007a0..7c96a8316 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -20,6 +20,25 @@ static const int2 edgeToFaces[12] = {
     {0,4}, {5,0}, {4,1}, {1,5} 
 };
 
+//float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f
+static const float3 constCorners[8] = {
+    float3(-1, -1, -1), // 0
+    float3( 1, -1, -1), // 1
+    float3(-1,  1, -1), // 2
+    float3( 1,  1, -1), // 3
+    float3(-1, -1,  1), // 4
+    float3( 1, -1,  1), // 5
+    float3(-1,  1,  1), // 6
+    float3( 1,  1,  1)  // 7
+};
+
+// All 12 edges of the cube (vertex index pairs)
+static const int2 allEdges[12] = {
+    {0, 1}, {2, 3}, {4, 5}, {6, 7}, // Edges along X axis
+    {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Edges along Y axis
+    {0, 4}, {1, 5}, {2, 6}, {3, 7}  // Edges along Z axis
+};
+
 static const float3 localNormals[6] = {
     float3(0, 0, -1), // Face 0 (Z-)
     float3(0, 0, 1),  // Face 1 (Z+)
@@ -34,6 +53,30 @@ static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0),
                             float3(0,0,0), float3(0,0,0), float3(0,0,0) };
 
 
+static const float3 colorLUT[8] = {
+    float3(0, 0, 0),        // 0: Black
+    float3(1, 0, 0),       // 1: Red
+    float3(0, 1, 0),       // 2: Green
+    float3(1, 1, 0),       // 3: Yellow
+    float3(0, 0, 1),       // 4: Blue
+    float3(1, 0, 1),       // 5: Magenta
+    float3(0, 1, 1),       // 6: Cyan
+    float3(1, 1, 1)        // 7: White
+};
+
+
+    
+// Vertices are ordered CCW relative to the camera view.
+static const int silhouettes[8][6] = {
+    {2, 3, 1, 5, 4, 6}, // 0: Black
+    {6, 7, 5, 1, 0, 2}, // 1: Red
+    {7, 6, 4, 0, 1, 3}, // 2: Green
+    {3, 7, 5, 4, 0, 2}, // 3: Yellow
+    {3, 2, 0, 4, 5, 7}, // 4: Cyan
+    {1, 3, 7, 6, 4, 0}, // 5: Magenta
+    {0, 1, 5, 7, 6, 2}, // 6: White
+    {4, 6, 2, 3, 1, 5}  // 7: Gray
+};
 
 // Converts UV into centered, aspect-corrected NDC circle space
 float2 toCircleSpace(float2 uv)
@@ -52,7 +95,7 @@ void computeCubeGeo()
 {
     for (int i = 0; i < 8; i++)
     {
-        float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f;
+        float3 localPos = constCorners[i]; //float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f;
         float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
         
         corners[i] = worldPos.xyz;
@@ -72,7 +115,7 @@ float4 drawRing(float2 p, float aaWidth)
     float ringDistance = abs(positionLength - CIRCLE_RADIUS);
     float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
     
-    return ringAlpha.xxxx; 
+    return ringAlpha * float4(1, 1, 1, 1); 
 }
 
 // Check if a face on the hemisphere is visible from camera at origin
@@ -105,7 +148,7 @@ int getEdgeVisibility(int edgeIdx, float3 cameraPos)
     return 0;
 }
 
-// Draw great circle arc in fragment shader
+// Draw great circle arc in fragment shader with horizon clipping
 float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth)
 {
     if (visibility == 0) return float4(0,0,0,0); // Hidden edge
@@ -114,8 +157,12 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
     float3 v1 = normalize(corners[edgeVerts.y]);
     float3 p = normalize(fragPos); // Current point on hemisphere
     
-    // Skip fragment if not in front of hemisphere or edge if both endpoints are behind horizon
-    if (p.z < 0.0f || (v0.z < 0.0f && v1.z < 0.0f)) 
+    // HORIZON CLIPPING: Current fragment must be on front hemisphere
+    if (p.z < 0.0f) 
+        return float4(0,0,0,0);
+    
+    // HORIZON CLIPPING: Skip edge if both endpoints are behind horizon
+    if (v0.z < 0.0f && v1.z < 0.0f) 
         return float4(0,0,0,0);
     
     // Great circle plane normal
@@ -149,36 +196,105 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
 {
-    float3 cameraPos = float3(0, 0, 0);
     float4 color = float4(0, 0, 0, 0);
     float2 p = toCircleSpace(vx.uv);
     
     // Convert 2D disk position to 3D hemisphere position
-    // p is in range [-CIRCLE_RADIUS, CIRCLE_RADIUS]
-    float2 normalized = p / CIRCLE_RADIUS; // Now in range [-1, 1]
+    float2 normalized = p / CIRCLE_RADIUS;
     float r2 = dot(normalized, normalized);
     
-    if (r2 > 1.0f)
-        discard;
-    
     // Convert UV to 3D position on hemisphere
     float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
     
-    computeCubeGeo(); // Your existing function
+    computeCubeGeo();
+    
+    float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
+    
+    float3 viewDir = obbCenter; 
+    
+    // Is this correct?
+    float dotX = dot(viewDir, float3(pc.modelMatrix[0][0], pc.modelMatrix[1][0], pc.modelMatrix[2][0]));
+    float dotY = dot(viewDir, float3(pc.modelMatrix[0][1], pc.modelMatrix[1][1], pc.modelMatrix[2][1]));
+    float dotZ = dot(viewDir, float3(pc.modelMatrix[0][2], pc.modelMatrix[1][2], pc.modelMatrix[2][2]));
+
+    // Determine octant from ray direction signs
+    int octant = (dotX >= 0 ? 4 : 0) + 
+                 (dotY >= 0 ? 2 : 0) + 
+                 (dotZ >= 0 ? 1 : 0);
+
+    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
+    {
+        return float4(colorLUT[octant], 1.0f);
+    }
+    
+    float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); 
     
-    float aaWidth = length(float2(ddx(p.x), ddy(p.y))); 
+
+    // Draw the 6 silhouette edges
+    for (int i = 0; i < 6; i++) 
+    {
+        int v0Idx = silhouettes[octant][i];
+        int v1Idx = silhouettes[octant][(i + 1) % 6];
+        
+        float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth);
+        color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
+    }
     
-    // Draw edges as great circle arcs
-    for (int j = 0; j < 12; j++) 
+    // Draw the remaining edges (non-silhouette) in a different color
+    float3 hiddenEdgeColor = float3(0.3, 0.3, 0.3); // Gray color for hidden edges
+    
+    for (int i = 0; i < 12; i++)
     {
-        int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0);
-        int b = a + (4 >> (j / 4));
+        int2 edge = allEdges[i];
+        
+        // Check if this edge is already drawn as a silhouette edge
+        bool isSilhouette = false;
+        for (int j = 0; j < 6; j++)
+        {
+            int v0 = silhouettes[octant][j];
+            int v1 = silhouettes[octant][(j + 1) % 6];
+            
+            if ((edge.x == v0 && edge.y == v1) || (edge.x == v1 && edge.y == v0))
+            {
+                isSilhouette = true;
+                break;
+            }
+        }
         
-        int visibility = getEdgeVisibility(j, cameraPos);
-        color += drawGreatCircleArc(spherePos, int2(a, b), visibility, aaWidth);
+        // Only draw if it's not a silhouette edge
+        if (!isSilhouette)
+        {
+            float4 edgeContribution = drawGreatCircleArc(spherePos, edge, 1, aaWidth);
+            color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a);
+        }
+    }
+
+    // Draw corner labels for debugging
+    for (int i = 0; i < 8; i++)
+    {
+        float3 corner = normalize(corners[i]);
+        float2 cornerPos = corner.xy;
+        // Project corner onto 2D circle space
+        
+        // Distance from current fragment to corner
+        float dist = length(spherePos.xy - cornerPos);
+        
+        // Draw a small colored dot at the corner
+        float dotSize = 0.03f;
+        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
+        
+        if (dotAlpha > 0.0f)
+        {
+            float brightness = float(i) / 7.0f;
+            float3 dotColor = colorLUT[i];
+            color += float4(dotColor * dotAlpha, dotAlpha);
+        }
     }
     
     color += drawRing(p, aaWidth);
+
+    // if (r2 > 1.1f)
+    //     color.a = 0.0f; // Outside circle, make transparent
     
     return color;
 }
\ No newline at end of file

From 2e5642ab9614132821624235eda634fb23b4c609 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 8 Dec 2025 20:57:45 +0700
Subject: [PATCH 35/57] Remove example 73 mortons

---
 73_Mortons/CTester.h                     | 401 -----------------------
 73_Mortons/app_resources/testCommon.hlsl | 258 ---------------
 2 files changed, 659 deletions(-)
 delete mode 100644 73_Mortons/CTester.h
 delete mode 100644 73_Mortons/app_resources/testCommon.hlsl

diff --git a/73_Mortons/CTester.h b/73_Mortons/CTester.h
deleted file mode 100644
index b4097dad6..000000000
--- a/73_Mortons/CTester.h
+++ /dev/null
@@ -1,401 +0,0 @@
-#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
-#define _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_
-
-#include <nabla.h>
-#include "app_resources/testCommon.hlsl"
-#include "ITester.h"
-
-using namespace nbl;
-
-class CTester final : public ITester
-{
-public:
-    void performTests()
-    {
-        std::random_device rd;
-        std::mt19937 mt(rd());
-
-        std::uniform_int_distribution<uint16_t> shortDistribution(uint16_t(0), std::numeric_limits<uint16_t>::max());
-        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
-        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
-
-        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
-        for (int i = 0; i < Iterations; ++i)
-        {
-            // Set input thest values that will be used in both CPU and GPU tests
-            InputTestValues testInput;
-            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
-            TestValues expected;
-
-            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
-            testInput.shift = generatedShift;
-            {
-                uint64_t generatedA = longDistribution(mt);
-                uint64_t generatedB = longDistribution(mt);
-
-                testInput.generatedA = generatedA;
-                testInput.generatedB = generatedB;
-
-                expected.emulatedAnd = _static_cast<emulated_uint64_t>(generatedA & generatedB);
-                expected.emulatedOr = _static_cast<emulated_uint64_t>(generatedA | generatedB);
-                expected.emulatedXor = _static_cast<emulated_uint64_t>(generatedA ^ generatedB);
-                expected.emulatedNot = _static_cast<emulated_uint64_t>(~generatedA);
-                expected.emulatedPlus = _static_cast<emulated_uint64_t>(generatedA + generatedB);
-                expected.emulatedMinus = _static_cast<emulated_uint64_t>(generatedA - generatedB);
-                expected.emulatedUnaryMinus = _static_cast<emulated_int64_t>(-generatedA);
-                expected.emulatedLess = uint32_t(generatedA < generatedB);
-                expected.emulatedLessEqual = uint32_t(generatedA <= generatedB);
-                expected.emulatedGreater = uint32_t(generatedA > generatedB);
-                expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB);
-
-                expected.emulatedLeftShifted = _static_cast<emulated_uint64_t>(generatedA << generatedShift);
-                expected.emulatedUnsignedRightShifted = _static_cast<emulated_uint64_t>(generatedA >> generatedShift);
-                expected.emulatedSignedRightShifted = _static_cast<emulated_int64_t>(static_cast<int64_t>(generatedA) >> generatedShift);
-            }
-            {
-                testInput.coordX = longDistribution(mt);
-                testInput.coordY = longDistribution(mt);
-                testInput.coordZ = longDistribution(mt);
-                testInput.coordW = longDistribution(mt);
-
-                uint64_t2 Vec2A = { testInput.coordX, testInput.coordY };
-                uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW };
-
-                uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 );
-                uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 );
-                uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2);
-                uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2);
-                uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2);
-                uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2);
-
-                uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ };
-                uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW };
-
-                uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3);
-                uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3);
-                uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3);
-                uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3);
-                uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3);
-                uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3);
-
-                uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW };
-                uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX };
-
-                uint16_t4 Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4);
-                uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4);
-                uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4);
-                uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4);
-                uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4);
-                uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4);
-
-                // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them
-                // so their highest bits are all 0s or 1s depending on the sign of the number they encode
-
-                int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
-                int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
-                int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
-                int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
-                int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
-                int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
-
-                int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
-                int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
-                int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
-                int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
-                int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
-                int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
-
-                int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
-                int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
-                int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
-                int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
-                int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
-                int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
-
-                // Plus
-                expected.mortonPlus_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall + Vec2BSmall) & static_cast<uint16_t>(smallBitsMask_2));
-                expected.mortonPlus_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium + Vec2BMedium) & static_cast<uint16_t>(mediumBitsMask_2));
-                expected.mortonPlus_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull + Vec2BFull) & static_cast<uint32_t>(fullBitsMask_2));
-                expected.mortonPlus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull + Vec2BFull) & static_cast<uint32_t>(fullBitsMask_2));
-
-                expected.mortonPlus_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall + Vec3BSmall) & static_cast<uint16_t>(smallBitsMask_3));
-                expected.mortonPlus_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium + Vec3BMedium) & static_cast<uint16_t>(mediumBitsMask_3));
-                expected.mortonPlus_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull + Vec3BFull) & static_cast<uint32_t>(fullBitsMask_3));
-                expected.mortonPlus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull + Vec3BFull) & static_cast<uint32_t>(fullBitsMask_3));
-
-                expected.mortonPlus_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall + Vec4BSmall) & static_cast<uint16_t>(smallBitsMask_4));
-                expected.mortonPlus_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium + Vec4BMedium) & static_cast<uint16_t>(mediumBitsMask_4));
-                expected.mortonPlus_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull + Vec4BFull) & static_cast<uint16_t>(fullBitsMask_4));
-                expected.mortonPlus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull + Vec4BFull) & static_cast<uint16_t>(fullBitsMask_4));
-
-                // // Minus
-                // expected.mortonMinus_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall - Vec2BSmall);
-                // expected.mortonMinus_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium - Vec2BMedium);
-                // expected.mortonMinus_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull - Vec2BFull);
-                // expected.mortonMinus_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull - Vec2BFull);
-                //
-                // expected.mortonMinus_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall - Vec3BSmall);
-                // expected.mortonMinus_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium - Vec3BMedium);
-                // expected.mortonMinus_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull - Vec3BFull);
-                // expected.mortonMinus_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull - Vec3BFull);
-                //
-                // expected.mortonMinus_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall - Vec4BSmall);
-                // expected.mortonMinus_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium - Vec4BMedium);
-                // expected.mortonMinus_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull - Vec4BFull);
-                // expected.mortonMinus_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull - Vec4BFull);
-                //
-                // Coordinate-wise equality
-                expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall));
-                expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium));
-                expected.mortonEqual_full_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
-                expected.mortonEqual_emulated_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull));
-
-                expected.mortonEqual_small_3 = uint32_t3(glm::equal(Vec3ASmall, Vec3BSmall));
-                expected.mortonEqual_medium_3 = uint32_t3(glm::equal(Vec3AMedium, Vec3BMedium));
-                expected.mortonEqual_full_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
-                expected.mortonEqual_emulated_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull));
-
-                expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall));
-                expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium));
-                expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull));
-
-                // Coordinate-wise unsigned inequality (just testing with less)
-                expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall));
-                expected.mortonUnsignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2AMedium, Vec2BMedium));
-                expected.mortonUnsignedLess_full_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
-                expected.mortonUnsignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull));
-
-                expected.mortonUnsignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASmall, Vec3BSmall));
-                expected.mortonUnsignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3AMedium, Vec3BMedium));
-                expected.mortonUnsignedLess_full_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
-                expected.mortonUnsignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull));
-
-                expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall));
-                expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium));
-                expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
-
-                // Coordinate-wise signed inequality
-                expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall));
-                expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium));
-                expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
-
-                expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall));
-                expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium));
-                expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
-
-                expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall));
-                expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium));
-                expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
-
-                uint16_t castedShift = uint16_t(generatedShift);
-                // Left-shift
-                expected.mortonLeftShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
-                expected.mortonLeftShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
-                expected.mortonLeftShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-                expected.mortonLeftShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-
-                expected.mortonLeftShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
-                expected.mortonLeftShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
-                expected.mortonLeftShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-                expected.mortonLeftShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-
-                expected.mortonLeftShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
-                expected.mortonLeftShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
-                expected.mortonLeftShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-                expected.mortonLeftShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-
-                // Unsigned right-shift
-                expected.mortonUnsignedRightShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
-                expected.mortonUnsignedRightShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
-                expected.mortonUnsignedRightShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-                expected.mortonUnsignedRightShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2));
-
-                expected.mortonUnsignedRightShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall >> uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
-                expected.mortonUnsignedRightShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
-                expected.mortonUnsignedRightShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-                expected.mortonUnsignedRightShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3));
-
-                expected.mortonUnsignedRightShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
-                expected.mortonUnsignedRightShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
-                expected.mortonUnsignedRightShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-                expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4));
-            
-                // Signed right-shift
-                // expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
-                // expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
-                // expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
-                //
-                // expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
-                // expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
-                // expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
-                //
-                // expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
-                // expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
-                // expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
-            }
-
-            performCpuTests(testInput, expected);
-            performGpuTests(testInput, expected);
-        }
-        m_logger->log("FIRST TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
-    }
-
-private:
-    inline static constexpr int Iterations = 100u;
-
-    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
-    {
-        TestValues cpuTestValues;
-
-        fillTestValues(commonTestInputValues, cpuTestValues);
-        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
-
-    }
-
-    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
-    {
-        TestValues gpuTestValues;
-        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
-        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
-    }
-
-    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
-    {
-        verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
-        verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
-        verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
-        verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType);
-        verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType);
-        verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType);
-        verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType);
-        verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType);
-        verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType);
-        verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType);
-        verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType);
-        verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType);
-        verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
-        verifyTestValue("emulatedUnaryMinus", expectedTestValues.emulatedUnaryMinus, testValues.emulatedUnaryMinus, testType);
-
-        // Morton Plus
-        verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType);
-        verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType);
-        verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType);
-        verifyTestValue("mortonPlus_emulated_2", expectedTestValues.mortonPlus_emulated_2, testValues.mortonPlus_emulated_2, testType);
-
-        verifyTestValue("mortonPlus_small_3", expectedTestValues.mortonPlus_small_3, testValues.mortonPlus_small_3, testType);
-        verifyTestValue("mortonPlus_medium_3", expectedTestValues.mortonPlus_medium_3, testValues.mortonPlus_medium_3, testType);
-        verifyTestValue("mortonPlus_full_3", expectedTestValues.mortonPlus_full_3, testValues.mortonPlus_full_3, testType);
-        verifyTestValue("mortonPlus_emulated_3", expectedTestValues.mortonPlus_emulated_3, testValues.mortonPlus_emulated_3, testType);
-
-        verifyTestValue("mortonPlus_small_4", expectedTestValues.mortonPlus_small_4, testValues.mortonPlus_small_4, testType);
-        verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType);
-        verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType);
-        verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType);
-
-        // Morton Minus
-        verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType);
-        verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType);
-        verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType);
-        verifyTestValue("mortonMinus_emulated_2", expectedTestValues.mortonMinus_emulated_2, testValues.mortonMinus_emulated_2, testType);
-
-        verifyTestValue("mortonMinus_small_3", expectedTestValues.mortonMinus_small_3, testValues.mortonMinus_small_3, testType);
-        verifyTestValue("mortonMinus_medium_3", expectedTestValues.mortonMinus_medium_3, testValues.mortonMinus_medium_3, testType);
-        verifyTestValue("mortonMinus_full_3", expectedTestValues.mortonMinus_full_3, testValues.mortonMinus_full_3, testType);
-        verifyTestValue("mortonMinus_emulated_3", expectedTestValues.mortonMinus_emulated_3, testValues.mortonMinus_emulated_3, testType);
-
-        verifyTestValue("mortonMinus_small_4", expectedTestValues.mortonMinus_small_4, testValues.mortonMinus_small_4, testType);
-        verifyTestValue("mortonMinus_medium_4", expectedTestValues.mortonMinus_medium_4, testValues.mortonMinus_medium_4, testType);
-        verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType);
-        verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType);
-
-        // Morton coordinate-wise equality
-        verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType);
-        verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType);
-        verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType);
-        verifyTestValue("mortonEqual_emulated_2", expectedTestValues.mortonEqual_emulated_2, testValues.mortonEqual_emulated_2, testType);
-
-        verifyTestValue("mortonEqual_small_3", expectedTestValues.mortonEqual_small_3, testValues.mortonEqual_small_3, testType);
-        verifyTestValue("mortonEqual_medium_3", expectedTestValues.mortonEqual_medium_3, testValues.mortonEqual_medium_3, testType);
-        verifyTestValue("mortonEqual_full_3", expectedTestValues.mortonEqual_full_3, testValues.mortonEqual_full_3, testType);
-        verifyTestValue("mortonEqual_emulated_3", expectedTestValues.mortonEqual_emulated_3, testValues.mortonEqual_emulated_3, testType);
-
-        verifyTestValue("mortonEqual_small_4", expectedTestValues.mortonEqual_small_4, testValues.mortonEqual_small_4, testType);
-        verifyTestValue("mortonEqual_medium_4", expectedTestValues.mortonEqual_medium_4, testValues.mortonEqual_medium_4, testType);
-        verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType);
-
-        // Morton coordinate-wise unsigned inequality
-        verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType);
-        verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType);
-        verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType);
-        verifyTestValue("mortonUnsignedLess_emulated_2", expectedTestValues.mortonUnsignedLess_emulated_2, testValues.mortonUnsignedLess_emulated_2, testType);
-
-        verifyTestValue("mortonUnsignedLess_small_3", expectedTestValues.mortonUnsignedLess_small_3, testValues.mortonUnsignedLess_small_3, testType);
-        verifyTestValue("mortonUnsignedLess_medium_3", expectedTestValues.mortonUnsignedLess_medium_3, testValues.mortonUnsignedLess_medium_3, testType);
-        verifyTestValue("mortonUnsignedLess_full_3", expectedTestValues.mortonUnsignedLess_full_3, testValues.mortonUnsignedLess_full_3, testType);
-        verifyTestValue("mortonUnsignedLess_emulated_3", expectedTestValues.mortonUnsignedLess_emulated_3, testValues.mortonUnsignedLess_emulated_3, testType);
-
-        verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType);
-        verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType);
-        verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType);
-
-        // Morton coordinate-wise signed inequality
-        verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
-        verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
-        verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
-
-        verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType);
-        verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType);
-        verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType);
-
-        verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType);
-        verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
-        verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
-
-        // Morton left-shift
-        verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
-        verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType);
-        verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType);
-        verifyTestValue("mortonLeftShift_emulated_2", expectedTestValues.mortonLeftShift_emulated_2, testValues.mortonLeftShift_emulated_2, testType);
-
-        verifyTestValue("mortonLeftShift_small_3", expectedTestValues.mortonLeftShift_small_3, testValues.mortonLeftShift_small_3, testType);
-        verifyTestValue("mortonLeftShift_medium_3", expectedTestValues.mortonLeftShift_medium_3, testValues.mortonLeftShift_medium_3, testType);
-        verifyTestValue("mortonLeftShift_full_3", expectedTestValues.mortonLeftShift_full_3, testValues.mortonLeftShift_full_3, testType);
-        verifyTestValue("mortonLeftShift_emulated_3", expectedTestValues.mortonLeftShift_emulated_3, testValues.mortonLeftShift_emulated_3, testType);
-
-        verifyTestValue("mortonLeftShift_small_4", expectedTestValues.mortonLeftShift_small_4, testValues.mortonLeftShift_small_4, testType);
-        verifyTestValue("mortonLeftShift_medium_4", expectedTestValues.mortonLeftShift_medium_4, testValues.mortonLeftShift_medium_4, testType);
-        verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType);
-        verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType);
-
-        // Morton unsigned right-shift
-        verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType);
-        verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType);
-        verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType);
-        verifyTestValue("mortonUnsignedRightShift_emulated_2", expectedTestValues.mortonUnsignedRightShift_emulated_2, testValues.mortonUnsignedRightShift_emulated_2, testType);
-
-        verifyTestValue("mortonUnsignedRightShift_small_3", expectedTestValues.mortonUnsignedRightShift_small_3, testValues.mortonUnsignedRightShift_small_3, testType);
-        verifyTestValue("mortonUnsignedRightShift_medium_3", expectedTestValues.mortonUnsignedRightShift_medium_3, testValues.mortonUnsignedRightShift_medium_3, testType);
-        verifyTestValue("mortonUnsignedRightShift_full_3", expectedTestValues.mortonUnsignedRightShift_full_3, testValues.mortonUnsignedRightShift_full_3, testType);
-        verifyTestValue("mortonUnsignedRightShift_emulated_3", expectedTestValues.mortonUnsignedRightShift_emulated_3, testValues.mortonUnsignedRightShift_emulated_3, testType);
-
-        verifyTestValue("mortonUnsignedRightShift_small_4", expectedTestValues.mortonUnsignedRightShift_small_4, testValues.mortonUnsignedRightShift_small_4, testType);
-        verifyTestValue("mortonUnsignedRightShift_medium_4", expectedTestValues.mortonUnsignedRightShift_medium_4, testValues.mortonUnsignedRightShift_medium_4, testType);
-        verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType);
-        verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType);
-
-        // Morton signed right-shift
-        verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType);
-        verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType);
-        verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType);
-
-        verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType);
-        verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType);
-        verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType);
-
-        verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType);
-        verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType);
-        verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType);
-    }
-};
-
-#endif
\ No newline at end of file
diff --git a/73_Mortons/app_resources/testCommon.hlsl b/73_Mortons/app_resources/testCommon.hlsl
deleted file mode 100644
index 93205db62..000000000
--- a/73_Mortons/app_resources/testCommon.hlsl
+++ /dev/null
@@ -1,258 +0,0 @@
-#include "common.hlsl"
-
-template <bool Signed, uint16_t Bits, uint16_t Dim, typename _uint64_t = uint64_t>
-morton::code<Signed, Bits, Dim, _uint64_t> createMortonFromAnyVec(vector<conditional_t<Signed, int64_t, uint64_t>, Dim> val)
-{
-	using morton_code_t = morton::code<Signed, Bits, Dim, _uint64_t>;
-	using decode_element_t = typename morton_code_t::decode_component_t ;
-	NBL_IF_CONSTEXPR(Signed)
-	{
-    return morton_code_t::create(_static_cast<vector<decode_element_t, Dim> >(val & ));
-	  
-	}
-}
-
-void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output)
-{
-	emulated_uint64_t emulatedA = _static_cast<emulated_uint64_t>(input.generatedA);
-	emulated_uint64_t emulatedB = _static_cast<emulated_uint64_t>(input.generatedB);
-	emulated_int64_t signedEmulatedA = _static_cast<emulated_int64_t>(input.generatedA);
-
-	// Emulated int tests
-	output.emulatedAnd = emulatedA & emulatedB;
-	output.emulatedOr = emulatedA | emulatedB;
-	output.emulatedXor = emulatedA ^ emulatedB;
-	output.emulatedNot = emulatedA.operator~();
-	output.emulatedPlus = emulatedA + emulatedB;
-	output.emulatedMinus = emulatedA - emulatedB;
-	output.emulatedLess = uint32_t(emulatedA < emulatedB);
-	output.emulatedLessEqual = uint32_t(emulatedA <= emulatedB);
-	output.emulatedGreater = uint32_t(emulatedA > emulatedB);
-	output.emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB);
-
-	left_shift_operator<emulated_uint64_t> leftShift;
-	output.emulatedLeftShifted = leftShift(emulatedA, input.shift);
-
-	arithmetic_right_shift_operator<emulated_uint64_t> unsignedRightShift;
-	output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift);
-
-	arithmetic_right_shift_operator<emulated_int64_t> signedRightShift;
-	output.emulatedSignedRightShifted = signedRightShift(signedEmulatedA, input.shift);
-
-	output.emulatedUnaryMinus = signedEmulatedA.operator-();
-
-	// Morton tests
-	uint64_t2 Vec2A = { input.coordX, input.coordY };
-	uint64_t2 Vec2B = { input.coordZ, input.coordW };
-
-	uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
-	uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
-
-	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
-	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
-
-	int64_t2 Vec2ASigned = int64_t2(Vec2A);
-	int64_t2 Vec2BSigned = int64_t2(Vec2B);
-
-	int64_t3 Vec3ASigned = int64_t3(Vec3A);
-	int64_t3 Vec3BSigned = int64_t3(Vec3B);
-
-	int64_t4 Vec4ASigned = int64_t4(Vec4A);
-	int64_t4 Vec4BSigned = int64_t4(Vec4B);
-
-	morton::code<false, smallBits_2, 2> morton_small_2A = createMortonFromAnyVec<false, smallBits_2, 2>(Vec2A);
-	morton::code<false, mediumBits_2, 2> morton_medium_2A = createMortonFromAnyVec<false, mediumBits_2, 2>(Vec2A);
-	morton::code<false, fullBits_2, 2> morton_full_2A = createMortonFromAnyVec<false, fullBits_2, 2>(Vec2A);
-	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2A = createMortonFromAnyVec<false, fullBits_2, 2, emulated_uint64_t>(Vec2A);
-	morton::code<false, smallBits_2, 2> morton_small_2B = createMortonFromAnyVec<false, smallBits_2, 2>(Vec2B);
-	morton::code<false, mediumBits_2, 2> morton_medium_2B = createMortonFromAnyVec<false, mediumBits_2, 2>(Vec2B);
-	morton::code<false, fullBits_2, 2> morton_full_2B = createMortonFromAnyVec<false, fullBits_2, 2>(Vec2B);
-	morton::code<false, fullBits_2, 2, emulated_uint64_t> morton_emulated_2B = createMortonFromAnyVec<false, fullBits_2, 2, emulated_uint64_t>(Vec2B);
-
-	morton::code<false, smallBits_3, 3> morton_small_3A = createMortonFromAnyVec<false, smallBits_3, 3>(Vec3A);
-	morton::code<false, mediumBits_3, 3> morton_medium_3A = createMortonFromAnyVec<false, mediumBits_3, 3>(Vec3A);
-	morton::code<false, fullBits_3, 3> morton_full_3A = createMortonFromAnyVec<false, fullBits_3, 3>(Vec3A);
-	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3A = createMortonFromAnyVec<false, fullBits_3, 3, emulated_uint64_t>(Vec3A);
-	morton::code<false, smallBits_3, 3> morton_small_3B = createMortonFromAnyVec<false, smallBits_3, 3>(Vec3B);
-	morton::code<false, mediumBits_3, 3> morton_medium_3B = createMortonFromAnyVec<false, mediumBits_3, 3>(Vec3B);
-	morton::code<false, fullBits_3, 3> morton_full_3B = createMortonFromAnyVec<false, fullBits_3, 3>(Vec3B);
-	morton::code<false, fullBits_3, 3, emulated_uint64_t> morton_emulated_3B = createMortonFromAnyVec<false, fullBits_3, 3, emulated_uint64_t>(Vec3B);
-
-	morton::code<false, smallBits_4, 4> morton_small_4A = createMortonFromAnyVec<false, smallBits_4, 4>(Vec4A);
-	morton::code<false, mediumBits_4, 4> morton_medium_4A = createMortonFromAnyVec<false, mediumBits_4, 4>(Vec4A);
-	morton::code<false, fullBits_4, 4> morton_full_4A = createMortonFromAnyVec<false, fullBits_4, 4>(Vec4A);
-	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = createMortonFromAnyVec<false, fullBits_4, 4, emulated_uint64_t>(Vec4A);
-	morton::code<false, smallBits_4, 4> morton_small_4B = createMortonFromAnyVec<false, smallBits_4, 4>(Vec4B);
-	morton::code<false, mediumBits_4, 4> morton_medium_4B = createMortonFromAnyVec<false, mediumBits_4, 4>(Vec4B);
-	morton::code<false, fullBits_4, 4> morton_full_4B = createMortonFromAnyVec<false, fullBits_4, 4>(Vec4B);
-	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4B = createMortonFromAnyVec<false, fullBits_4, 4, emulated_uint64_t>(Vec4B);
-
-	morton::code<true, smallBits_2, 2> morton_small_2_signed = createMortonFromAnyVec<true, smallBits_2, 2>(Vec2ASigned);
-	morton::code<true, mediumBits_2, 2> morton_medium_2_signed = createMortonFromAnyVec<true, mediumBits_2, 2>(Vec2ASigned);
-	morton::code<true, fullBits_2, 2> morton_full_2_signed = createMortonFromAnyVec<true, fullBits_2, 2>(Vec2ASigned);
-
-	morton::code<true, smallBits_3, 3> morton_small_3_signed = createMortonFromAnyVec<true, smallBits_3, 3>(Vec3ASigned);
-	morton::code<true, mediumBits_3, 3> morton_medium_3_signed = createMortonFromAnyVec<true, mediumBits_3, 3>(Vec3ASigned);
-	morton::code<true, fullBits_3, 3> morton_full_3_signed = createMortonFromAnyVec<true, fullBits_3, 3>(Vec3ASigned);
-
-	morton::code<true, smallBits_4, 4> morton_small_4_signed = createMortonFromAnyVec<true, smallBits_4, 4>(Vec4ASigned);
-	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = createMortonFromAnyVec<true, mediumBits_4, 4>(Vec4ASigned);
-	morton::code<true, fullBits_4, 4> morton_full_4_signed = createMortonFromAnyVec<true, fullBits_4, 4>(Vec4ASigned);
-
-	// Plus
-	output.mortonPlus_small_2 = morton_small_2A + morton_small_2B;
-	output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
-	output.mortonPlus_full_2 = morton_full_2A + morton_full_2B;
-	output.mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B;
-
-	output.mortonPlus_small_3 = morton_small_3A + morton_small_3B;
-	output.mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B;
-	output.mortonPlus_full_3 = morton_full_3A + morton_full_3B;
-	output.mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B;
-
-	output.mortonPlus_small_4 = morton_small_4A + morton_small_4B;
-	output.mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B;
-	output.mortonPlus_full_4 = morton_full_4A + morton_full_4B;
-	output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
-	
-	// Minus
-	output.mortonMinus_small_2 = morton_small_2A - morton_small_2B;
-	output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
-	output.mortonMinus_full_2 = morton_full_2A - morton_full_2B;
-	output.mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B;
-
-	output.mortonMinus_small_3 = morton_small_3A - morton_small_3B;
-	output.mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B;
-	output.mortonMinus_full_3 = morton_full_3A - morton_full_3B;
-	output.mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B;
-
-	output.mortonMinus_small_4 = morton_small_4A - morton_small_4B;
-	output.mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B;
-	output.mortonMinus_full_4 = morton_full_4A - morton_full_4B;
-	output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
-	
-	// Coordinate-wise equality
-	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
-	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
-	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
-	output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
-
-	output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
-	output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
-	output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
-	output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
-
-	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
-	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
-	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
-	output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
-	
-	// Coordinate-wise unsigned inequality (just testing with less)
-	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
-	output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
-	output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
-	output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
-
-	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
-	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
-	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
-	output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
-
-	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
-	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
-	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
-	
-	// Coordinate-wise signed inequality
-	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(int16_t2(Vec2BSigned)));
-	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(int16_t2(Vec2BSigned)));
-	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(int32_t2(Vec2BSigned)));
-
-	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(int16_t3(Vec3BSigned)));
-	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(int16_t3(Vec3BSigned)));
-	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(int32_t3(Vec3BSigned)));
-
-	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
-	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
-	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(int16_t4(Vec4BSigned)));
-	
-	// Cast to uint16_t which is what left shift for Mortons expect
-	uint16_t castedShift = uint16_t(input.shift);
-	// Each left shift clamps to correct bits so the result kinda makes sense
-	// Left-shift
-	left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
-	output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2);
-	left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
-	output.mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
-	left_shift_operator<morton::code<false, fullBits_2, 2> > leftShiftFull2;
-	output.mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift % fullBits_2);
-	left_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > leftShiftEmulated2;
-	output.mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
-
-	left_shift_operator<morton::code<false, smallBits_3, 3> > leftShiftSmall3;
-	output.mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift % smallBits_3);
-	left_shift_operator<morton::code<false, mediumBits_3, 3> > leftShiftMedium3;
-	output.mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
-	left_shift_operator<morton::code<false, fullBits_3, 3> > leftShiftFull3;
-	output.mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift % fullBits_3);
-	left_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > leftShiftEmulated3;
-	output.mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
-
-	left_shift_operator<morton::code<false, smallBits_4, 4> > leftShiftSmall4;
-	output.mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift % smallBits_4);
-	left_shift_operator<morton::code<false, mediumBits_4, 4> > leftShiftMedium4;
-	output.mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
-	left_shift_operator<morton::code<false, fullBits_4, 4> > leftShiftFull4;
-	output.mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift % fullBits_4);
-	left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
-	output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
-	
-	// Unsigned right-shift
-	arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
-	output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2);
-	arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
-	output.mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift % mediumBits_2);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2> > rightShiftFull2;
-	output.mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift % fullBits_2);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_2, 2, emulated_uint64_t> > rightShiftEmulated2;
-	output.mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2);
-
-	arithmetic_right_shift_operator<morton::code<false, smallBits_3, 3> > rightShiftSmall3;
-	output.mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift % smallBits_3);
-	arithmetic_right_shift_operator<morton::code<false, mediumBits_3, 3> > rightShiftMedium3;
-	output.mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift % mediumBits_3);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3> > rightShiftFull3;
-	output.mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift % fullBits_3);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_3, 3, emulated_uint64_t> > rightShiftEmulated3;
-	output.mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3);
-
-	arithmetic_right_shift_operator<morton::code<false, smallBits_4, 4> > rightShiftSmall4;
-	output.mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift % smallBits_4);
-	arithmetic_right_shift_operator<morton::code<false, mediumBits_4, 4> > rightShiftMedium4;
-	output.mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift % mediumBits_4);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4> > rightShiftFull4;
-	output.mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift % fullBits_4);
-	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
-	output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
-	
-	// Signed right-shift
-	arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
-	output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2);
-	arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
-	output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2);
-	arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
-	output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2);
-
-	arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
-	output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3);
-	arithmetic_right_shift_operator<morton::code<true, mediumBits_3, 3> > rightShiftSignedMedium3;
-	output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3);
-	arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
-	output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3);
-
-	arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
-	output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4);
-	arithmetic_right_shift_operator<morton::code<true, mediumBits_4, 4> > rightShiftSignedMedium4;
-	output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4);
-	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
-	output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4);
-}
\ No newline at end of file

From 91ae8657dee9b4de82c81b97b23b83d3824a6011 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:20:01 +0300
Subject: [PATCH 36/57] Fixed main camera aspect ratio, added 27 configurations
 for cube silhouette

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 248 ++++++++++++------
 72_SolidAngleVisualizer/include/transform.hpp |   2 +-
 72_SolidAngleVisualizer/main.cpp              |   9 +-
 3 files changed, 167 insertions(+), 92 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index 7c96a8316..fa0805356 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -53,29 +53,84 @@ static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0),
                             float3(0,0,0), float3(0,0,0), float3(0,0,0) };
 
 
-static const float3 colorLUT[8] = {
+static const float3 colorLUT[27] = {
+    // Row 1: Pure and bright colors
     float3(0, 0, 0),        // 0: Black
-    float3(1, 0, 0),       // 1: Red
-    float3(0, 1, 0),       // 2: Green
-    float3(1, 1, 0),       // 3: Yellow
-    float3(0, 0, 1),       // 4: Blue
-    float3(1, 0, 1),       // 5: Magenta
-    float3(0, 1, 1),       // 6: Cyan
-    float3(1, 1, 1)        // 7: White
+    float3(1, 1, 1),        // 1: White
+    float3(0.5, 0.5, 0.5),  // 2: Gray
+    
+    // Row 2: Primary colors
+    float3(1, 0, 0),        // 3: Red
+    float3(0, 1, 0),        // 4: Green
+    float3(0, 0, 1),        // 5: Blue
+    
+    // Row 3: Secondary colors
+    float3(1, 1, 0),        // 6: Yellow
+    float3(1, 0, 1),        // 7: Magenta
+    float3(0, 1, 1),        // 8: Cyan
+    
+    // Row 4: Orange family
+    float3(1, 0.5, 0),      // 9: Orange
+    float3(1, 0.65, 0),     // 10: Light Orange
+    float3(0.8, 0.4, 0),    // 11: Dark Orange
+    
+    // Row 5: Pink/Rose family
+    float3(1, 0.4, 0.7),    // 12: Pink
+    float3(1, 0.75, 0.8),   // 13: Light Pink
+    float3(0.7, 0.1, 0.3),  // 14: Deep Rose
+    
+    // Row 6: Purple/Violet family
+    float3(0.5, 0, 0.5),    // 15: Purple
+    float3(0.6, 0.4, 0.8),  // 16: Light Purple
+    float3(0.3, 0, 0.5),    // 17: Indigo
+    
+    // Row 7: Green variations
+    float3(0, 0.5, 0),      // 18: Dark Green
+    float3(0.5, 1, 0),      // 19: Lime
+    float3(0, 0.5, 0.25),   // 20: Forest Green
+    
+    // Row 8: Blue variations
+    float3(0, 0, 0.5),      // 21: Navy
+    float3(0.3, 0.7, 1),    // 22: Sky Blue
+    float3(0, 0.4, 0.6),    // 23: Teal
+    
+    // Row 9: Earth tones
+    float3(0.6, 0.4, 0.2),  // 24: Brown
+    float3(0.8, 0.7, 0.3),  // 25: Tan/Beige
+    float3(0.4, 0.3, 0.1)   // 26: Dark Brown
 };
 
 
     
 // Vertices are ordered CCW relative to the camera view.
-static const int silhouettes[8][6] = {
-    {2, 3, 1, 5, 4, 6}, // 0: Black
-    {6, 7, 5, 1, 0, 2}, // 1: Red
-    {7, 6, 4, 0, 1, 3}, // 2: Green
-    {3, 7, 5, 4, 0, 2}, // 3: Yellow
-    {3, 2, 0, 4, 5, 7}, // 4: Cyan
-    {1, 3, 7, 6, 4, 0}, // 5: Magenta
-    {0, 1, 5, 7, 6, 2}, // 6: White
-    {4, 6, 2, 3, 1, 5}  // 7: Gray
+static const int silhouettes[27][7] = {
+    {6, 1, 3, 2, 6, 4, 5}, // 0: Black
+    {6, 2, 6, 4, 5, 7, 3}, // 1: White 
+    {6, 0, 4, 5, 7, 3, 2}, // 2: Gray 
+    {6, 1, 3, 7, 6, 4, 5,}, // 3: Red 
+    {4, 4, 5, 7, 6, -1, -1}, // 4: Green 
+    {6, 0, 4, 5, 7, 6, 2}, // 5: Blue 
+    {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow 
+    {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta              
+    {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan 
+    {6, 1, 3, 2, 6, 7, 5}, // 9: Orange
+    {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange
+    {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange
+    {4, 1, 3, 7, 5, -1, -1}, // 12: Pink
+    {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink
+    {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose
+    {6, 0, 1, 3, 7, 5, 4}, // 15: Purple
+    {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple
+    {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo
+    {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green
+    {6, 0, 2, 6, 7, 3, 1}, // 19: Lime
+    {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green
+    {6, 0, 2, 3, 7, 5, 1}, // 21: Navy
+    {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue
+    {6, 0, 4, 6, 2, 3, 1}, // 23: Teal
+    {6, 0, 2, 3, 7, 5, 4},  // 24: Brown
+    {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige
+    {6, 1, 5, 4, 6, 2, 3}  // 26: Dark Brown
 };
 
 // Converts UV into centered, aspect-corrected NDC circle space
@@ -106,6 +161,33 @@ void computeCubeGeo()
     }
 }
 
+float4 drawCorners(float3 spherePos, float aaWidth)
+{
+    float4 color = float4(0,0,0,0);
+    // Draw corner labels for debugging
+    for (int i = 0; i < 8; i++)
+    {
+        float3 corner = normalize(corners[i]);
+        float2 cornerPos = corner.xy;
+        // Project corner onto 2D circle space
+        
+        // Distance from current fragment to corner
+        float dist = length(spherePos.xy - cornerPos);
+        
+        // Draw a small colored dot at the corner
+        float dotSize = 0.03f;
+        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
+        
+        if (dotAlpha > 0.0f)
+        {
+            float brightness = float(i) / 7.0f;
+            float3 dotColor = colorLUT[i];
+            color += float4(dotColor * dotAlpha, dotAlpha);
+        }
+    }
+    return color;
+}
+
 float4 drawRing(float2 p, float aaWidth)
 {
     float positionLength = length(p);
@@ -194,54 +276,11 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
     return edgeColor * alpha * intensity;
 }
 
-[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
+float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
 {
-    float4 color = float4(0, 0, 0, 0);
-    float2 p = toCircleSpace(vx.uv);
-    
-    // Convert 2D disk position to 3D hemisphere position
-    float2 normalized = p / CIRCLE_RADIUS;
-    float r2 = dot(normalized, normalized);
-    
-    // Convert UV to 3D position on hemisphere
-    float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
-    
-    computeCubeGeo();
-    
-    float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
-    
-    float3 viewDir = obbCenter; 
-    
-    // Is this correct?
-    float dotX = dot(viewDir, float3(pc.modelMatrix[0][0], pc.modelMatrix[1][0], pc.modelMatrix[2][0]));
-    float dotY = dot(viewDir, float3(pc.modelMatrix[0][1], pc.modelMatrix[1][1], pc.modelMatrix[2][1]));
-    float dotZ = dot(viewDir, float3(pc.modelMatrix[0][2], pc.modelMatrix[1][2], pc.modelMatrix[2][2]));
-
-    // Determine octant from ray direction signs
-    int octant = (dotX >= 0 ? 4 : 0) + 
-                 (dotY >= 0 ? 2 : 0) + 
-                 (dotZ >= 0 ? 1 : 0);
-
-    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
-    {
-        return float4(colorLUT[octant], 1.0f);
-    }
-    
-    float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); 
-    
-
-    // Draw the 6 silhouette edges
-    for (int i = 0; i < 6; i++) 
-    {
-        int v0Idx = silhouettes[octant][i];
-        int v1Idx = silhouettes[octant][(i + 1) % 6];
-        
-        float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth);
-        color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
-    }
-    
+    float4 color = float4(0,0,0,0);
     // Draw the remaining edges (non-silhouette) in a different color
-    float3 hiddenEdgeColor = float3(0.3, 0.3, 0.3); // Gray color for hidden edges
+    float3 hiddenEdgeColor = float3(0.3, 0.3, 0); // dark yellow color for hidden edges
     
     for (int i = 0; i < 12; i++)
     {
@@ -249,12 +288,14 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
         
         // Check if this edge is already drawn as a silhouette edge
         bool isSilhouette = false;
-        for (int j = 0; j < 6; j++)
+        int vertexCount = silhouettes[configIndex][0];
+        // Draw the 6 silhouette edges
+        for (int i = 0; i < vertexCount; i++) 
         {
-            int v0 = silhouettes[octant][j];
-            int v1 = silhouettes[octant][(j + 1) % 6];
+            int v0Idx = silhouettes[configIndex][i + 1];
+            int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1];
             
-            if ((edge.x == v0 && edge.y == v1) || (edge.x == v1 && edge.y == v0))
+            if ((edge.x == v0Idx && edge.y == v1Idx) || (edge.x == v1Idx && edge.y == v0Idx))
             {
                 isSilhouette = true;
                 break;
@@ -268,33 +309,66 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float
             color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a);
         }
     }
+    return color;
+}
 
-    // Draw corner labels for debugging
-    for (int i = 0; i < 8; i++)
+[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
+{
+    float4 color = float4(0, 0, 0, 0);
+    float2 p = toCircleSpace(vx.uv);
+    
+    // Convert 2D disk position to 3D hemisphere position
+    float2 normalized = p / CIRCLE_RADIUS;
+    float r2 = dot(normalized, normalized);
+    float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); 
+
+    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
     {
-        float3 corner = normalize(corners[i]);
-        float2 cornerPos = corner.xy;
-        // Project corner onto 2D circle space
-        
-        // Distance from current fragment to corner
-        float dist = length(spherePos.xy - cornerPos);
-        
-        // Draw a small colored dot at the corner
-        float dotSize = 0.03f;
-        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
+        return float4(colorLUT[configIndex], 1.0f);
+    }
+    
+    // Convert UV to 3D position on hemisphere
+    float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
+    
+    computeCubeGeo();
+    
+    // Get OBB center in world space
+    float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
+
+    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
+    float3 proj = mul(obbCenter, rotMatrix); // Get all 3 projections at once
+
+    // Get squared column lengths
+    float lenSqX = dot(rotMatrix[0], rotMatrix[0]);
+    float lenSqY = dot(rotMatrix[1], rotMatrix[1]);
+    float lenSqZ = dot(rotMatrix[2], rotMatrix[2]);
+
+    int3 region = int3(
+        proj.x < -lenSqX ? 0 : (proj.x > lenSqX ? 2 : 1),
+        proj.y < -lenSqY ? 0 : (proj.y > lenSqY ? 2 : 1),
+        proj.z < -lenSqZ ? 0 : (proj.z > lenSqZ ? 2 : 1)
+    );
+
+    int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26
+    
+    int vertexCount = silhouettes[configIndex][0];
+    for (int i = 0; i < vertexCount; i++) 
+    {
+        int v0Idx = silhouettes[configIndex][i + 1];
+        int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1];
         
-        if (dotAlpha > 0.0f)
-        {
-            float brightness = float(i) / 7.0f;
-            float3 dotColor = colorLUT[i];
-            color += float4(dotColor * dotAlpha, dotAlpha);
-        }
+        float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth);
+        color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
     }
     
+    color += drawHiddenEdges(spherePos, configIndex, aaWidth);
+
+    color += drawCorners(spherePos, aaWidth);
+    
     color += drawRing(p, aaWidth);
 
-    // if (r2 > 1.1f)
-    //     color.a = 0.0f; // Outside circle, make transparent
+    if (r2 > 1.1f)
+        color.a = 0.0f; // Outside circle, make transparent
     
     return color;
 }
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
index 639c0fa3a..105b2f757 100644
--- a/72_SolidAngleVisualizer/include/transform.hpp
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -19,7 +19,7 @@ struct TransformRequestParams
 
 struct TransformReturnInfo
 {
-	nbl::hlsl::uint16_t2 sceneResolution = { 0, 0 };
+	nbl::hlsl::uint16_t2 sceneResolution = { 1, 1 };
 	bool isGizmoWindowHovered;
 	bool isGizmoBeingUsed;
 };
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index 8fb8bf144..5f73797a6 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -753,16 +753,17 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			// TODO: why is this a lambda and not just an assignment in a scope ?
 			camera.setProjectionMatrix([&]()
 				{
-					matrix4SIMD projection;
+					const auto& sceneRes = mainViewTransformReturnInfo.sceneResolution;
 
+					matrix4SIMD projection;
 					if (isPerspective)
 						if (isLH)
-							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar);
 						else
-							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar);
+							projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar);
 					else
 					{
-						float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x;
+						float viewHeight = viewWidth * sceneRes.y / sceneRes.x;
 
 						if (isLH)
 							projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar);

From 0124cc9c0ad83d4a38f1e8ac3ddcdf56125740ac Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:30:34 +0300
Subject: [PATCH 37/57] Shader fixes, bast uint16 resolutionf to float

---
 .../app_resources/hlsl/SolidAngleVis.frag.hlsl   | 16 +++++++++-------
 72_SolidAngleVisualizer/main.cpp                 |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index fa0805356..ec30c2b64 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -322,10 +322,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
     float r2 = dot(normalized, normalized);
     float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); 
 
-    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
-    {
-        return float4(colorLUT[configIndex], 1.0f);
-    }
+
     
     // Convert UV to 3D position on hemisphere
     float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
@@ -350,7 +347,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
     );
 
     int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26
-    
+
     int vertexCount = silhouettes[configIndex][0];
     for (int i = 0; i < vertexCount; i++) 
     {
@@ -367,8 +364,13 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
     
     color += drawRing(p, aaWidth);
 
-    if (r2 > 1.1f)
-        color.a = 0.0f; // Outside circle, make transparent
+    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
+    {
+        return float4(colorLUT[configIndex], 1.0f);
+    }
+
+    // if (r2 > 1.1f)
+    //     color.a = 0.0f; // Outside circle, make transparent
     
     return color;
 }
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index 5f73797a6..85685e705 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -753,7 +753,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			// TODO: why is this a lambda and not just an assignment in a scope ?
 			camera.setProjectionMatrix([&]()
 				{
-					const auto& sceneRes = mainViewTransformReturnInfo.sceneResolution;
+					const auto& sceneRes = float16_t2(mainViewTransformReturnInfo.sceneResolution);
 
 					matrix4SIMD projection;
 					if (isPerspective)

From a35eddd1bd83fbf636e820b59c6eef939ed09668 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Tue, 9 Dec 2025 00:44:42 +0300
Subject: [PATCH 38/57] Better color for non-silhouette edges

---
 .../app_resources/hlsl/SolidAngleVis.frag.hlsl                 | 2 +-
 72_SolidAngleVisualizer/main.cpp                               | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index ec30c2b64..51cb1946d 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -280,7 +280,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
 {
     float4 color = float4(0,0,0,0);
     // Draw the remaining edges (non-silhouette) in a different color
-    float3 hiddenEdgeColor = float3(0.3, 0.3, 0); // dark yellow color for hidden edges
+    float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); // dark yellow color for hidden edges
     
     for (int i = 0; i < 12; i++)
     {
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index 85685e705..e9266520d 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -933,9 +933,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 				transformParams.editTransformDecomposition = true;
 				mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams);
-				// MODEL: Zup -> Yup
-
-				m_OBBModelMatrix = imguizmoM16InOut.model;
 
 				// TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used
 				move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed);

From 197b46afe5df4239958cd57fbe4aae8921dd9eb4 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 9 Dec 2025 23:39:30 +0700
Subject: [PATCH 39/57] Enable second test set

---
 14_Mortons/CTester.h                      | 106 +++++++++++++++++++++-
 14_Mortons/ITester.h                      |   1 -
 14_Mortons/app_resources/common.hlsl      |   6 +-
 14_Mortons/app_resources/test2.comp.hlsl  |  17 ++++
 14_Mortons/app_resources/testCommon.hlsl  |  25 ++---
 14_Mortons/app_resources/testCommon2.hlsl |  40 ++++++++
 14_Mortons/main.cpp                       |  15 +--
 7 files changed, 179 insertions(+), 31 deletions(-)
 create mode 100644 14_Mortons/app_resources/test2.comp.hlsl
 create mode 100644 14_Mortons/app_resources/testCommon2.hlsl

diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
index 4c8b4276e..342cbcc00 100644
--- a/14_Mortons/CTester.h
+++ b/14_Mortons/CTester.h
@@ -3,6 +3,7 @@
 
 #include <nabla.h>
 #include "app_resources/testCommon.hlsl"
+#include "app_resources/testCommon2.hlsl"
 #include "ITester.h"
 
 using namespace nbl;
@@ -158,6 +159,7 @@ class CTester final : public ITester
                 expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall));
                 expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium));
                 expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull));
+                expected.mortonEqual_emulated_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull));
 
                 // Coordinate-wise unsigned inequality (just testing with less)
                 expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall));
@@ -343,17 +345,14 @@ class CTester final : public ITester
         verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
         verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
         verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
-        verifyTestValue("mortonSignedLess_emulated_2", expectedTestValues.mortonSignedLess_emulated_2, testValues.mortonSignedLess_emulated_2, testType);
         
         verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType);
         verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType);
         verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType);
-        verifyTestValue("mortonSignedLess_emulated_3", expectedTestValues.mortonSignedLess_emulated_3, testValues.mortonSignedLess_emulated_3, testType);
         
         verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType);
         verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
         verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
-        verifyTestValue("mortonSignedLess_emulated_4", expectedTestValues.mortonSignedLess_emulated_4, testValues.mortonSignedLess_emulated_4, testType);
         
         // // Morton left-shift
         verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
@@ -402,4 +401,105 @@ class CTester final : public ITester
     }
 };
 
+class CTester2 final : public ITester
+{
+public:
+    void performTests()
+    {
+        std::random_device rd;
+        std::mt19937 mt(rd());
+
+        std::uniform_int_distribution<uint32_t> intDistribution(uint32_t(0), std::numeric_limits<uint32_t>::max());
+        std::uniform_int_distribution<uint64_t> longDistribution(uint64_t(0), std::numeric_limits<uint64_t>::max());
+
+        m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE);
+        for (int i = 0; i < Iterations; ++i)
+        {
+            // Set input thest values that will be used in both CPU and GPU tests
+            InputTestValues testInput;
+            // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values
+            TestValues expected;
+
+            uint32_t generatedShift = intDistribution(mt) & uint32_t(63);
+            testInput.shift = generatedShift;
+            {
+                testInput.coordX = longDistribution(mt);
+                testInput.coordY = longDistribution(mt);
+                testInput.coordZ = longDistribution(mt);
+                testInput.coordW = longDistribution(mt);
+
+                uint64_t2 Vec2A = { testInput.coordX, testInput.coordY };
+                uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW };
+
+                uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ };
+                uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW };
+
+                uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW };
+                uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX };
+
+                uint16_t4 Vec4AFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4>(Vec4A);
+                uint16_t4 Vec4BFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4>(Vec4B);
+
+                int32_t2 Vec2ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2>(Vec2A);
+                int32_t2 Vec2BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2>(Vec2B);
+
+                int32_t3 Vec3ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3>(Vec3A);
+                int32_t3 Vec3BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3>(Vec3B);
+
+                int16_t4 Vec4ASignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4A);
+                int16_t4 Vec4BSignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4B);
+
+                expected.mortonUnsignedLess_emulated_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
+                
+                expected.mortonSignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
+                expected.mortonSignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
+                expected.mortonSignedLess_emulated_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
+
+                uint16_t castedShift = uint16_t(generatedShift);
+                expected.mortonSignedRightShift_emulated_2 = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2A << uint64_t(castedShift % fullBits_2));
+                expected.mortonSignedRightShift_emulated_3 = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3A << uint64_t(castedShift % fullBits_3));
+                expected.mortonSignedRightShift_emulated_4 = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A << uint64_t(castedShift % fullBits_4));
+
+            }
+
+            performCpuTests(testInput, expected);
+            // performGpuTests(testInput, expected);
+        }
+        m_logger->log("SECOND TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
+    }
+
+private:
+    inline static constexpr int Iterations = 100u;
+
+    void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues cpuTestValues;
+
+        fillTestValues2(commonTestInputValues, cpuTestValues);
+        verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU);
+
+    }
+
+    void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues)
+    {
+        TestValues gpuTestValues;
+        gpuTestValues = dispatch<InputTestValues, TestValues>(commonTestInputValues);
+        verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU);
+    }
+
+    void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
+    {
+        
+        verifyTestValue("mortonUnsignedLess_emulated_4", expectedTestValues.mortonUnsignedLess_emulated_4, testValues.mortonUnsignedLess_emulated_4, testType);
+        
+        verifyTestValue("mortonSignedLess_emulated_2", expectedTestValues.mortonSignedLess_emulated_2, testValues.mortonSignedLess_emulated_2, testType);
+        verifyTestValue("mortonSignedLess_emulated_3", expectedTestValues.mortonSignedLess_emulated_3, testValues.mortonSignedLess_emulated_3, testType);
+        verifyTestValue("mortonSignedLess_emulated_4", expectedTestValues.mortonSignedLess_emulated_4, testValues.mortonSignedLess_emulated_4, testType);
+        //
+        // verifyTestValue("mortonSignedRightShift_emulated_2", expectedTestValues.mortonSignedRightShift_emulated_2, testValues.mortonSignedRightShift_emulated_2, testType);
+        // verifyTestValue("mortonSignedRightShift_emulated_3", expectedTestValues.mortonSignedRightShift_emulated_3, testValues.mortonSignedRightShift_emulated_3, testType);
+        // verifyTestValue("mortonSignedRightShift_emulated_4", expectedTestValues.mortonSignedRightShift_emulated_4, testValues.mortonSignedRightShift_emulated_4, testType);
+        
+    }
+};
 #endif
\ No newline at end of file
diff --git a/14_Mortons/ITester.h b/14_Mortons/ITester.h
index a0c76ac75..3be6d1d6b 100644
--- a/14_Mortons/ITester.h
+++ b/14_Mortons/ITester.h
@@ -18,7 +18,6 @@ class ITester
     struct PipelineSetupData
     {
         std::string testShaderPath;
-
         core::smart_refctd_ptr<video::ILogicalDevice> device;
         core::smart_refctd_ptr<video::CVulkanConnection> api;
         core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
diff --git a/14_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
index 237e3260e..d209c737f 100644
--- a/14_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -57,10 +57,10 @@ T createAnyBitIntegerFromU64(uint64_t val)
 template <typename T, bool Signed, uint16_t Bits, uint16_t D>
 vector<T, D> createAnyBitIntegerVecFromU64Vec(vector<uint64_t, D> val)
 {
-  array_get<portable_vector_t<T, D>, T> getter;
-  array_set<portable_vector_t<T, D>, T> setter;
+    array_get<portable_vector_t<T, D>, T> getter;
+    array_set<portable_vector_t<T, D>, T> setter;
 	vector<T, D> output;
-  NBL_UNROLL
+    NBL_UNROLL
 	for (uint16_t i = 0; i < D; i++)
 	{
 		setter(output, i, createAnyBitIntegerFromU64<T, Signed, Bits>(getter(val, i)));
diff --git a/14_Mortons/app_resources/test2.comp.hlsl b/14_Mortons/app_resources/test2.comp.hlsl
new file mode 100644
index 000000000..30b998f49
--- /dev/null
+++ b/14_Mortons/app_resources/test2.comp.hlsl
@@ -0,0 +1,17 @@
+//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "testCommon2.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+
+[[vk::binding(0, 0)]] RWStructuredBuffer<InputTestValues> inputTestValues;
+[[vk::binding(1, 0)]] RWStructuredBuffer<TestValues> outputTestValues;
+
+[numthreads(1, 1, 1)]
+[shader("compute")]
+void main(uint3 invocationID : SV_DispatchThreadID)
+{
+    uint32_t testID = glsl::gl_GlobalInvocationID().x;
+    fillTestValues2(inputTestValues[testID], outputTestValues[testID]);
+}
diff --git a/14_Mortons/app_resources/testCommon.hlsl b/14_Mortons/app_resources/testCommon.hlsl
index dbe6ddbd2..6e9051c1b 100644
--- a/14_Mortons/app_resources/testCommon.hlsl
+++ b/14_Mortons/app_resources/testCommon.hlsl
@@ -98,7 +98,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonPlus_full_4 = morton_full_4A + morton_full_4B;
 	output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B;
 	
-	// // Minus
+	// Minus
 	output.mortonMinus_small_2 = morton_small_2A - morton_small_2B;
 	output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B;
 	output.mortonMinus_full_2 = morton_full_2A - morton_full_2B;
@@ -114,7 +114,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonMinus_full_4 = morton_full_4A - morton_full_4B;
 	output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
 	
-	// // Coordinate-wise equality
+	// Coordinate-wise equality
 	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
 	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
 	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
@@ -128,7 +128,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
 	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
 	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
-	// output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
+    output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
 	
 	// Coordinate-wise unsigned inequality (just testing with less)
 	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
@@ -139,34 +139,29 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
 	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
 	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
-	// output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
+	output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
 	
 	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
 	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
 	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
-	// output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
-	// less(Vec4A, Vec4B);
 	
 	// Coordinate-wise signed inequality
 	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(int16_t2(Vec2B)));
 	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(int16_t2(Vec2B)));
 	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(int32_t2(Vec2B)));
-	// output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(int32_t2(Vec2B))); 
 	
 	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(int16_t3(Vec3B)));
 	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(int16_t3(Vec3B)));
 	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(int32_t3(Vec3B)));
-	output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(int32_t3(Vec3B))); 
 	
 	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(int16_t4(Vec4B)));
 	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(int16_t4(Vec4B)));
 	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(int16_t4(Vec4B)));
-	// output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(int16_t4(Vec4B))); 
 	
-	// // Cast to uint16_t which is what left shift for Mortons expect
+	// Cast to uint16_t which is what left shift for Mortons expect
 	uint16_t castedShift = uint16_t(input.shift);
-	// // Each left shift clamps to correct bits so the result kinda makes sense
-	// // Left-shift
+	// Each left shift clamps to correct bits so the result kinda makes sense
+	// Left-shift
 	left_shift_operator<morton::code<false, smallBits_2, 2> > leftShiftSmall2;
 	output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2);
 	left_shift_operator<morton::code<false, mediumBits_2, 2> > leftShiftMedium2;
@@ -244,10 +239,4 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
 	output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4);
 
-	// arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
-	// output.mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); 
-	// arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
-	// output.mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); 
-	// arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
-	// output.mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); 
 }
\ No newline at end of file
diff --git a/14_Mortons/app_resources/testCommon2.hlsl b/14_Mortons/app_resources/testCommon2.hlsl
new file mode 100644
index 000000000..e7eced852
--- /dev/null
+++ b/14_Mortons/app_resources/testCommon2.hlsl
@@ -0,0 +1,40 @@
+#include "common.hlsl"
+
+void fillTestValues2(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output)
+{
+	uint64_t2 Vec2A = { input.coordX, input.coordY };
+	uint64_t2 Vec2B = { input.coordZ, input.coordW };
+
+	uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
+	uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
+
+	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
+	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
+
+	int32_t2 Vec2BSigned = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2, 2>(Vec2B);
+
+	int32_t3 Vec3BSigned = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3, 3>(Vec3B);
+
+	int16_t4 Vec4BSigned = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4, 4>(Vec4B);
+
+	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4A);
+	morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2_signed = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2A);
+	morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3_signed = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3A);
+	morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A);
+
+	
+	output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
+	
+	output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(Vec2BSigned)); 
+	output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(Vec3BSigned)); 
+	output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(Vec4BSigned)); 
+
+	uint16_t castedShift = uint16_t(input.shift);
+
+	arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
+	output.mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift % fullBits_2); 
+	arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
+	output.mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift % fullBits_3); 
+	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
+	output.mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift % fullBits_4); 
+}
diff --git a/14_Mortons/main.cpp b/14_Mortons/main.cpp
index 6034e3469..bd4653f7c 100644
--- a/14_Mortons/main.cpp
+++ b/14_Mortons/main.cpp
@@ -2,8 +2,6 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 #include <nabla.h>
-#include <iostream>
-#include <cstdio>
 #include <assert.h>
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
@@ -47,10 +45,15 @@ class MortonTest final : public MonoDeviceApplication, public BuiltinResourcesAp
         // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator
         // Blocked by https://github.com/KhronosGroup/SPIRV-Tools/issues/6104
         {
-            CTester mortonTester;
-            pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";
-            mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
-            mortonTester.performTests();
+            // CTester mortonTester;
+            // pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";
+            // mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
+            // mortonTester.performTests();
+
+            CTester2 mortonTester2;
+            pplnSetupData.testShaderPath = "app_resources/test2.comp.hlsl";
+            mortonTester2.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
+            mortonTester2.performTests();
         }
 
         return true;

From 6692311fddaf527dad42abe170394eb85ad4f5ae Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 9 Dec 2025 23:59:46 +0700
Subject: [PATCH 40/57] Delete fillSecondTestValues

---
 14_Mortons/app_resources/common.hlsl | 44 ----------------------------
 1 file changed, 44 deletions(-)

diff --git a/14_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
index d209c737f..895728f26 100644
--- a/14_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -244,50 +244,6 @@ struct TestValues
 	morton::code<true, fullBits_4, 4, emulated_uint64_t>  mortonSignedRightShift_emulated_4;
 
 	
-	/*
-	void fillSecondTestValues(NBL_CONST_REF_ARG(InputTestValues) input)
-	{
-		uint64_t2 Vec2A = { input.coordX, input.coordY };
-		uint64_t2 Vec2B = { input.coordZ, input.coordW };
-
-		uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ };
-		uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW };
-
-		uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
-		uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
-
-		int64_t2 Vec2ASigned = int64_t2(Vec2A);
-		int64_t2 Vec2BSigned = int64_t2(Vec2B);
-
-		int64_t3 Vec3ASigned = int64_t3(Vec3A);
-		int64_t3 Vec3BSigned = int64_t3(Vec3B);
-
-		int64_t4 Vec4ASigned = int64_t4(Vec4A);
-		int64_t4 Vec4BSigned = int64_t4(Vec4B);
-
-		morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4A);
-		morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2_signed = morton::code<true, fullBits_2, 2, emulated_uint64_t>::create(Vec2ASigned);
-		morton::code<true, fullBits_3, 3, emulated_uint64_t> morton_emulated_3_signed = morton::code<true, fullBits_3, 3, emulated_uint64_t>::create(Vec3ASigned);
-		morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = morton::code<true, fullBits_4, 4, emulated_uint64_t>::create(Vec4ASigned);
-
-		output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
-		
-		output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
-		
-		mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(int32_t2(Vec2BSigned))); 
-		mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(int32_t3(Vec3BSigned))); 
-		mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(int16_t4(Vec4BSigned))); 
-
-		uint16_t castedShift = uint16_t(input.shift);
-
-		arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
-		mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); 
-		arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
-		mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); 
-		arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
-		mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); 
-	}
-	*/
 };
 
 #endif

From 4287ed1522352bd831280900fba1e7eb239e36c8 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Wed, 10 Dec 2025 23:57:16 +0700
Subject: [PATCH 41/57] Fix morton test

---
 14_Mortons/CTester.h                      | 194 +++++++++++-----------
 14_Mortons/app_resources/common.hlsl      |  24 +--
 14_Mortons/app_resources/testCommon.hlsl  | 106 ++++++++----
 14_Mortons/app_resources/testCommon2.hlsl |  17 +-
 4 files changed, 194 insertions(+), 147 deletions(-)

diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
index 342cbcc00..ff83c02cc 100644
--- a/14_Mortons/CTester.h
+++ b/14_Mortons/CTester.h
@@ -62,57 +62,59 @@ class CTester final : public ITester
                 uint64_t2 Vec2A = { testInput.coordX, testInput.coordY };
                 uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW };
 
-                uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 );
-                uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 );
-                uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2);
-                uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2);
-                uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2);
-                uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2);
+                uint16_t2 Vec2ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_2>(Vec2A);
+                uint16_t2 Vec2BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_2>(Vec2B);
+                uint16_t2 Vec2AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_2>(Vec2A);
+                uint16_t2 Vec2BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_2>(Vec2B);
+                uint32_t2 Vec2AFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_2>(Vec2A);
+                uint32_t2 Vec2BFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_2>(Vec2B);
 
                 uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ };
                 uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW };
 
-                uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3);
-                uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3);
-                uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3);
-                uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3);
-                uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3);
-                uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3);
+                uint16_t3 Vec3ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_3>(Vec3A);
+                uint16_t3 Vec3BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_3>(Vec3B);
+                uint16_t3 Vec3AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_3>(Vec3A);
+                uint16_t3 Vec3BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_3>(Vec3B);
+                uint32_t3 Vec3AFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_3>(Vec3A);
+                uint32_t3 Vec3BFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_3>(Vec3B);
 
                 uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW };
                 uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX };
 
-                uint16_t4 Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4);
-                uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4);
-                uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4);
-                uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4);
-                uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4);
-                uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4);
+                uint16_t4 Vec4ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_4>(Vec4A);
+                uint16_t4 Vec4BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_4>(Vec4B);
+                uint16_t4 Vec4AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_4>(Vec4A);
+                uint16_t4 Vec4BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_4>(Vec4B);
+                uint16_t4 Vec4AFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4>(Vec4A);
+                uint16_t4 Vec4BFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4>(Vec4B);
 
                 // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them
                 // so their highest bits are all 0s or 1s depending on the sign of the number they encode
 
-                int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
-                int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2);
-                int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
-                int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2);
-                int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
-                int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2);
-
-                int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
-                int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3);
-                int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
-                int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3);
-                int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
-                int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3);
-
-                int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
-                int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4);
-                int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
-                int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4);
-                int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
-                int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4);
+                int16_t2 Vec2ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_2>(Vec2A);
+                int16_t2 Vec2BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_2>(Vec2B);
+                int16_t2 Vec2ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true,mediumBits_2 >(Vec2A);
+                int16_t2 Vec2BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_2>(Vec2B);
+                int32_t2 Vec2ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2>(Vec2A);
+                int32_t2 Vec2BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2>(Vec2B);
+
+                int16_t3 Vec3ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_3>(Vec3A);
+                int16_t3 Vec3BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_3>(Vec3B);
+                int16_t3 Vec3ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_3>(Vec3A);
+                int16_t3 Vec3BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_3>(Vec3B);
+                int32_t3 Vec3ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3>(Vec3A);
+                int32_t3 Vec3BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3>(Vec3B);
+
+                int16_t4 Vec4ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_4>(Vec4A);
+                int16_t4 Vec4BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_4>(Vec4B);
+                int16_t4 Vec4ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_4>(Vec4A);
+                int16_t4 Vec4BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_4>(Vec4B);
+                int16_t4 Vec4ASignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4A);
+                int16_t4 Vec4BSignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4B);
 
+                const auto dummy1 = morton::code<true, smallBits_2, 2>(Vec2ASignedSmall);
+                const auto dummy2 = createMortonFromU64Vec<true, smallBits_2, 2>(Vec2A);
                 // Plus
                 expected.mortonPlus_small_2 = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2ASmall + Vec2BSmall);
                 expected.mortonPlus_medium_2 = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2AMedium + Vec2BMedium);
@@ -191,49 +193,49 @@ class CTester final : public ITester
 
                 uint16_t castedShift = uint16_t(generatedShift);
                 // Left-shift
-                expected.mortonLeftShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
-                expected.mortonLeftShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
-                expected.mortonLeftShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-                expected.mortonLeftShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-
-                expected.mortonLeftShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
-                expected.mortonLeftShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
-                expected.mortonLeftShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-                expected.mortonLeftShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-
-                expected.mortonLeftShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
-                expected.mortonLeftShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
-                expected.mortonLeftShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-                expected.mortonLeftShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-
+                expected.mortonLeftShift_small_2 = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2ASmall << uint16_t(castedShift % smallBits_2));
+                expected.mortonLeftShift_medium_2 = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2AMedium << uint16_t(castedShift % mediumBits_2));
+                expected.mortonLeftShift_full_2 = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2AFull << uint32_t(castedShift % fullBits_2));
+                expected.mortonLeftShift_emulated_2 = createMortonFromU64Vec<false, fullBits_2, 2, emulated_uint64_t>(Vec2AFull << uint32_t(castedShift % fullBits_2));
+                
+                expected.mortonLeftShift_small_3 = createMortonFromU64Vec<false, smallBits_3, 3>(Vec3ASmall << uint16_t(castedShift % smallBits_3));
+                expected.mortonLeftShift_medium_3 = createMortonFromU64Vec<false, mediumBits_3, 3>(Vec3AMedium << uint16_t(castedShift % mediumBits_3));
+                expected.mortonLeftShift_full_3 = createMortonFromU64Vec<false, fullBits_3, 3>(Vec3AFull << uint32_t(castedShift % fullBits_3));
+                expected.mortonLeftShift_emulated_3 = createMortonFromU64Vec<false, fullBits_3, 3, emulated_uint64_t>(Vec3AFull << uint32_t(castedShift % fullBits_3));
+                
+                expected.mortonLeftShift_small_4 = createMortonFromU64Vec<false, smallBits_4, 4>(Vec4ASmall << uint16_t(castedShift % smallBits_4));
+                expected.mortonLeftShift_medium_4 = createMortonFromU64Vec<false, mediumBits_4, 4>(Vec4AMedium << uint16_t(castedShift % mediumBits_4));
+                expected.mortonLeftShift_full_4 = createMortonFromU64Vec<false, fullBits_4, 4>(Vec4AFull << uint16_t(castedShift % fullBits_4));
+                expected.mortonLeftShift_emulated_4 = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4AFull << uint16_t(castedShift % fullBits_4));
+                
                 // Unsigned right-shift
-                expected.mortonUnsignedRightShift_small_2 = morton::code<false, smallBits_2, 2>::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2));
-                expected.mortonUnsignedRightShift_medium_2 = morton::code<false, mediumBits_2, 2>::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2));
-                expected.mortonUnsignedRightShift_full_2 = morton::code<false, fullBits_2, 2>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2));
-                expected.mortonUnsignedRightShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2));
-
-                expected.mortonUnsignedRightShift_small_3 = morton::code<false, smallBits_3, 3>::create((Vec3ASmall >> uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3));
-                expected.mortonUnsignedRightShift_medium_3 = morton::code<false, mediumBits_3, 3>::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3));
-                expected.mortonUnsignedRightShift_full_3 = morton::code<false, fullBits_3, 3>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3));
-                expected.mortonUnsignedRightShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3));
-
-                expected.mortonUnsignedRightShift_small_4 = morton::code<false, smallBits_4, 4>::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4));
-                expected.mortonUnsignedRightShift_medium_4 = morton::code<false, mediumBits_4, 4>::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4));
-                expected.mortonUnsignedRightShift_full_4 = morton::code<false, fullBits_4, 4>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4));
-                expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4));
-            
+                expected.mortonUnsignedRightShift_small_2 = morton::code<false, smallBits_2, 2>::create(Vec2ASmall >> uint16_t(castedShift % smallBits_2));
+                expected.mortonUnsignedRightShift_medium_2 = morton::code<false, mediumBits_2, 2>::create(Vec2AMedium >> uint16_t(castedShift % mediumBits_2));
+                expected.mortonUnsignedRightShift_full_2 = morton::code<false, fullBits_2, 2>::create(Vec2AFull >> uint32_t(castedShift % fullBits_2));
+                expected.mortonUnsignedRightShift_emulated_2 = morton::code<false, fullBits_2, 2, emulated_uint64_t>::create(Vec2AFull >> uint32_t(castedShift % fullBits_2));
+                
+                expected.mortonUnsignedRightShift_small_3 = morton::code<false, smallBits_3, 3>::create(Vec3ASmall >> uint16_t(castedShift % smallBits_3));
+                expected.mortonUnsignedRightShift_medium_3 = morton::code<false, mediumBits_3, 3>::create(Vec3AMedium >> uint16_t(castedShift % mediumBits_3));
+                expected.mortonUnsignedRightShift_full_3 = morton::code<false, fullBits_3, 3>::create(Vec3AFull >> uint32_t(castedShift % fullBits_3));
+                expected.mortonUnsignedRightShift_emulated_3 = morton::code<false, fullBits_3, 3, emulated_uint64_t>::create(Vec3AFull >> uint32_t(castedShift % fullBits_3));
+                
+                expected.mortonUnsignedRightShift_small_4 = morton::code<false, smallBits_4, 4>::create(Vec4ASmall >> uint16_t(castedShift % smallBits_4));
+                expected.mortonUnsignedRightShift_medium_4 = morton::code<false, mediumBits_4, 4>::create(Vec4AMedium >> uint16_t(castedShift % mediumBits_4));
+                expected.mortonUnsignedRightShift_full_4 = morton::code<false, fullBits_4, 4>::create(Vec4AFull >> uint16_t(castedShift % fullBits_4));
+                expected.mortonUnsignedRightShift_emulated_4 = morton::code<false, fullBits_4, 4, emulated_uint64_t>::create(Vec4AFull >> uint16_t(castedShift % fullBits_4));
+                
                 // Signed right-shift
-                expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2));
-                expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2));
-                expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2));
-
-                expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3));
-                expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3));
-                expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & int32_t(fullBitsMask_3));
-
-                expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4));
-                expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4));
-                expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4));
+                expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create(Vec2ASignedSmall >> int16_t(castedShift % smallBits_2));
+                expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create(Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2));
+                expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create(Vec2ASignedFull >> int32_t(castedShift % fullBits_2));
+                
+                expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create(Vec3ASignedSmall >> int16_t(castedShift % smallBits_3));
+                expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create(Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3));
+                expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create(Vec3ASignedFull >> int32_t(castedShift % fullBits_3));
+                
+                expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create(Vec4ASignedSmall >> int16_t(castedShift % smallBits_4));
+                expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create(Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4));
+                expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create(Vec4ASignedFull >> int16_t(castedShift % fullBits_4));
             }
 
             performCpuTests(testInput, expected);
@@ -278,7 +280,7 @@ class CTester final : public ITester
         verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType);
         verifyTestValue("emulatedUnaryMinus", expectedTestValues.emulatedUnaryMinus, testValues.emulatedUnaryMinus, testType);
 
-        // // Morton Plus
+        // Morton Plus
         verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType);
         verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType);
         verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType);
@@ -293,8 +295,8 @@ class CTester final : public ITester
         verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType);
         verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType);
         verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType);
-
-        // // Morton Minus
+        
+        // Morton Minus
         verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType);
         verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType);
         verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType);
@@ -310,7 +312,7 @@ class CTester final : public ITester
         verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType);
         verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType);
         
-        // // Morton coordinate-wise equality
+        // Morton coordinate-wise equality
         verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType);
         verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType);
         verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType);
@@ -326,7 +328,7 @@ class CTester final : public ITester
         verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType);
         verifyTestValue("mortonEqual_emulated_4", expectedTestValues.mortonEqual_emulated_4, testValues.mortonEqual_emulated_4, testType);
         
-        // // Morton coordinate-wise unsigned inequality
+        // Morton coordinate-wise unsigned inequality
         verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType);
         verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType);
         verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType);
@@ -341,7 +343,7 @@ class CTester final : public ITester
         verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType);
         verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType);
         
-        // // Morton coordinate-wise signed inequality
+        // Morton coordinate-wise signed inequality
         verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
         verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
         verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
@@ -354,7 +356,7 @@ class CTester final : public ITester
         verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
         verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
         
-        // // Morton left-shift
+        // Morton left-shift
         verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
         verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType);
         verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType);
@@ -370,7 +372,7 @@ class CTester final : public ITester
         verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType);
         verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType);
         
-        // // Morton unsigned right-shift
+        // Morton unsigned right-shift
         verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType);
         verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType);
         verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType);
@@ -386,7 +388,7 @@ class CTester final : public ITester
         verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType);
         verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType);
         
-        // // Morton signed right-shift
+        // Morton signed right-shift
         verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType);
         verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType);
         verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType);
@@ -456,14 +458,14 @@ class CTester2 final : public ITester
                 expected.mortonSignedLess_emulated_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
 
                 uint16_t castedShift = uint16_t(generatedShift);
-                expected.mortonSignedRightShift_emulated_2 = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2A << uint64_t(castedShift % fullBits_2));
-                expected.mortonSignedRightShift_emulated_3 = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3A << uint64_t(castedShift % fullBits_3));
-                expected.mortonSignedRightShift_emulated_4 = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A << uint64_t(castedShift % fullBits_4));
+                expected.mortonSignedRightShift_emulated_2 = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2ASignedFull >> int32_t(castedShift % fullBits_2));
+                expected.mortonSignedRightShift_emulated_3 = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3ASignedFull >> int32_t(castedShift % fullBits_3));
+                expected.mortonSignedRightShift_emulated_4 = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4ASignedFull >> int16_t(castedShift % fullBits_4));
 
             }
 
             performCpuTests(testInput, expected);
-            // performGpuTests(testInput, expected);
+            performGpuTests(testInput, expected);
         }
         m_logger->log("SECOND TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
     }
@@ -495,10 +497,10 @@ class CTester2 final : public ITester
         verifyTestValue("mortonSignedLess_emulated_2", expectedTestValues.mortonSignedLess_emulated_2, testValues.mortonSignedLess_emulated_2, testType);
         verifyTestValue("mortonSignedLess_emulated_3", expectedTestValues.mortonSignedLess_emulated_3, testValues.mortonSignedLess_emulated_3, testType);
         verifyTestValue("mortonSignedLess_emulated_4", expectedTestValues.mortonSignedLess_emulated_4, testValues.mortonSignedLess_emulated_4, testType);
-        //
-        // verifyTestValue("mortonSignedRightShift_emulated_2", expectedTestValues.mortonSignedRightShift_emulated_2, testValues.mortonSignedRightShift_emulated_2, testType);
-        // verifyTestValue("mortonSignedRightShift_emulated_3", expectedTestValues.mortonSignedRightShift_emulated_3, testValues.mortonSignedRightShift_emulated_3, testType);
-        // verifyTestValue("mortonSignedRightShift_emulated_4", expectedTestValues.mortonSignedRightShift_emulated_4, testValues.mortonSignedRightShift_emulated_4, testType);
+        
+        verifyTestValue("mortonSignedRightShift_emulated_2", expectedTestValues.mortonSignedRightShift_emulated_2, testValues.mortonSignedRightShift_emulated_2, testType);
+        verifyTestValue("mortonSignedRightShift_emulated_3", expectedTestValues.mortonSignedRightShift_emulated_3, testValues.mortonSignedRightShift_emulated_3, testType);
+        verifyTestValue("mortonSignedRightShift_emulated_4", expectedTestValues.mortonSignedRightShift_emulated_4, testValues.mortonSignedRightShift_emulated_4, testType);
         
     }
 };
diff --git a/14_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
index 895728f26..ef75d6057 100644
--- a/14_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -19,8 +19,8 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4;
 NBL_CONSTEXPR uint16_t mediumBits_4 = 8;
 NBL_CONSTEXPR uint16_t fullBits_4 = 16;
 
-template <typename T, uint16_t Bits>
-NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR T bitMask = (uint64_t(1) << Bits) - 1;
+template <uint16_t Bits>
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint64_t bitMask = (uint64_t(1) << (Bits-1)) - 1;
 
 
 #ifndef __HLSL_VERSION
@@ -41,23 +41,27 @@ constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1;
 
 using namespace nbl::hlsl;
 template <typename T, bool Signed, uint16_t Bits>
-T createAnyBitIntegerFromU64(uint64_t val)
+NBL_CONSTEXPR_INLINE_FUNC T createAnyBitIntegerFromU64(uint64_t val)
 {
-  if(Signed && (_static_cast<int64_t>(val) < 0))
+  if(Signed)
   {
+    NBL_CONSTEXPR_FUNC_SCOPE_VAR uint64_t mask = (uint64_t(1) << (Bits - 1)) - 1;
     // fill excess bit with one
-    return T(val) | ~bitMask<T, Bits>;
+	if (int64_t(val) < 0)
+		return T(val) | ~mask;
+	else
+        return T(val) & mask;
   } else
   {
-    return T(val) & bitMask<T, Bits>;
-    
+    NBL_CONSTEXPR_FUNC_SCOPE_VAR uint64_t mask = (uint64_t(1) << Bits) - 1;
+    return T(val) & mask;
   }
 }
 
 template <typename T, bool Signed, uint16_t Bits, uint16_t D>
-vector<T, D> createAnyBitIntegerVecFromU64Vec(vector<uint64_t, D> val)
+NBL_CONSTEXPR_INLINE_FUNC vector<T, D> createAnyBitIntegerVecFromU64Vec(vector<uint64_t, D> val)
 {
-    array_get<portable_vector_t<T, D>, T> getter;
+    array_get<portable_vector_t<uint64_t, D>, uint64_t> getter;
     array_set<portable_vector_t<T, D>, T> setter;
 	vector<T, D> output;
     NBL_UNROLL
@@ -69,7 +73,7 @@ vector<T, D> createAnyBitIntegerVecFromU64Vec(vector<uint64_t, D> val)
 }
 
 template <bool Signed, uint16_t Bits, uint16_t D, typename _uint64_t = uint64_t>
-morton::code<Signed, Bits, D, _uint64_t> createMortonFromU64Vec(const vector<uint64_t, D> vec)
+NBL_CONSTEXPR_INLINE_FUNC morton::code<Signed, Bits, D, _uint64_t> createMortonFromU64Vec(const vector<uint64_t, D> vec)
 {
 	using morton_code_t = morton::code<Signed, Bits, D, _uint64_t>;
 	using decode_component_t = typename morton_code_t::decode_component_t;
diff --git a/14_Mortons/app_resources/testCommon.hlsl b/14_Mortons/app_resources/testCommon.hlsl
index 6e9051c1b..f068b474b 100644
--- a/14_Mortons/app_resources/testCommon.hlsl
+++ b/14_Mortons/app_resources/testCommon.hlsl
@@ -40,6 +40,48 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
 	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
 
+	uint16_t2 Vec2ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_2, 2>(Vec2A);
+	uint16_t2 Vec2BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_2, 2>(Vec2B);
+	uint16_t2 Vec2AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_2, 2>(Vec2A);
+	uint16_t2 Vec2BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_2, 2>(Vec2B);
+	uint32_t2 Vec2AFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_2, 2>(Vec2A);
+  	uint32_t2 Vec2BFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_2, 2>(Vec2B);
+
+	uint16_t3 Vec3ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_3, 3>(Vec3A);
+	uint16_t3 Vec3BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_3, 3>(Vec3B);
+	uint16_t3 Vec3AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_3, 3>(Vec3A);
+	uint16_t3 Vec3BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_3, 3>(Vec3B);
+	uint32_t3 Vec3AFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_3, 3>(Vec3A);
+	uint32_t3 Vec3BFull = createAnyBitIntegerVecFromU64Vec<uint32_t, false, fullBits_3, 3>(Vec3B);
+
+	uint16_t4 Vec4ASmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_4, 4>(Vec4A);
+	uint16_t4 Vec4BSmall = createAnyBitIntegerVecFromU64Vec<uint16_t, false, smallBits_4, 4>(Vec4B);
+	uint16_t4 Vec4AMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_4, 4>(Vec4A);
+	uint16_t4 Vec4BMedium = createAnyBitIntegerVecFromU64Vec<uint16_t, false, mediumBits_4, 4>(Vec4B);
+	uint16_t4 Vec4AFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4, 4>(Vec4A);
+	uint16_t4 Vec4BFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4, 4>(Vec4B);
+
+	int16_t2 Vec2ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_2, 2>(Vec2A);
+	int16_t2 Vec2BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_2, 2>(Vec2B);
+	int16_t2 Vec2ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true,mediumBits_2, 2 >(Vec2A);
+	int16_t2 Vec2BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_2, 2>(Vec2B);
+	int32_t2 Vec2ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2, 2>(Vec2A);
+	int32_t2 Vec2BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2, 2>(Vec2B);
+
+	int16_t3 Vec3ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_3, 3>(Vec3A);
+	int16_t3 Vec3BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_3, 3>(Vec3B);
+	int16_t3 Vec3ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_3, 3>(Vec3A);
+	int16_t3 Vec3BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_3, 3>(Vec3B);
+	int32_t3 Vec3ASignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3, 3>(Vec3A);
+	int32_t3 Vec3BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3, 3>(Vec3B);
+
+	int16_t4 Vec4ASignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_4, 4>(Vec4A);
+	int16_t4 Vec4BSignedSmall = createAnyBitIntegerVecFromU64Vec<int16_t, true, smallBits_4, 4>(Vec4B);
+	int16_t4 Vec4ASignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_4, 4>(Vec4A);
+	int16_t4 Vec4BSignedMedium = createAnyBitIntegerVecFromU64Vec<int16_t, true, mediumBits_4, 4>(Vec4B);
+	int16_t4 Vec4ASignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4, 4>(Vec4A);
+	int16_t4 Vec4BSignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4, 4>(Vec4B);
+
 	morton::code<false, smallBits_2, 2> morton_small_2A = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2A);
 	morton::code<false, mediumBits_2, 2> morton_medium_2A = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2A);
 	morton::code<false, fullBits_2, 2> morton_full_2A = createMortonFromU64Vec<false, fullBits_2, 2>(Vec2A);
@@ -115,48 +157,48 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B;
 	
 	// Coordinate-wise equality
-	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(uint16_t2(Vec2B)));
-	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(uint16_t2(Vec2B)));
-	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(uint32_t2(Vec2B)));
-	output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(uint32_t2(Vec2B)));
+	output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal<false>(Vec2BSmall));
+	output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal<false>(Vec2BMedium));
+	output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal<false>(Vec2BFull));
+	output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal<false>(Vec2BFull));
 	
-	output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(uint16_t3(Vec3B)));
-	output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(uint16_t3(Vec3B)));
-	output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(uint32_t3(Vec3B)));
-	output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(uint32_t3(Vec3B)));
+	output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal<false>(Vec3BSmall));
+	output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal<false>(Vec3BMedium));
+	output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal<false>(Vec3BFull));
+	output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal<false>(Vec3BFull));
 	
-	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(uint16_t4(Vec4B)));
-	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(uint16_t4(Vec4B)));
-	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(uint16_t4(Vec4B)));
-    output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(uint16_t4(Vec4B)));
+	output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal<false>(Vec4BSmall));
+	output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal<false>(Vec4BMedium));
+	output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal<false>(Vec4BFull));
+    output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal<false>(Vec4BFull));
 	
 	// Coordinate-wise unsigned inequality (just testing with less)
-	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(uint16_t2(Vec2B)));
-	output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(uint16_t2(Vec2B)));
-	output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(uint32_t2(Vec2B)));
-	output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(uint32_t2(Vec2B)));
+	output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan<false>(Vec2BSmall));
+	output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan<false>(Vec2BMedium));
+	output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan<false>(Vec2BFull));
+	output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan<false>(Vec2BFull));
 	
-	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(uint16_t3(Vec3B)));
-	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(uint16_t3(Vec3B)));
-	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(uint32_t3(Vec3B)));
-	output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(uint32_t3(Vec3B)));
+	output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan<false>(Vec3BSmall));
+	output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan<false>(Vec3BMedium));
+	output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan<false>(Vec3BFull));
+	output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan<false>(Vec3BFull));
 	
-	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(uint16_t4(Vec4B)));
-	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(uint16_t4(Vec4B)));
-	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(Vec4BSmall));
+	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(Vec4BMedium));
+	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(Vec4BFull));
 	
 	// Coordinate-wise signed inequality
-	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(int16_t2(Vec2B)));
-	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(int16_t2(Vec2B)));
-	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(int32_t2(Vec2B)));
+	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(Vec2BSignedSmall));
+	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(Vec2BSignedMedium));
+	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(Vec2BSignedFull));
 	
-	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(int16_t3(Vec3B)));
-	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(int16_t3(Vec3B)));
-	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(int32_t3(Vec3B)));
+	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(Vec3BSignedSmall));
+	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(Vec3BSignedMedium));
+	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(Vec3BSignedFull));
 	
-	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(int16_t4(Vec4B)));
-	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(int16_t4(Vec4B)));
-	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(int16_t4(Vec4B)));
+	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(Vec4BSignedSmall));
+	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(Vec4BSignedMedium));
+	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(Vec4BSignedFull));
 	
 	// Cast to uint16_t which is what left shift for Mortons expect
 	uint16_t castedShift = uint16_t(input.shift);
diff --git a/14_Mortons/app_resources/testCommon2.hlsl b/14_Mortons/app_resources/testCommon2.hlsl
index e7eced852..365b82340 100644
--- a/14_Mortons/app_resources/testCommon2.hlsl
+++ b/14_Mortons/app_resources/testCommon2.hlsl
@@ -11,11 +11,10 @@ void fillTestValues2(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestV
 	uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW };
 	uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX };
 
-	int32_t2 Vec2BSigned = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2, 2>(Vec2B);
-
-	int32_t3 Vec3BSigned = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3, 3>(Vec3B);
-
-	int16_t4 Vec4BSigned = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4, 4>(Vec4B);
+	uint16_t4 Vec4BFull = createAnyBitIntegerVecFromU64Vec<uint16_t, false, fullBits_4, 4>(Vec4B);
+	int32_t2 Vec2BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_2, 2>(Vec2B);
+	int32_t3 Vec3BSignedFull = createAnyBitIntegerVecFromU64Vec<int32_t, true, fullBits_3, 3>(Vec3B);
+	int16_t4 Vec4BSignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4, 4>(Vec4B);
 
 	morton::code<false, fullBits_4, 4, emulated_uint64_t> morton_emulated_4A = createMortonFromU64Vec<false, fullBits_4, 4, emulated_uint64_t>(Vec4A);
 	morton::code<true, fullBits_2, 2, emulated_uint64_t> morton_emulated_2_signed = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2A);
@@ -23,11 +22,11 @@ void fillTestValues2(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestV
 	morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A);
 
 	
-	output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(uint16_t4(Vec4B)));
+	output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(Vec4BFull));
 	
-	output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(Vec2BSigned)); 
-	output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(Vec3BSigned)); 
-	output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(Vec4BSigned)); 
+	output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(Vec2BSignedFull)); 
+	output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(Vec3BSignedFull)); 
+	output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(Vec4BSignedFull)); 
 
 	uint16_t castedShift = uint16_t(input.shift);
 

From 6a7b003798bb894d36be63609fb987dd20fccaa3 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 11 Dec 2025 00:27:20 +0700
Subject: [PATCH 42/57] Remove unnecessary code

---
 14_Mortons/app_resources/common.hlsl | 20 --------------------
 14_Mortons/main.cpp                  |  8 ++++----
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/14_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
index ef75d6057..980bb0c32 100644
--- a/14_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -19,26 +19,6 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4;
 NBL_CONSTEXPR uint16_t mediumBits_4 = 8;
 NBL_CONSTEXPR uint16_t fullBits_4 = 16;
 
-template <uint16_t Bits>
-NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint64_t bitMask = (uint64_t(1) << (Bits-1)) - 1;
-
-
-#ifndef __HLSL_VERSION
-
-constexpr uint64_t smallBitsMask_2 = (uint64_t(1) << smallBits_2) - 1;
-constexpr uint64_t mediumBitsMask_2 = (uint64_t(1) << mediumBits_2) - 1;
-constexpr uint64_t fullBitsMask_2 = (uint64_t(1) << fullBits_2) - 1;
-
-constexpr uint64_t smallBitsMask_3 = (uint64_t(1) << smallBits_3) - 1;
-constexpr uint64_t mediumBitsMask_3 = (uint64_t(1) << mediumBits_3) - 1;
-constexpr uint64_t fullBitsMask_3 = (uint64_t(1) << fullBits_3) - 1;
-
-constexpr uint64_t smallBitsMask_4 = (uint64_t(1) << smallBits_4) - 1;
-constexpr uint64_t mediumBitsMask_4 = (uint64_t(1) << mediumBits_4) - 1;
-constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1;
-
-#endif
-
 using namespace nbl::hlsl;
 template <typename T, bool Signed, uint16_t Bits>
 NBL_CONSTEXPR_INLINE_FUNC T createAnyBitIntegerFromU64(uint64_t val)
diff --git a/14_Mortons/main.cpp b/14_Mortons/main.cpp
index bd4653f7c..12f55805f 100644
--- a/14_Mortons/main.cpp
+++ b/14_Mortons/main.cpp
@@ -45,10 +45,10 @@ class MortonTest final : public MonoDeviceApplication, public BuiltinResourcesAp
         // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator
         // Blocked by https://github.com/KhronosGroup/SPIRV-Tools/issues/6104
         {
-            // CTester mortonTester;
-            // pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";
-            // mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
-            // mortonTester.performTests();
+            CTester mortonTester;
+            pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";
+            mortonTester.setupPipeline<InputTestValues, TestValues>(pplnSetupData);
+            mortonTester.performTests();
 
             CTester2 mortonTester2;
             pplnSetupData.testShaderPath = "app_resources/test2.comp.hlsl";

From f012a1af45dc0fd9240e342d185a3f1d6e4a2dc3 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 11 Dec 2025 00:44:24 +0700
Subject: [PATCH 43/57] Add some comment for the reason we have to CTester

---
 14_Mortons/CTester.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
index ff83c02cc..340e405d1 100644
--- a/14_Mortons/CTester.h
+++ b/14_Mortons/CTester.h
@@ -403,6 +403,7 @@ class CTester final : public ITester
     }
 };
 
+// Some hlsl code will result in compilation error if mixed together due to some bug in dxc. So we separate them into multiple shader compilation and test.
 class CTester2 final : public ITester
 {
 public:

From f415e8c09150a1643945112cb5c37cb1df3acf69 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 11 Dec 2025 00:48:53 +0700
Subject: [PATCH 44/57] Remove dummy code

---
 14_Mortons/CTester.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
index 340e405d1..447ceb18a 100644
--- a/14_Mortons/CTester.h
+++ b/14_Mortons/CTester.h
@@ -113,8 +113,6 @@ class CTester final : public ITester
                 int16_t4 Vec4ASignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4A);
                 int16_t4 Vec4BSignedFull = createAnyBitIntegerVecFromU64Vec<int16_t, true, fullBits_4>(Vec4B);
 
-                const auto dummy1 = morton::code<true, smallBits_2, 2>(Vec2ASignedSmall);
-                const auto dummy2 = createMortonFromU64Vec<true, smallBits_2, 2>(Vec2A);
                 // Plus
                 expected.mortonPlus_small_2 = createMortonFromU64Vec<false, smallBits_2, 2>(Vec2ASmall + Vec2BSmall);
                 expected.mortonPlus_medium_2 = createMortonFromU64Vec<false, mediumBits_2, 2>(Vec2AMedium + Vec2BMedium);

From 8f72b9ecda1a2e39a58b4fc43fc6d9a025f80728 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Thu, 11 Dec 2025 16:18:56 +0700
Subject: [PATCH 45/57] Fix compiler warning for shader compilation

---
 14_Mortons/app_resources/common.hlsl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/14_Mortons/app_resources/common.hlsl b/14_Mortons/app_resources/common.hlsl
index 980bb0c32..98e5e1342 100644
--- a/14_Mortons/app_resources/common.hlsl
+++ b/14_Mortons/app_resources/common.hlsl
@@ -27,14 +27,14 @@ NBL_CONSTEXPR_INLINE_FUNC T createAnyBitIntegerFromU64(uint64_t val)
   {
     NBL_CONSTEXPR_FUNC_SCOPE_VAR uint64_t mask = (uint64_t(1) << (Bits - 1)) - 1;
     // fill excess bit with one
-	if (int64_t(val) < 0)
-		return T(val) | ~mask;
+	if (_static_cast<int64_t>(val) < 0)
+		return _static_cast<T>(val | ~mask);
 	else
-        return T(val) & mask;
+        return _static_cast<T>(val & mask);
   } else
   {
     NBL_CONSTEXPR_FUNC_SCOPE_VAR uint64_t mask = (uint64_t(1) << Bits) - 1;
-    return T(val) & mask;
+    return _static_cast<T>(val & mask);
   }
 }
 

From 3042409a14c7e69e8e63191b5c1b996e863a7cda Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 12 Dec 2025 18:02:46 +0700
Subject: [PATCH 46/57] Add back second test to first in commented form

---
 14_Mortons/CTester.h                     | 15 +++++++++++++++
 14_Mortons/app_resources/testCommon.hlsl | 17 ++++++++++++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/14_Mortons/CTester.h b/14_Mortons/CTester.h
index 447ceb18a..6933e77e5 100644
--- a/14_Mortons/CTester.h
+++ b/14_Mortons/CTester.h
@@ -175,19 +175,23 @@ class CTester final : public ITester
                 expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall));
                 expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium));
                 expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
+                expected.mortonUnsignedLess_emulated_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull));
 
                 // Coordinate-wise signed inequality
                 expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall));
                 expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium));
                 expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
+                expected.mortonSignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull));
 
                 expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall));
                 expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium));
                 expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
+                expected.mortonSignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull));
 
                 expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall));
                 expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium));
                 expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
+                expected.mortonSignedLess_emulated_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull));
 
                 uint16_t castedShift = uint16_t(generatedShift);
                 // Left-shift
@@ -226,14 +230,17 @@ class CTester final : public ITester
                 expected.mortonSignedRightShift_small_2 = morton::code<true, smallBits_2, 2>::create(Vec2ASignedSmall >> int16_t(castedShift % smallBits_2));
                 expected.mortonSignedRightShift_medium_2 = morton::code<true, mediumBits_2, 2>::create(Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2));
                 expected.mortonSignedRightShift_full_2 = morton::code<true, fullBits_2, 2>::create(Vec2ASignedFull >> int32_t(castedShift % fullBits_2));
+                expected.mortonSignedRightShift_emulated_2 = createMortonFromU64Vec<true, fullBits_2, 2, emulated_uint64_t>(Vec2ASignedFull >> int32_t(castedShift % fullBits_2));
                 
                 expected.mortonSignedRightShift_small_3 = morton::code<true, smallBits_3, 3>::create(Vec3ASignedSmall >> int16_t(castedShift % smallBits_3));
                 expected.mortonSignedRightShift_medium_3 = morton::code<true, mediumBits_3, 3>::create(Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3));
                 expected.mortonSignedRightShift_full_3 = morton::code<true, fullBits_3, 3>::create(Vec3ASignedFull >> int32_t(castedShift % fullBits_3));
+                expected.mortonSignedRightShift_emulated_3 = createMortonFromU64Vec<true, fullBits_3, 3, emulated_uint64_t>(Vec3ASignedFull >> int32_t(castedShift % fullBits_3));
                 
                 expected.mortonSignedRightShift_small_4 = morton::code<true, smallBits_4, 4>::create(Vec4ASignedSmall >> int16_t(castedShift % smallBits_4));
                 expected.mortonSignedRightShift_medium_4 = morton::code<true, mediumBits_4, 4>::create(Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4));
                 expected.mortonSignedRightShift_full_4 = morton::code<true, fullBits_4, 4>::create(Vec4ASignedFull >> int16_t(castedShift % fullBits_4));
+                expected.mortonSignedRightShift_emulated_4 = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4ASignedFull >> int16_t(castedShift % fullBits_4));
             }
 
             performCpuTests(testInput, expected);
@@ -263,6 +270,7 @@ class CTester final : public ITester
 
     void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType)
     {
+        // Some verification is commented out and moved to CTester2 due to bug in dxc. Uncomment them when the bug is fixed.
         verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType);
         verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType);
         verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType);
@@ -340,19 +348,23 @@ class CTester final : public ITester
         verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType);
         verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType);
         verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType);
+        // verifyTestValue("mortonUnsignedLess_emulated_4", expectedTestValues.mortonUnsignedLess_emulated_4, testValues.mortonUnsignedLess_emulated_4, testType);
         
         // Morton coordinate-wise signed inequality
         verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType);
         verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType);
         verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType);
+        // verifyTestValue("mortonSignedLess_emulated_2", expectedTestValues.mortonSignedLess_emulated_2, testValues.mortonSignedLess_emulated_2, testType);
         
         verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType);
         verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType);
         verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType);
+        // verifyTestValue("mortonSignedLess_emulated_3", expectedTestValues.mortonSignedLess_emulated_3, testValues.mortonSignedLess_emulated_3, testType);
         
         verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType);
         verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType);
         verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType);
+        // verifyTestValue("mortonSignedLess_emulated_4", expectedTestValues.mortonSignedLess_emulated_4, testValues.mortonSignedLess_emulated_4, testType);
         
         // Morton left-shift
         verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType);
@@ -390,14 +402,17 @@ class CTester final : public ITester
         verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType);
         verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType);
         verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType);
+        // verifyTestValue("mortonSignedRightShift_emulated_2", expectedTestValues.mortonSignedRightShift_emulated_2, testValues.mortonSignedRightShift_emulated_2, testType);
         
         verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType);
         verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType);
         verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType);
+        //verifyTestValue("mortonSignedRightShift_emulated_3", expectedTestValues.mortonSignedRightShift_emulated_3, testValues.mortonSignedRightShift_emulated_3, testType);
         
         verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType);
         verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType);
         verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType);
+        // verifyTestValue("mortonSignedRightShift_emulated_4", expectedTestValues.mortonSignedRightShift_emulated_4, testValues.mortonSignedRightShift_emulated_4, testType);
     }
 };
 
diff --git a/14_Mortons/app_resources/testCommon.hlsl b/14_Mortons/app_resources/testCommon.hlsl
index f068b474b..6144b6ce9 100644
--- a/14_Mortons/app_resources/testCommon.hlsl
+++ b/14_Mortons/app_resources/testCommon.hlsl
@@ -123,7 +123,8 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	morton::code<true, mediumBits_4, 4> morton_medium_4_signed = createMortonFromU64Vec<true, mediumBits_4, 4>(Vec4A);
 	morton::code<true, fullBits_4, 4> morton_full_4_signed = createMortonFromU64Vec<true, fullBits_4, 4>(Vec4A);
 	morton::code<true, fullBits_4, 4, emulated_uint64_t> morton_emulated_4_signed = createMortonFromU64Vec<true, fullBits_4, 4, emulated_uint64_t>(Vec4A);
-	
+
+    // Some test and operation is moved to testCommon2.hlsl due to dxc bug that cause compilation failure. Uncomment when the bug is fixed.
 	// Plus
 	output.mortonPlus_small_2 = morton_small_2A + morton_small_2B;
 	output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B;
@@ -186,19 +187,23 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan<false>(Vec4BSmall));
 	output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan<false>(Vec4BMedium));
 	output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan<false>(Vec4BFull));
+	// output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan<false>(Vec4BFull));
 	
 	// Coordinate-wise signed inequality
 	output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan<false>(Vec2BSignedSmall));
 	output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan<false>(Vec2BSignedMedium));
 	output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan<false>(Vec2BSignedFull));
+	// output.mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2_signed.lessThan<false>(Vec2BSignedFull)); 
 	
 	output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan<false>(Vec3BSignedSmall));
 	output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan<false>(Vec3BSignedMedium));
 	output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan<false>(Vec3BSignedFull));
+	// output.mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan<false>(Vec3BSignedFull)); 
 	
 	output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan<false>(Vec4BSignedSmall));
 	output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan<false>(Vec4BSignedMedium));
 	output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan<false>(Vec4BSignedFull));
+	// output.mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan<false>(Vec4BSignedFull)); 
 	
 	// Cast to uint16_t which is what left shift for Mortons expect
 	uint16_t castedShift = uint16_t(input.shift);
@@ -231,7 +236,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	left_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > leftShiftEmulated4;
 	output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
 	
-	// // Unsigned right-shift
+	// Unsigned right-shift
 	arithmetic_right_shift_operator<morton::code<false, smallBits_2, 2> > rightShiftSmall2;
 	output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2);
 	arithmetic_right_shift_operator<morton::code<false, mediumBits_2, 2> > rightShiftMedium2;
@@ -259,13 +264,15 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	arithmetic_right_shift_operator<morton::code<false, fullBits_4, 4, emulated_uint64_t> > rightShiftEmulated4;
 	output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4);
 	
-	// // Signed right-shift
+	// Signed right-shift
 	arithmetic_right_shift_operator<morton::code<true, smallBits_2, 2> > rightShiftSignedSmall2;
 	output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2);
 	arithmetic_right_shift_operator<morton::code<true, mediumBits_2, 2> > rightShiftSignedMedium2;
 	output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2);
 	arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2> > rightShiftSignedFull2;
 	output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2);
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_2, 2, emulated_uint64_t> > rightShiftSignedEmulated2;
+	// output.mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift % fullBits_2); 
 	
 	arithmetic_right_shift_operator<morton::code<true, smallBits_3, 3> > rightShiftSignedSmall3;
 	output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3);
@@ -273,6 +280,8 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3);
 	arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3> > rightShiftSignedFull3;
 	output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3);
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_3, 3, emulated_uint64_t> > rightShiftSignedEmulated3;
+	// output.mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift % fullBits_3); 
 	
 	arithmetic_right_shift_operator<morton::code<true, smallBits_4, 4> > rightShiftSignedSmall4;
 	output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4);
@@ -280,5 +289,7 @@ void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestVa
 	output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4);
 	arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4> > rightShiftSignedFull4;
 	output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4);
+	// arithmetic_right_shift_operator<morton::code<true, fullBits_4, 4, emulated_uint64_t> > rightShiftSignedEmulated4;
+	// output.mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift % fullBits_4); 
 
 }
\ No newline at end of file

From 7011ea0c5787518d5fe72977bc1af61eeccefd1d Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 12 Dec 2025 23:41:52 +0700
Subject: [PATCH 47/57] Fix example 28 to use select instead of ternary_op

---
 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl     | 12 +++++-------
 .../app_resources/kernel_fft_second_axis.hlsl        | 10 ++++------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index 07c2ec8cf..ffb405eef 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -68,8 +68,6 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		// This one shows up a lot so we give it a name
 		const bool oddThread = glsl::gl_SubgroupInvocationID() & 1u;
 
-		ternary_operator<complex_t<scalar_t> > ternaryOp;
-
 		// Since every two consecutive columns are stored as one packed column, we divide the index by 2 to get the index of that packed column
 		const uint32_t firstIndex = workgroup::SubgroupContiguousIndex() / 2;
 		int32_t paddedIndex = int32_t(firstIndex) - pushConstants.halfPadding;
@@ -93,17 +91,17 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 
 			if (glsl::gl_WorkGroupID().x)
 			{
-				complex_t<scalar_t> lo = ternaryOp(oddThread, otherThreadLoOrHi, loOrHi);
-				complex_t<scalar_t> hi = ternaryOp(oddThread, loOrHi, otherThreadLoOrHi);
+				complex_t<scalar_t> lo = select(oddThread, otherThreadLoOrHi, loOrHi);
+				complex_t<scalar_t> hi = select(oddThread, loOrHi, otherThreadLoOrHi);
 				fft::unpack<scalar_t>(lo, hi);
 
 				// --------------------------------------------------- MIRROR PADDING -------------------------------------------------------------------------------------------
 				#ifdef MIRROR_PADDING
-				preloaded[localElementIndex] = ternaryOp(oddThread ^ invert, hi, lo);
+				preloaded[localElementIndex] = select(oddThread ^ invert, hi, lo);
 				// ----------------------------------------------------- ZERO PADDING -------------------------------------------------------------------------------------------
 				#else
 				const complex_t<scalar_t> Zero = { scalar_t(0), scalar_t(0) };
-				preloaded[localElementIndex] = ternaryOp(invert, Zero, ternaryOp(oddThread, hi, lo));
+				preloaded[localElementIndex] = select(invert, Zero, select(oddThread, hi, lo));
 				#endif
 				// ------------------------------------------------ END PADDING DIVERGENCE ----------------------------------------------------------------------------------------
 			}
@@ -116,7 +114,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 				// Odd thread writes `hi = Z1 + iN1`
 				const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-				preloaded[localElementIndex] = ternaryOp(oddThread ^ invert, oddThreadHi, evenThreadLo);
+				preloaded[localElementIndex] = select(oddThread ^ invert, oddThreadHi, evenThreadLo);
 			}
 			paddedIndex += WorkgroupSize / 2;
 		}
diff --git a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
index eaecb5d0f..a1e5a76cd 100644
--- a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
+++ b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
@@ -46,8 +46,6 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 		// This one shows up a lot so we give it a name
 		const bool oddThread = glsl::gl_SubgroupInvocationID() & 1u;
 
-		ternary_operator<complex_t<scalar_t> > ternaryOp;
-
 		if (glsl::gl_WorkGroupID().x)
 		{
 			// Even thread must index a y corresponding to an even element of the previous FFT pass, and the odd thread must index its DFT Mirror
@@ -72,10 +70,10 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const vector <scalar_t, 2> loOrHiVector = vector <scalar_t, 2>(loOrHi.real(), loOrHi.imag());
 					const vector <scalar_t, 2> otherThreadloOrHiVector = glsl::subgroupShuffleXor< vector <scalar_t, 2> >(loOrHiVector, 1u);
 					const complex_t<scalar_t> otherThreadLoOrHi = { otherThreadloOrHiVector.x, otherThreadloOrHiVector.y };
-					complex_t<scalar_t> lo = ternaryOp(oddThread, otherThreadLoOrHi, loOrHi);
-					complex_t<scalar_t> hi = ternaryOp(oddThread, loOrHi, otherThreadLoOrHi);
+					complex_t<scalar_t> lo = select(oddThread, otherThreadLoOrHi, loOrHi);
+					complex_t<scalar_t> hi = select(oddThread, loOrHi, otherThreadLoOrHi);
 					fft::unpack<scalar_t>(lo, hi);
-					preloaded[channel][localElementIndex] = ternaryOp(oddThread, hi, lo);
+					preloaded[channel][localElementIndex] = select(oddThread, hi, lo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}
@@ -112,7 +110,7 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 					// Odd thread writes `hi = Z1 + iN1`
 					const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-					preloaded[channel][localElementIndex] = ternaryOp(oddThread, oddThreadHi, evenThreadLo);
+					preloaded[channel][localElementIndex] = select(oddThread, oddThreadHi, evenThreadLo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}

From 02eed2e1c81446c9f757f1fc7dc3a283abeabf23 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Fri, 12 Dec 2025 23:58:34 +0700
Subject: [PATCH 48/57] Fix example 28

---
 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index ffb405eef..1b8a4c076 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -97,7 +97,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 
 				// --------------------------------------------------- MIRROR PADDING -------------------------------------------------------------------------------------------
 				#ifdef MIRROR_PADDING
-				preloaded[localElementIndex] = select(oddThread ^ invert, hi, lo);
+				preloaded[localElementIndex] = select(_static_cast<bool>(oddThread ^ invert), hi, lo);
 				// ----------------------------------------------------- ZERO PADDING -------------------------------------------------------------------------------------------
 				#else
 				const complex_t<scalar_t> Zero = { scalar_t(0), scalar_t(0) };
@@ -114,7 +114,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 				// Odd thread writes `hi = Z1 + iN1`
 				const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-				preloaded[localElementIndex] = select(oddThread ^ invert, oddThreadHi, evenThreadLo);
+				preloaded[localElementIndex] = select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
 			}
 			paddedIndex += WorkgroupSize / 2;
 		}

From 30b4f52c17cf8f6ae2aaae9b846aa1048954cd16 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 13 Dec 2025 02:26:15 +0700
Subject: [PATCH 49/57] prefix select with hlsl::

---
 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl      | 10 +++++-----
 28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index 1b8a4c076..61a819992 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -91,17 +91,17 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 
 			if (glsl::gl_WorkGroupID().x)
 			{
-				complex_t<scalar_t> lo = select(oddThread, otherThreadLoOrHi, loOrHi);
-				complex_t<scalar_t> hi = select(oddThread, loOrHi, otherThreadLoOrHi);
+				complex_t<scalar_t> lo = hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
+				complex_t<scalar_t> hi = hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
 				fft::unpack<scalar_t>(lo, hi);
 
 				// --------------------------------------------------- MIRROR PADDING -------------------------------------------------------------------------------------------
 				#ifdef MIRROR_PADDING
-				preloaded[localElementIndex] = select(_static_cast<bool>(oddThread ^ invert), hi, lo);
+				preloaded[localElementIndex] = hlsl::select(_static_cast<bool>(oddThread ^ invert), hi, lo);
 				// ----------------------------------------------------- ZERO PADDING -------------------------------------------------------------------------------------------
 				#else
 				const complex_t<scalar_t> Zero = { scalar_t(0), scalar_t(0) };
-				preloaded[localElementIndex] = select(invert, Zero, select(oddThread, hi, lo));
+				preloaded[localElementIndex] = hlsl::select(invert, Zero, hlsl::select(oddThread, hi, lo));
 				#endif
 				// ------------------------------------------------ END PADDING DIVERGENCE ----------------------------------------------------------------------------------------
 			}
@@ -114,7 +114,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 				// Odd thread writes `hi = Z1 + iN1`
 				const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-				preloaded[localElementIndex] = select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
+				preloaded[localElementIndex] = hlsl::select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
 			}
 			paddedIndex += WorkgroupSize / 2;
 		}
diff --git a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
index a1e5a76cd..6276ed02e 100644
--- a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
+++ b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
@@ -70,10 +70,10 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const vector <scalar_t, 2> loOrHiVector = vector <scalar_t, 2>(loOrHi.real(), loOrHi.imag());
 					const vector <scalar_t, 2> otherThreadloOrHiVector = glsl::subgroupShuffleXor< vector <scalar_t, 2> >(loOrHiVector, 1u);
 					const complex_t<scalar_t> otherThreadLoOrHi = { otherThreadloOrHiVector.x, otherThreadloOrHiVector.y };
-					complex_t<scalar_t> lo = select(oddThread, otherThreadLoOrHi, loOrHi);
-					complex_t<scalar_t> hi = select(oddThread, loOrHi, otherThreadLoOrHi);
+					complex_t<scalar_t> lo = hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
+					complex_t<scalar_t> hi = hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
 					fft::unpack<scalar_t>(lo, hi);
-					preloaded[channel][localElementIndex] = select(oddThread, hi, lo);
+					preloaded[channel][localElementIndex] = hlsl::select(oddThread, hi, lo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}
@@ -110,7 +110,7 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 					// Odd thread writes `hi = Z1 + iN1`
 					const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-					preloaded[channel][localElementIndex] = select(oddThread, oddThreadHi, evenThreadLo);
+					preloaded[channel][localElementIndex] = hlsl::select(oddThread, oddThreadHi, evenThreadLo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}

From 3e443b12e4511240fa783cc4bab6291c0d115ed9 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Sat, 13 Dec 2025 02:52:17 +0700
Subject: [PATCH 50/57] Add nbl prefix to hlsl::select

---
 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl      | 10 +++++-----
 28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index 61a819992..a0c1133cc 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -91,17 +91,17 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 
 			if (glsl::gl_WorkGroupID().x)
 			{
-				complex_t<scalar_t> lo = hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
-				complex_t<scalar_t> hi = hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
+				complex_t<scalar_t> lo = nbl::hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
+				complex_t<scalar_t> hi = nbl::hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
 				fft::unpack<scalar_t>(lo, hi);
 
 				// --------------------------------------------------- MIRROR PADDING -------------------------------------------------------------------------------------------
 				#ifdef MIRROR_PADDING
-				preloaded[localElementIndex] = hlsl::select(_static_cast<bool>(oddThread ^ invert), hi, lo);
+				preloaded[localElementIndex] = nbl::hlsl::select(_static_cast<bool>(oddThread ^ invert), hi, lo);
 				// ----------------------------------------------------- ZERO PADDING -------------------------------------------------------------------------------------------
 				#else
 				const complex_t<scalar_t> Zero = { scalar_t(0), scalar_t(0) };
-				preloaded[localElementIndex] = hlsl::select(invert, Zero, hlsl::select(oddThread, hi, lo));
+				preloaded[localElementIndex] = nbl::hlsl::select(invert, Zero, nbl::hlsl::select(oddThread, hi, lo));
 				#endif
 				// ------------------------------------------------ END PADDING DIVERGENCE ----------------------------------------------------------------------------------------
 			}
@@ -114,7 +114,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 				// Odd thread writes `hi = Z1 + iN1`
 				const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-				preloaded[localElementIndex] = hlsl::select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
+				preloaded[localElementIndex] = nbl::hlsl::select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
 			}
 			paddedIndex += WorkgroupSize / 2;
 		}
diff --git a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
index 6276ed02e..eca81e859 100644
--- a/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
+++ b/28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
@@ -70,10 +70,10 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const vector <scalar_t, 2> loOrHiVector = vector <scalar_t, 2>(loOrHi.real(), loOrHi.imag());
 					const vector <scalar_t, 2> otherThreadloOrHiVector = glsl::subgroupShuffleXor< vector <scalar_t, 2> >(loOrHiVector, 1u);
 					const complex_t<scalar_t> otherThreadLoOrHi = { otherThreadloOrHiVector.x, otherThreadloOrHiVector.y };
-					complex_t<scalar_t> lo = hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
-					complex_t<scalar_t> hi = hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
+					complex_t<scalar_t> lo = nbl::hlsl::select(oddThread, otherThreadLoOrHi, loOrHi);
+					complex_t<scalar_t> hi = nbl::hlsl::select(oddThread, loOrHi, otherThreadLoOrHi);
 					fft::unpack<scalar_t>(lo, hi);
-					preloaded[channel][localElementIndex] = hlsl::select(oddThread, hi, lo);
+					preloaded[channel][localElementIndex] = nbl::hlsl::select(oddThread, hi, lo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}
@@ -110,7 +110,7 @@ struct PreloadedSecondAxisAccessor : MultiChannelPreloadedAccessorMirrorTradeBas
 					const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 					// Odd thread writes `hi = Z1 + iN1`
 					const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-					preloaded[channel][localElementIndex] = hlsl::select(oddThread, oddThreadHi, evenThreadLo);
+					preloaded[channel][localElementIndex] = nbl::hlsl::select(oddThread, oddThreadHi, evenThreadLo);
 
 					packedColumnIndex += WorkgroupSize / 2;
 				}

From 07d0197eff60cae99ff226825b942ec64b3504d7 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Fri, 12 Dec 2025 21:29:55 -0300
Subject: [PATCH 51/57] Patch for semantic clarity, remove usage of hlsl
 keyword named variable

---
 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
index a0c1133cc..02ae4ff40 100644
--- a/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+++ b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
@@ -80,7 +80,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 		{
 			// If mirrored, we need to invert which thread is loading lo and which is loading hi
 			// If using zero-padding, useful to find out if we're outside of [0,1) bounds
-			bool invert = paddedIndex < 0 || paddedIndex >= pushConstants.imageHalfRowLength;
+			bool inPadding = paddedIndex < 0 || paddedIndex >= pushConstants.imageHalfRowLength;
 			int32_t wrappedIndex = paddedIndex < 0 ? ~paddedIndex : paddedIndex; // ~x = - x - 1 in two's complement (except maybe at the borders of representable range) 
 			wrappedIndex = paddedIndex < pushConstants.imageHalfRowLength ? wrappedIndex : pushConstants.imageRowLength + ~paddedIndex;
 			const complex_t<scalar_t> loOrHi = colMajorAccessor.get(colMajorOffset(wrappedIndex, y));
@@ -97,11 +97,11 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 
 				// --------------------------------------------------- MIRROR PADDING -------------------------------------------------------------------------------------------
 				#ifdef MIRROR_PADDING
-				preloaded[localElementIndex] = nbl::hlsl::select(_static_cast<bool>(oddThread ^ invert), hi, lo);
+				preloaded[localElementIndex] = nbl::hlsl::select(oddThread != inPadding, hi, lo);
 				// ----------------------------------------------------- ZERO PADDING -------------------------------------------------------------------------------------------
 				#else
 				const complex_t<scalar_t> Zero = { scalar_t(0), scalar_t(0) };
-				preloaded[localElementIndex] = nbl::hlsl::select(invert, Zero, nbl::hlsl::select(oddThread, hi, lo));
+				preloaded[localElementIndex] = nbl::hlsl::select(inPadding, Zero, nbl::hlsl::select(oddThread, hi, lo));
 				#endif
 				// ------------------------------------------------ END PADDING DIVERGENCE ----------------------------------------------------------------------------------------
 			}
@@ -114,7 +114,7 @@ struct PreloadedSecondAxisAccessor : PreloadedAccessorMirrorTradeBase
 				const complex_t<scalar_t> evenThreadLo = { loOrHi.real(), otherThreadLoOrHi.real() };
 				// Odd thread writes `hi = Z1 + iN1`
 				const complex_t<scalar_t> oddThreadHi = { otherThreadLoOrHi.imag(), loOrHi.imag() };
-				preloaded[localElementIndex] = nbl::hlsl::select(_static_cast<bool>(oddThread ^ invert), oddThreadHi, evenThreadLo);
+				preloaded[localElementIndex] = nbl::hlsl::select(oddThread != inPadding, oddThreadHi, evenThreadLo);
 			}
 			paddedIndex += WorkgroupSize / 2;
 		}

From 8a20833f36be08910e307cd59b9e2550b0cfe0f1 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Sun, 14 Dec 2025 12:14:47 +0100
Subject: [PATCH 52/57] refactor slightly

---
 .../app_resources/binarySearch.comp.hlsl      | 11 ++++----
 .../app_resources/common.h                    | 14 ++++------
 .../app_resources/present.frag.hlsl           | 19 --------------
 72_CooperativeBinarySearch/main.cpp           | 26 ++++++++++---------
 4 files changed, 25 insertions(+), 45 deletions(-)
 delete mode 100644 72_CooperativeBinarySearch/app_resources/present.frag.hlsl

diff --git a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
index 05c0d8464..0834e8f91 100644
--- a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
+++ b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl
@@ -1,18 +1,18 @@
-// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
 #pragma wave shader_stage(compute)
 
 #include "common.h"
+
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl"
+
 using namespace nbl::hlsl;
 
 [[vk::push_constant]] PushConstants Constants;
 [[vk::binding(0)]] StructuredBuffer<uint> Histogram;
 [[vk::binding(1)]] RWStructuredBuffer<uint> Output;
 
-static const uint32_t GroupsharedSize = 256;
 
 uint getNextPowerOfTwo(uint number) {
 	return 2 << firstbithigh(number - 1);
@@ -61,9 +61,10 @@ uint binarySearchLowerBoundFindValue(uint findValue, StructuredBuffer<uint> sear
 	return left + firstLaneGreaterThan - 1;
 }
 
+static const uint32_t GroupsharedSize = WorkgroupSize;
 groupshared uint shared_groupSearchBufferMinIndex;
 groupshared uint shared_groupSearchBufferMaxIndex;
-groupshared uint shared_groupSearchValues[GroupsharedSize];
+groupshared uint shared_groupSearchValues[WorkgroupSize];
 
 // Binary search using the entire workgroup, making it log32 or log64 (every iteration, the possible set of 
 // values is divided by the number of lanes in a wave)
@@ -112,7 +113,7 @@ uint binarySearchLowerBoundCooperative(uint groupIndex, uint groupThread, Struct
 	return laneValue;
 }
 
-[numthreads(256, 1, 1)]
+[numthreads(WorkgroupSize,1,1)]
 void main(const uint3 thread : SV_DispatchThreadID, const uint3 groupThread : SV_GroupThreadID, const uint3 group : SV_GroupID)
 {
     Output[thread.x] = binarySearchLowerBoundCooperative(group.x, groupThread.x, Histogram, Constants.EntityCount);
diff --git a/72_CooperativeBinarySearch/app_resources/common.h b/72_CooperativeBinarySearch/app_resources/common.h
index 4a3cacaa4..65f606b08 100644
--- a/72_CooperativeBinarySearch/app_resources/common.h
+++ b/72_CooperativeBinarySearch/app_resources/common.h
@@ -1,19 +1,15 @@
-#ifndef _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
-#define _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
+#ifndef _COOPERATIVE_BINARY_SEARCH_H_INCLUDED_
+#define _COOPERATIVE_BINARY_SEARCH_H_INCLUDED_
 
 #include <nbl/builtin/hlsl/cpp_compat/basic.h>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 
-using namespace nbl::hlsl;
-namespace nbl {
-namespace hlsl {
+// TODO: NBL_CONSTEXPR_NSPC_VAR
+static const uint32_t WorkgroupSize = 256;
 
 struct PushConstants
 {
 	uint32_t EntityCount;
 };
 
-};
-};
-
-#endif // _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_
+#endif // _COOPERATIVE_BINARY_SEARCH_H_INCLUDED_
diff --git a/72_CooperativeBinarySearch/app_resources/present.frag.hlsl b/72_CooperativeBinarySearch/app_resources/present.frag.hlsl
deleted file mode 100644
index 22695657c..000000000
--- a/72_CooperativeBinarySearch/app_resources/present.frag.hlsl
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#pragma wave shader_stage(fragment)
-
-// vertex shader is provided by the fullScreenTriangle extension
-#include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
-using namespace nbl::hlsl;
-using namespace ext::FullScreenTriangle;
-
-// binding 0 set 0
-[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture;
-[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState;
-
-[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0
-{
-    return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f);
-}
\ No newline at end of file
diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp
index 828adf34f..81724c1b8 100644
--- a/72_CooperativeBinarySearch/main.cpp
+++ b/72_CooperativeBinarySearch/main.cpp
@@ -20,11 +20,14 @@ using namespace nbl::ui;
 using namespace nbl::video;
 using namespace nbl::examples;
 
-//using namespace glm;
-
-static constexpr uint32_t TestCaseIndices[] = {
+//
+constexpr uint32_t TestCaseIndices[] = {
 #include "testCaseData.h"
 };
+constexpr uint32_t numIndices = sizeof(TestCaseIndices) / sizeof(TestCaseIndices[0]);
+constexpr uint32_t lastValue = TestCaseIndices[numIndices - 1];
+// just some extra stuff over the edge
+constexpr uint32_t totalValues = lastValue + 100;
 
 
 void cpu_tests();
@@ -85,7 +88,7 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
 		    SPushConstantRange pcRange = {};
 		    pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE;
 		    pcRange.offset = 0u;
-		    pcRange.size = sizeof(nbl::hlsl::PushConstants);
+		    pcRange.size = sizeof(PushConstants);
             auto layout = m_device->createPipelineLayout({ &pcRange,1 }, smart_refctd_ptr(m_descriptorSetLayout));
             IGPUComputePipeline::SCreationParams params = {};
             params.layout = layout.get();
@@ -94,11 +97,12 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
             if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
                 return logFail("Failed to create compute pipeline!\n");
         }
-
+        
+        const size_t sizes[2] = {sizeof(TestCaseIndices),sizeof(uint32_t)*totalValues};
         for (uint32_t i = 0; i < bindingCount; i++)
         {
             m_buffers[i] = m_device->createBuffer(IGPUBuffer::SCreationParams {
-                {.size = 500000, .usage = 
+                {.size = sizes[i], .usage =
                     IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | 
                     IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT,
                 }
@@ -146,7 +150,8 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
         memcpy(
             reinterpret_cast<void*>(outPtr), 
             reinterpret_cast<const void*>(&TestCaseIndices[0]), 
-            sizeof(TestCaseIndices));
+            sizeof(TestCaseIndices)
+        );
 
         // In contrast to fences, we just need one semaphore to rule all dispatches
         return true;
@@ -187,16 +192,13 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp
         
 
         const IGPUDescriptorSet* set = m_descriptorSet.get();
-        const uint32_t numIndices = sizeof(TestCaseIndices) / sizeof(TestCaseIndices[0]);
-        const uint32_t lastValue = TestCaseIndices[numIndices - 1];
-        const uint32_t totalValues = lastValue + 100;
-        nbl::hlsl::PushConstants coopBinarySearchPC = {
+        PushConstants coopBinarySearchPC = {
             .EntityCount = numIndices,
         };
 
         m_cmdbuf->bindComputePipeline(m_pipeline.get());
         m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set);
-        m_cmdbuf->pushConstants(m_pipeline->getLayout(), nbl::hlsl::ShaderStage::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::PushConstants), &coopBinarySearchPC);
+        m_cmdbuf->pushConstants(m_pipeline->getLayout(), nbl::hlsl::ShaderStage::ESS_COMPUTE, 0u, sizeof(PushConstants), &coopBinarySearchPC);
         m_cmdbuf->dispatch((totalValues + 255u) / 256u, 1u, 1u);
 
 		layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT);

From 4425ec1454acd2e7771f290d7b5f08fd9dbcb07b Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 15 Dec 2025 20:32:18 +0100
Subject: [PATCH 53/57] ambiguity of `is_same_v` patched up

---
 22_CppCompat/app_resources/test.comp.hlsl | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/22_CppCompat/app_resources/test.comp.hlsl b/22_CppCompat/app_resources/test.comp.hlsl
index 98be76c53..17c59f970 100644
--- a/22_CppCompat/app_resources/test.comp.hlsl
+++ b/22_CppCompat/app_resources/test.comp.hlsl
@@ -3,9 +3,6 @@
 //// For conditions of distribution and use, see copyright notice in nabla.h
 #include "app_resources/common.hlsl"
 
-template<typename L, typename R>
-const static bool is_same_v = nbl::hlsl::is_same_v<L,R>;
-
 
 struct PushConstants
 {
@@ -88,6 +85,7 @@ struct device_capabilities2
 };
 
 [numthreads(8, 8, 1)]
+[shader("compute")]
 void main(uint3 invocationID : SV_DispatchThreadID)
 {
     fill(invocationID, 1);
@@ -157,9 +155,9 @@ void main(uint3 invocationID : SV_DispatchThreadID)
     {
         static const uint16_t TEST_VALUE_0 = 5;
         static const uint32_t TEST_VALUE_1 = 0x80000000u;
-        static const uint32_t TEST_VALUE_2 = 0x8000000000000000u;
+        static const uint32_t TEST_VALUE_2 = 0x8000000000000000u; // TODO: Przmek is this intended? it warns because its too big from uint32_t
         static const uint32_t TEST_VALUE_3 = 0x00000001u;
-        static const uint32_t TEST_VALUE_4 = 0x0000000000000001u;
+        static const uint32_t TEST_VALUE_4 = 0x0000000000000001u; // TODO: Przmek is this intended? it warns because its too big from uint32_t
         
 
         fill(invocationID, 5.01);

From 1c6458d81b83aea176ac7ebda7450a9b395a85bd Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 17 Dec 2025 22:23:10 +0300
Subject: [PATCH 54/57] A lot more debuggability, and: -  Camera movement is
 disabled correctly - Hacked ViewManipulate to use for the cube itself - Added
 a storage buffer for debugging and getting stuff from GPU to CPU - Most
 importantly, disabled skew, used TRS for that - Random OBB buttons -
 Detection of mismatch of silhouette vertices (between slow more correct algo
 vs fast LUT based algo)

---
 .../app_resources/hlsl/Drawing.hlsl           | 172 +++++
 .../hlsl/SolidAngleVis.frag.hlsl              | 644 +++++++++---------
 .../app_resources/hlsl/common.hlsl            |  49 +-
 .../app_resources/hlsl/utils.hlsl             |  23 +
 72_SolidAngleVisualizer/include/transform.hpp |  73 +-
 72_SolidAngleVisualizer/main.cpp              | 375 ++++++++--
 .../include/nbl/examples/cameras/CCamera.hpp  |   5 +
 7 files changed, 939 insertions(+), 402 deletions(-)
 create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
 create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
new file mode 100644
index 000000000..c3cb5befa
--- /dev/null
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
@@ -0,0 +1,172 @@
+#ifndef _DEBUG_HLSL_
+#define _DEBUG_HLSL_
+#include "common.hlsl"
+
+float2 sphereToCircle(float3 spherePoint)
+{
+    if (spherePoint.z >= 0.0f)
+    {
+        return spherePoint.xy * CIRCLE_RADIUS;
+    }
+    else
+    {
+        float r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z);
+        float uv2Plus1 = r2 + 1.0f;
+        return (spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS;
+    }
+}
+
+float4 drawGreatCircleArc(float3 fragPos, float3 points[2], int visibility, float aaWidth)
+{
+    if (visibility == 0) return float4(0,0,0,0);
+    
+    float3 v0 = normalize(points[0]);
+    float3 v1 = normalize(points[1]);
+    float3 p = normalize(fragPos);
+    
+    float3 arcNormal = normalize(cross(v0, v1));
+    float dist = abs(dot(p, arcNormal));
+    
+    float dotMid = dot(v0, v1);
+    bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid);
+    
+    if (!onArc) return float4(0,0,0,0);
+    
+    float avgDepth = (length(points[0]) + length(points[1])) * 0.5f;
+    float depthScale = 3.0f / avgDepth;
+    
+    float baseWidth = (visibility == 1) ? 0.01f : 0.005f;
+    float width = min(baseWidth * depthScale, 0.02f);
+    
+    float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
+    
+    float4 edgeColor = (visibility == 1) ? 
+        float4(0.0f, 0.5f, 1.0f, 1.0f) :
+        float4(1.0f, 0.0f, 0.0f, 1.0f);
+    
+    float intensity = (visibility == 1) ? 1.0f : 0.5f;
+    return edgeColor * alpha * intensity;
+}
+
+float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth)
+{
+    float4 color = float4(0,0,0,0);
+    float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1);
+    
+    for (int i = 0; i < 12; i++)
+    {
+        if ((silEdgeMask & (1u << i)) == 0)
+        {
+            int2 edge = allEdges[i];
+            float3 edgePoints[2] = { corners[edge.x], corners[edge.y] };
+            float4 edgeContribution = drawGreatCircleArc(spherePos, edgePoints, 1, aaWidth);
+            color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a);
+        }
+    }
+    return color;
+}
+
+float4 drawCorners(float3 spherePos, float2 p, float aaWidth)
+{
+    float4 color = float4(0,0,0,0);
+    for (int i = 0; i < 8; i++)
+    {
+        float3 corner3D = normalize(corners[i]);
+        float2 cornerPos = sphereToCircle(corner3D);
+        float dist = length(p - cornerPos);
+        float dotSize = 0.02f;
+        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
+        if (dotAlpha > 0.0f)
+        {
+            float3 dotColor = colorLUT[i];
+            color += float4(dotColor * dotAlpha, dotAlpha);
+        }
+    }
+    return color;
+}
+
+float4 drawRing(float2 p, float aaWidth)
+{
+    float positionLength = length(p);
+    float ringWidth = 0.002f;
+    float ringDistance = abs(positionLength - CIRCLE_RADIUS);
+    float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
+    return ringAlpha * float4(1, 1, 1, 1);
+}
+
+// Check if a face on the hemisphere is visible from camera at origin
+bool isFaceVisible(float3 faceCenter, float3 faceNormal)
+{
+    float3 viewVec = normalize(-faceCenter); // Vector from camera to face
+    return dot(faceNormal, viewVec) > 0.0f;
+}
+
+int getEdgeVisibility(int edgeIdx)
+{
+    int2 faces = edgeToFaces[edgeIdx];
+
+    // Transform normals to world space
+    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
+    float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
+    float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
+
+    bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1);
+    bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2);
+
+    // Silhouette: exactly one face visible
+    if (visible1 != visible2) return 1;
+
+    // Inner edge: both faces visible
+    if (visible1 && visible2) return 2;
+
+    // Hidden edge: both faces hidden
+    return 0;
+}
+
+#if DEBUG_DATA
+uint32_t computeGroundTruthEdgeMask()
+{
+    uint32_t mask = 0u;
+    NBL_UNROLL
+    for (int j = 0; j < 12; j++)
+    {
+        // getEdgeVisibility returns 1 for a silhouette edge based on 3D geometry
+        if (getEdgeVisibility(j) == 1)
+        {
+            mask |= (1u << j);
+        }
+    }
+    return mask;
+}
+
+void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSilMask)
+{
+    uint32_t mismatchAccumulator = 0;
+
+    // The Ground Truth now represents the full 3D silhouette, clipped or not.
+    uint32_t groundTruthMask = computeGroundTruthEdgeMask();
+
+    // The comparison checks if the generated mask perfectly matches the full 3D ground truth.
+    uint32_t mismatchMask = groundTruthMask ^ generatedSilMask;
+
+    if (mismatchMask != 0)
+    {
+        NBL_UNROLL
+        for (int j = 0; j < 12; j++)
+        {
+            if ((mismatchMask >> j) & 1u)
+            {
+                int2 edge = allEdges[j];
+                // Accumulate vertex indices where error occurred
+                mismatchAccumulator |= (1u << edge.x) | (1u << edge.y);
+            }
+        }
+    }
+    
+    // Simple Write (assuming all fragments calculate the same result)
+    InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator);
+}
+#endif
+
+
+#endif // _DEBUG_HLSL_
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index 51cb1946d..cd291dbd2 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -1,376 +1,374 @@
 #pragma wave shader_stage(fragment)
 
 #include "common.hlsl"
-
 #include <nbl/builtin/hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl>
+#include "utils.hlsl"
 
 using namespace nbl::hlsl;
 using namespace ext::FullScreenTriangle;
 
 [[vk::push_constant]] struct PushConstants pc;
+[[vk::binding(0, 0)]] RWStructuredBuffer<ResultData> DebugDataBuffer;
 
-static const float CIRCLE_RADIUS = 0.75f;
+static const float CIRCLE_RADIUS = 0.5f;
 
 // --- Geometry Utils ---
 
-// Adjacency of edges to faces
-static const int2 edgeToFaces[12] = { 
-    {4,2}, {3,4}, {2,5}, {5,3}, 
-    {2,0}, {0,3}, {1,2}, {3,1}, 
-    {0,4}, {5,0}, {4,1}, {1,5} 
-};
-
-//float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f
 static const float3 constCorners[8] = {
-    float3(-1, -1, -1), // 0
-    float3( 1, -1, -1), // 1
-    float3(-1,  1, -1), // 2
-    float3( 1,  1, -1), // 3
-    float3(-1, -1,  1), // 4
-    float3( 1, -1,  1), // 5
-    float3(-1,  1,  1), // 6
-    float3( 1,  1,  1)  // 7
+	float3(-1, -1, -1), float3(1, -1, -1), float3(-1,  1, -1), float3(1,  1, -1),
+	float3(-1, -1,  1), float3(1, -1,  1), float3(-1,  1,  1), float3(1,  1,  1)
 };
 
-// All 12 edges of the cube (vertex index pairs)
 static const int2 allEdges[12] = {
-    {0, 1}, {2, 3}, {4, 5}, {6, 7}, // Edges along X axis
-    {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Edges along Y axis
-    {0, 4}, {1, 5}, {2, 6}, {3, 7}  // Edges along Z axis
+	{0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis
+	{0, 2}, {1, 3}, {4, 6}, {5, 7}, // Y axis
+	{0, 4}, {1, 5}, {2, 6}, {3, 7}  // Z axis
 };
 
-static const float3 localNormals[6] = {
-    float3(0, 0, -1), // Face 0 (Z-)
-    float3(0, 0, 1),  // Face 1 (Z+)
-    float3(-1, 0, 0), // Face 2 (X-)
-    float3(1, 0, 0),  // Face 3 (X+)
-    float3(0, -1, 0), // Face 4 (Y-)
-    float3(0, 1, 0)   // Face 5 (Y+)
+// Adjacency of edges to faces
+// Corrected Adjacency of edges to faces
+static const int2 edgeToFaces[12] = {
+	// Edge Index:  | allEdges[i]  | Shared Faces: 
+
+	/* 0 (0-1) */   {4, 0},         // Y- (4) and Z- (0)
+	/* 1 (2-3) */   {5, 0},         // Y+ (5) and Z- (0)
+	/* 2 (4-5) */   {4, 1},         // Y- (4) and Z+ (1)
+	/* 3 (6-7) */   {5, 1},         // Y+ (5) and Z+ (1)
+
+	/* 4 (0-2) */   {2, 0},         // X- (2) and Z- (0)
+	/* 5 (1-3) */   {3, 0},         // X+ (3) and Z- (0)
+	/* 6 (4-6) */   {2, 1},         // X- (2) and Z+ (1)
+	/* 7 (5-7) */   {3, 1},         // X+ (3) and Z+ (1)
+
+	/* 8 (0-4) */   {2, 4},         // X- (2) and Y- (4)
+	/* 9 (1-5) */   {3, 4},         // X+ (3) and Y- (4)
+	/* 10 (2-6) */  {2, 5},         // X- (2) and Y+ (5)
+	/* 11 (3-7) */  {3, 5}          // X+ (3) and Y+ (5)
 };
-
 static float3 corners[8];
-static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), 
-                            float3(0,0,0), float3(0,0,0), float3(0,0,0) };
-
-
-static const float3 colorLUT[27] = {
-    // Row 1: Pure and bright colors
-    float3(0, 0, 0),        // 0: Black
-    float3(1, 1, 1),        // 1: White
-    float3(0.5, 0.5, 0.5),  // 2: Gray
-    
-    // Row 2: Primary colors
-    float3(1, 0, 0),        // 3: Red
-    float3(0, 1, 0),        // 4: Green
-    float3(0, 0, 1),        // 5: Blue
-    
-    // Row 3: Secondary colors
-    float3(1, 1, 0),        // 6: Yellow
-    float3(1, 0, 1),        // 7: Magenta
-    float3(0, 1, 1),        // 8: Cyan
-    
-    // Row 4: Orange family
-    float3(1, 0.5, 0),      // 9: Orange
-    float3(1, 0.65, 0),     // 10: Light Orange
-    float3(0.8, 0.4, 0),    // 11: Dark Orange
-    
-    // Row 5: Pink/Rose family
-    float3(1, 0.4, 0.7),    // 12: Pink
-    float3(1, 0.75, 0.8),   // 13: Light Pink
-    float3(0.7, 0.1, 0.3),  // 14: Deep Rose
-    
-    // Row 6: Purple/Violet family
-    float3(0.5, 0, 0.5),    // 15: Purple
-    float3(0.6, 0.4, 0.8),  // 16: Light Purple
-    float3(0.3, 0, 0.5),    // 17: Indigo
-    
-    // Row 7: Green variations
-    float3(0, 0.5, 0),      // 18: Dark Green
-    float3(0.5, 1, 0),      // 19: Lime
-    float3(0, 0.5, 0.25),   // 20: Forest Green
-    
-    // Row 8: Blue variations
-    float3(0, 0, 0.5),      // 21: Navy
-    float3(0.3, 0.7, 1),    // 22: Sky Blue
-    float3(0, 0.4, 0.6),    // 23: Teal
-    
-    // Row 9: Earth tones
-    float3(0.6, 0.4, 0.2),  // 24: Brown
-    float3(0.8, 0.7, 0.3),  // 25: Tan/Beige
-    float3(0.4, 0.3, 0.1)   // 26: Dark Brown
+static float3 faceCenters[6] = {
+	float3(0,0,0), float3(0,0,0), float3(0,0,0),
+	float3(0,0,0), float3(0,0,0), float3(0,0,0)
+};
+
+static const float3 localNormals[6] = {
+	float3(0, 0, -1), // Face 0 (Z-)
+	float3(0, 0, 1),  // Face 1 (Z+)
+	float3(-1, 0, 0), // Face 2 (X-)
+	float3(1, 0, 0),  // Face 3 (X+)
+	float3(0, -1, 0), // Face 4 (Y-)
+	float3(0, 1, 0)   // Face 5 (Y+)
 };
 
 
-    
+// TODO: unused, remove later
 // Vertices are ordered CCW relative to the camera view.
 static const int silhouettes[27][7] = {
-    {6, 1, 3, 2, 6, 4, 5}, // 0: Black
-    {6, 2, 6, 4, 5, 7, 3}, // 1: White 
-    {6, 0, 4, 5, 7, 3, 2}, // 2: Gray 
-    {6, 1, 3, 7, 6, 4, 5,}, // 3: Red 
-    {4, 4, 5, 7, 6, -1, -1}, // 4: Green 
-    {6, 0, 4, 5, 7, 6, 2}, // 5: Blue 
-    {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow 
-    {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta              
-    {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan 
-    {6, 1, 3, 2, 6, 7, 5}, // 9: Orange
-    {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange
-    {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange
-    {4, 1, 3, 7, 5, -1, -1}, // 12: Pink
-    {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink
-    {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose
-    {6, 0, 1, 3, 7, 5, 4}, // 15: Purple
-    {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple
-    {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo
-    {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green
-    {6, 0, 2, 6, 7, 3, 1}, // 19: Lime
-    {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green
-    {6, 0, 2, 3, 7, 5, 1}, // 21: Navy
-    {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue
-    {6, 0, 4, 6, 2, 3, 1}, // 23: Teal
-    {6, 0, 2, 3, 7, 5, 4},  // 24: Brown
-    {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige
-    {6, 1, 5, 4, 6, 2, 3}  // 26: Dark Brown
+	{6, 1, 3, 2, 6, 4, 5},      // 0: Black
+	{6, 2, 6, 4, 5, 7, 3},      // 1: White 
+	{6, 0, 4, 5, 7, 3, 2},      // 2: Gray 
+	{6, 1, 3, 7, 6, 4, 5,},     // 3: Red 
+	{4, 4, 5, 7, 6, -1, -1},    // 4: Green 
+	{6, 0, 4, 5, 7, 6, 2},      // 5: Blue 
+	{6, 0, 1, 3, 7, 6, 4},      // 6: Yellow 
+	{6, 0, 1, 5, 7, 6, 4},      // 7: Magenta              
+	{6, 0, 1, 5, 7, 6, 2},      // 8: Cyan 
+	{6, 1, 3, 2, 6, 7, 5},      // 9: Orange
+	{4, 2, 6, 7, 3, -1, -1},    // 10: Light Orange
+	{6, 0, 4, 6, 7, 3, 2},      // 11: Dark Orange
+	{4, 1, 3, 7, 5, -1, -1},    // 12: Pink
+	{6, 0, 4, 6, 7, 3, 2},      // 13: Light Pink
+	{4, 0, 4, 6, 2, -1, -1},    // 14: Deep Rose
+	{6, 0, 1, 3, 7, 5, 4},      // 15: Purple
+	{4, 0, 1, 5, 4, -1, -1},    // 16: Light Purple
+	{6, 0, 1, 5, 4, 6, 2},      // 17: Indigo
+	{6, 0, 2, 6, 7, 5, 1},      // 18: Dark Green
+	{6, 0, 2, 6, 7, 3, 1},      // 19: Lime
+	{6, 0, 4, 6, 7, 3, 1},      // 20: Forest Green
+	{6, 0, 2, 3, 7, 5, 1},      // 21: Navy
+	{4, 0, 2, 3, 1, -1, -1},    // 22: Sky Blue
+	{6, 0, 4, 6, 2, 3, 1},      // 23: Teal
+	{6, 0, 2, 3, 7, 5, 4},      // 24: Brown
+	{6, 0, 2, 3, 1, 5, 4},      // 25: Tan/Beige
+	{6, 1, 5, 4, 6, 2, 3}       // 26: Dark Brown
 };
 
-// Converts UV into centered, aspect-corrected NDC circle space
-float2 toCircleSpace(float2 uv)
-{
-    // Map [0,1] UV to [-1,1]
-    float2 p = uv * 2.0f - 1.0f;
-
-    // Correct aspect ratio
-    float aspect = pc.viewport.z / pc.viewport.w; // width / height
-    p.x *= aspect;
-
-    return p * CIRCLE_RADIUS;
-}
+// Binary packed silhouettes
+static const uint32_t binSilhouettes[27] = {
+	0b11000000000000101100110010011001,
+	0b11000000000000011111101100110010,
+	0b11000000000000010011111101100000,
+	0b11000000000000101100110111011001,
+	0b10000000000000000000110111101100,
+	0b11000000000000010110111101100000,
+	0b11000000000000100110111011001000,
+	0b11000000000000100110111101001000,
+	0b11000000000000010110111101001000,
+	0b11000000000000101111110010011001,
+	0b10000000000000000000011111110010,
+	0b11000000000000010011111110100000,
+	0b10000000000000000000101111011001,
+	0b11000000000000010011111110100000,
+	0b10000000000000000000010110100000,
+	0b11000000000000100101111011001000,
+	0b10000000000000000000100101001000,
+	0b11000000000000010110100101001000,
+	0b11000000000000001101111110010000,
+	0b11000000000000001011111110010000,
+	0b11000000000000001011111110100000,
+	0b11000000000000001101111011010000,
+	0b10000000000000000000001011010000,
+	0b11000000000000001011010110100000,
+	0b11000000000000100101111011010000,
+	0b11000000000000100101001011010000,
+	0b11000000000000011010110100101001,
+};
 
-void computeCubeGeo()
+int getSilhouetteVertex(uint32_t packedSil, int index)
 {
-    for (int i = 0; i < 8; i++)
-    {
-        float3 localPos = constCorners[i]; //float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f;
-        float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
-        
-        corners[i] = worldPos.xyz;
-        
-        faceCenters[i/4]      += worldPos / 4.0f; 
-        faceCenters[2+i%2]    += worldPos / 4.0f; 
-        faceCenters[4+(i/2)%2] += worldPos / 4.0f; 
-    }
+	return (packedSil >> (3 * index)) & 0x7;
 }
 
-float4 drawCorners(float3 spherePos, float aaWidth)
+// Get silhouette size
+int getSilhouetteSize(uint32_t sil)
 {
-    float4 color = float4(0,0,0,0);
-    // Draw corner labels for debugging
-    for (int i = 0; i < 8; i++)
-    {
-        float3 corner = normalize(corners[i]);
-        float2 cornerPos = corner.xy;
-        // Project corner onto 2D circle space
-        
-        // Distance from current fragment to corner
-        float dist = length(spherePos.xy - cornerPos);
-        
-        // Draw a small colored dot at the corner
-        float dotSize = 0.03f;
-        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
-        
-        if (dotAlpha > 0.0f)
-        {
-            float brightness = float(i) / 7.0f;
-            float3 dotColor = colorLUT[i];
-            color += float4(dotColor * dotAlpha, dotAlpha);
-        }
-    }
-    return color;
+	return (sil >> 29) & 0x7;
+
 }
 
-float4 drawRing(float2 p, float aaWidth)
+// Check if vertex has negative z
+bool getVertexZNeg(int vertexIdx)
 {
-    float positionLength = length(p);
-    
-    // Add a white background circle ring
-    float ringWidth = 0.01f;
-    float ringDistance = abs(positionLength - CIRCLE_RADIUS);
-    float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
-    
-    return ringAlpha * float4(1, 1, 1, 1); 
+	return normalize(corners[vertexIdx]).z < 0.0f;
 }
 
-// Check if a face on the hemisphere is visible from camera at origin
-bool isFaceVisible(float3 faceCenter, float3 faceNormal)
+#include "Drawing.hlsl"
+
+
+void setDebugData(uint32_t sil, int3 region, int configIndex, uint32_t clippedVertexCount)
 {
-    // Face is visible if normal points toward camera (at origin)
-    float3 viewVec = -normalize(faceCenter); // Vector from face to camera
-    return dot(faceNormal, viewVec) > 0.0f;
+#if DEBUG_DATA
+	DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil));
+	DebugDataBuffer[0].region = uint3(region);
+	DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex);
+	DebugDataBuffer[0].clippedVertexCount = clippedVertexCount;
+	for (int i = 0; i < 6; i++)
+	{
+		DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i));
+	}
+	DebugDataBuffer[0].silhouette = sil;
+#endif
 }
 
-int getEdgeVisibility(int edgeIdx, float3 cameraPos)
+float2 toCircleSpace(float2 uv)
 {
-    int2 faces = edgeToFaces[edgeIdx];
-    
-    // Transform normals to world space
-    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
-    float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]);
-    float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]);
-    
-    bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1);
-    bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2);
-    
-    // Silhouette: exactly one face visible
-    if (visible1 != visible2) return 1;
-    
-    // Inner edge: both faces visible
-    if (visible1 && visible2) return 2;
-    
-    // Hidden edge: both faces hidden
-    return 0;
+	float2 p = uv * 2.0f - 1.0f;
+	float aspect = pc.viewport.z / pc.viewport.w;
+	p.x *= aspect;
+	return p;
 }
 
-// Draw great circle arc in fragment shader with horizon clipping
-float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth)
+uint32_t packSilhouette(const int s[7]) 
 {
-    if (visibility == 0) return float4(0,0,0,0); // Hidden edge
-    
-    float3 v0 = normalize(corners[edgeVerts.x]);
-    float3 v1 = normalize(corners[edgeVerts.y]);
-    float3 p = normalize(fragPos); // Current point on hemisphere
-    
-    // HORIZON CLIPPING: Current fragment must be on front hemisphere
-    if (p.z < 0.0f) 
-        return float4(0,0,0,0);
-    
-    // HORIZON CLIPPING: Skip edge if both endpoints are behind horizon
-    if (v0.z < 0.0f && v1.z < 0.0f) 
-        return float4(0,0,0,0);
-    
-    // Great circle plane normal
-    float3 arcNormal = normalize(cross(v0, v1));
-    
-    // Distance to great circle
-    float dist = abs(dot(p, arcNormal));
-    
-    // Check if point is within arc bounds
-    float dotMid = dot(v0, v1);
-    bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid);
-    
-    if (!onArc) return float4(0,0,0,0);
-    
-    // Depth-based width scaling
-    float avgDepth = (length(corners[edgeVerts.x]) + length(corners[edgeVerts.y])) * 0.5f;
-    float depthScale = 3.0f / avgDepth;
-    
-    float baseWidth = (visibility == 1) ? 0.01f : 0.005f;
-    float width = min(baseWidth * depthScale, 0.02f);
-    
-    float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
-    
-    float4 edgeColor = (visibility == 1) ? 
-        float4(0.0f, 0.5f, 1.0f, 1.0f) :  // Silhouette: blue
-        float4(1.0f, 0.0f, 0.0f, 1.0f);   // Inner: red
-    
-    float intensity = (visibility == 1) ? 1.0f : 0.5f;
-    return edgeColor * alpha * intensity;
+	uint32_t packed = 0;
+	int size = s[0] & 0x7; // 3 bits for size
+
+	// Pack vertices LSB-first (vertex1 in lowest 3 bits above size)
+	for (int i = 1; i <= 6; ++i) {
+		int v = s[i];
+		if (v < 0) v = 0; // replace unused vertices with 0
+		packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1)
+	}
+
+	// Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices)
+	packed |= (size & 0x7) << 29;
+
+	return packed;
 }
 
-float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth)
+void computeCubeGeo()
 {
-    float4 color = float4(0,0,0,0);
-    // Draw the remaining edges (non-silhouette) in a different color
-    float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); // dark yellow color for hidden edges
-    
-    for (int i = 0; i < 12; i++)
-    {
-        int2 edge = allEdges[i];
-        
-        // Check if this edge is already drawn as a silhouette edge
-        bool isSilhouette = false;
-        int vertexCount = silhouettes[configIndex][0];
-        // Draw the 6 silhouette edges
-        for (int i = 0; i < vertexCount; i++) 
-        {
-            int v0Idx = silhouettes[configIndex][i + 1];
-            int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1];
-            
-            if ((edge.x == v0Idx && edge.y == v1Idx) || (edge.x == v1Idx && edge.y == v0Idx))
-            {
-                isSilhouette = true;
-                break;
-            }
-        }
-        
-        // Only draw if it's not a silhouette edge
-        if (!isSilhouette)
-        {
-            float4 edgeContribution = drawGreatCircleArc(spherePos, edge, 1, aaWidth);
-            color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a);
-        }
-    }
-    return color;
+	for (int i = 0; i < 8; i++)
+	for (int i = 0; i < 8; i++)
+	{
+		float3 localPos = constCorners[i];
+		float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
+		corners[i] = worldPos.xyz;
+		faceCenters[i / 4] += worldPos / 4.0f;
+		faceCenters[2 + i % 2] += worldPos / 4.0f;
+		faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f;
+	}
 }
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
 {
-    float4 color = float4(0, 0, 0, 0);
-    float2 p = toCircleSpace(vx.uv);
-    
-    // Convert 2D disk position to 3D hemisphere position
-    float2 normalized = p / CIRCLE_RADIUS;
-    float r2 = dot(normalized, normalized);
-    float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); 
-
-
-    
-    // Convert UV to 3D position on hemisphere
-    float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2)));
-    
-    computeCubeGeo();
-    
-    // Get OBB center in world space
-    float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
-
-    float3x3 rotMatrix = (float3x3)pc.modelMatrix;
-    float3 proj = mul(obbCenter, rotMatrix); // Get all 3 projections at once
-
-    // Get squared column lengths
-    float lenSqX = dot(rotMatrix[0], rotMatrix[0]);
-    float lenSqY = dot(rotMatrix[1], rotMatrix[1]);
-    float lenSqZ = dot(rotMatrix[2], rotMatrix[2]);
-
-    int3 region = int3(
-        proj.x < -lenSqX ? 0 : (proj.x > lenSqX ? 2 : 1),
-        proj.y < -lenSqY ? 0 : (proj.y > lenSqY ? 2 : 1),
-        proj.z < -lenSqZ ? 0 : (proj.z > lenSqZ ? 2 : 1)
-    );
-
-    int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26
-
-    int vertexCount = silhouettes[configIndex][0];
-    for (int i = 0; i < vertexCount; i++) 
-    {
-        int v0Idx = silhouettes[configIndex][i + 1];
-        int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1];
-        
-        float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth);
-        color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
-    }
-    
-    color += drawHiddenEdges(spherePos, configIndex, aaWidth);
-
-    color += drawCorners(spherePos, aaWidth);
-    
-    color += drawRing(p, aaWidth);
-
-    if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f)))
-    {
-        return float4(colorLUT[configIndex], 1.0f);
-    }
-
-    // if (r2 > 1.1f)
-    //     color.a = 0.0f; // Outside circle, make transparent
-    
-    return color;
+	float4 color = float4(0, 0, 0, 0);
+	float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y)));
+	float2 p = toCircleSpace(vx.uv);
+
+	float2 normalized = p / CIRCLE_RADIUS;
+	float r2 = dot(normalized, normalized);
+
+	float3 spherePos;
+	if (r2 <= 1.0f)
+	{
+		spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2));
+	}
+	else
+	{
+		float uv2Plus1 = r2 + 1.0f;
+		spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1;
+	}
+	spherePos = normalize(spherePos);
+
+	computeCubeGeo();
+
+	float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
+
+	float3x3 upper3x3 = (float3x3)pc.modelMatrix;
+
+#if 1
+	// Compute reciprocal scales
+	float3 rcpScales = rsqrt(float3(
+		dot(upper3x3[0], upper3x3[0]),
+		dot(upper3x3[1], upper3x3[1]),
+		dot(upper3x3[2], upper3x3[2])
+	));
+
+	// Build inverse-rotation-only matrix
+	float3x3 invRot;
+	invRot[0] = upper3x3[0] * rcpScales.x;
+	invRot[1] = upper3x3[1] * rcpScales.y;
+	invRot[2] = upper3x3[2] * rcpScales.z;
+
+	// Project center into OBB local space
+	float3 normalizedProj = mul(invRot, obbCenter);
+#else
+	float3 normalizedProj = mul(inverse(upper3x3), obbCenter);
+#endif
+	int3 region = int3(
+		normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1),
+		normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1),
+		normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1)
+	);
+	int configIndex = region.x + region.y * 3 + region.z * 9;
+
+	// uint32_t sil = packSilhouette(silhouettes[configIndex]);
+	uint32_t sil = binSilhouettes[configIndex];
+
+	int vertexCount = getSilhouetteSize(sil);
+	bool longSilhouette = (vertexCount == 6);
+	uint32_t silEdgeMask = 0;
+
+#if DEBUG_DATA
+	{
+		for (int i = 0; i < vertexCount; i++)
+		{
+			int vIdx = i % vertexCount;
+			int v1Idx = (i + 1) % vertexCount;
+
+			int v0Corner = getSilhouetteVertex(sil, vIdx);
+			int v1Corner = getSilhouetteVertex(sil, v1Idx);
+			// Mark edge as part of silhouette 
+			for (int e = 0; e < 12; e++)
+			{
+				int2 edge = allEdges[e];
+				if ((edge.x == v0Corner && edge.y == v1Corner) ||
+				(edge.x == v1Corner && edge.y == v0Corner))
+				{
+					silEdgeMask |= (1u << e);
+				}
+			}
+		}
+		validateEdgeVisibility(sil, vertexCount, silEdgeMask);
+	}
+#endif
+	// Build clip mask for vertices below horizon (z < 0)
+	uint32_t clipMask = 0u;
+	NBL_UNROLL
+	for (int i = 0; i < 6; i++)
+	{
+		if (i >= vertexCount) break;
+		clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i;
+	}
+
+	int clipCount = countbits(clipMask);
+
+	// Total clipped vertices
+	int clippedVertCount = vertexCount + (clipMask != 0u ? (2 - clipCount) : 0);
+
+	// Find rotation amount to place positive vertices first
+	int rotateAmount = 0;
+	if (clipMask != 0u)
+	{
+		uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u);
+		bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask >> (vertexCount - 1)) & 1u);
+
+		rotateAmount = wrapAround ?
+			((firstbithigh(invertedMask) + 1) % vertexCount) :
+			firstbitlow(clipMask);
+	}
+
+	// Rotate silhouette bits
+	uint32_t vertexBits = sil & 0x1FFFFFFF;
+	uint32_t rotatedVertexBits = rotr(vertexBits, rotateAmount * 3, vertexCount * 3);
+	uint32_t rotatedSil = (sil & 0xE0000000) | rotatedVertexBits;
+
+	// Rotate the clip mask to match
+	uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount);
+
+	// Draw clipped silhouette edges
+	for (int i = 0; i < clippedVertCount; i++)
+	{
+		int nextI = (i + 1) % clippedVertCount;
+
+		int vIdx = i % vertexCount;
+		int v1Idx = nextI % vertexCount;
+
+		// Extract clip bits directly
+		bool v0Clipped = (rotatedClipMask >> vIdx) & 1u;
+		bool v1Clipped = (rotatedClipMask >> v1Idx) & 1u;
+
+		// Skip if both clipped
+		if (v0Clipped && v1Clipped) continue;
+
+		int v0Corner = getSilhouetteVertex(rotatedSil, vIdx);
+		int v1Corner = getSilhouetteVertex(rotatedSil, v1Idx);
+
+		float3 v0 = normalize(corners[v0Corner]);
+		float3 v1 = normalize(corners[v1Corner]);
+
+		float3 points[2] = { corners[v0Corner], corners[v1Corner] };
+
+		// Clip using bit state
+		if (v0Clipped)
+		{
+			float t = v0.z / (v0.z - v1.z);
+			points[0] = normalize(lerp(corners[v0Corner], corners[v1Corner], t));
+		}
+		else if (v1Clipped)
+		{
+			float t = v0.z / (v0.z - v1.z);
+			points[1] = normalize(lerp(corners[v0Corner], corners[v1Corner], t));
+		}
+
+		// Draw edge
+		float4 edgeContribution = drawGreatCircleArc(spherePos, points, 1, aaWidth);
+		color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
+
+	}
+
+
+	setDebugData(sil, region, configIndex, clippedVertCount);
+
+	color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth);
+	color += drawCorners(spherePos, p, aaWidth);
+	color += drawRing(p, aaWidth);
+
+	if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f)))
+	{
+		return float4(colorLUT[configIndex], 1.0f);
+	}
+
+	return color;
 }
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
index 80368d08f..3c87a48bc 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
@@ -2,13 +2,52 @@
 #define _SOLID_ANGLE_VIS_COMMON_HLSL_
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
+#define DEBUG_DATA 1
 
-
-struct PushConstants
+namespace nbl
 {
-	nbl::hlsl::float32_t3x4 modelMatrix;
-	nbl::hlsl::float32_t4 viewport;
-};
+    namespace hlsl
+    {
+
+        struct ResultData
+        {
+            uint32_t3 region;
+            uint32_t silhouetteIndex;
+            
+            uint32_t silhouetteVertexCount;
+            uint32_t silhouette;
+            uint32_t clippedVertexCount;
+            uint32_t edgeVisibilityMismatch;
+
+            uint32_t vertices[6];
+        };
+
+        struct PushConstants
+        {
+            float32_t3x4 modelMatrix;
+            float32_t4 viewport;
+        };
 
+        static const float32_t3 colorLUT[27] = {
+            float32_t3(0, 0, 0), 		float32_t3(1, 1, 1), 		float32_t3(0.5, 0.5, 0.5),
+            float32_t3(1, 0, 0), 		float32_t3(0, 1, 0), 		float32_t3(0, 0, 1),
+            float32_t3(1, 1, 0), 		float32_t3(1, 0, 1), 		float32_t3(0, 1, 1),
+            float32_t3(1, 0.5, 0), 		float32_t3(1, 0.65, 0), 	float32_t3(0.8, 0.4, 0),
+            float32_t3(1, 0.4, 0.7), 	float32_t3(1, 0.75, 0.8), 	float32_t3(0.7, 0.1, 0.3),
+            float32_t3(0.5, 0, 0.5), 	float32_t3(0.6, 0.4, 0.8), 	float32_t3(0.3, 0, 0.5),
+            float32_t3(0, 0.5, 0), 		float32_t3(0.5, 1, 0), 		float32_t3(0, 0.5, 0.25),
+            float32_t3(0, 0, 0.5), 		float32_t3(0.3, 0.7, 1), 	float32_t3(0, 0.4, 0.6),
+            float32_t3(0.6, 0.4, 0.2), 	float32_t3(0.8, 0.7, 0.3), 	float32_t3(0.4, 0.3, 0.1)
+        };
 
+#ifndef __HLSL_VERSION
+		static const char* colorNames[27] = {"Black",
+			"White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan",
+			"Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple",
+			"Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown",
+			"Tan/Beige", "Dark Brown"
+		};
+#endif // __HLSL_VERSION
+    }
+}
 #endif // _SOLID_ANGLE_VIS_COMMON_HLSL_
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl
new file mode 100644
index 000000000..4031e048f
--- /dev/null
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl
@@ -0,0 +1,23 @@
+#ifndef _UTILS_HLSL_
+#define _UTILS_HLSL_
+
+// TODO: implemented somewhere else?
+// Bit rotation helpers
+uint32_t rotl(uint32_t value, uint32_t bits, uint32_t width)
+{
+    bits = bits % width;
+    uint32_t mask = (1u << width) - 1u;
+    value &= mask;
+    return ((value << bits) | (value >> (width - bits))) & mask;
+}
+
+uint32_t rotr(uint32_t value, uint32_t bits, uint32_t width)
+{
+    bits = bits % width;
+    uint32_t mask = (1u << width) - 1u;
+    value &= mask;
+    return ((value >> bits) | (value << (width - bits))) & mask;
+}
+
+
+#endif // _UTILS_HLSL_
diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
index 105b2f757..538173223 100644
--- a/72_SolidAngleVisualizer/include/transform.hpp
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -1,27 +1,21 @@
 #ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
 #define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
 
-
 #include "nbl/ui/ICursorControl.h"
-
 #include "nbl/ext/ImGui/ImGui.h"
-
 #include "imgui/imgui_internal.h"
 #include "imguizmo/ImGuizmo.h"
 
-
 struct TransformRequestParams
 {
-	float camDistance = 8.f;
 	uint8_t sceneTexDescIx = ~0;
-	bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false;
+	bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = true;
 };
 
 struct TransformReturnInfo
 {
 	nbl::hlsl::uint16_t2 sceneResolution = { 1, 1 };
-	bool isGizmoWindowHovered;
-	bool isGizmoBeingUsed;
+	bool allowCameraMovement = false;
 };
 
 TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params)
@@ -35,7 +29,7 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 	static bool boundSizing = false;
 	static bool boundSizingSnap = false;
 
-	ImGui::Text("Press T/R/G to change gizmo mode");
+	ImGui::Text("Use gizmo (T/R/G) or ViewManipulate widget to transform the cube");
 
 	if (params.editTransformDecomposition)
 	{
@@ -55,11 +49,13 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 			mCurrentGizmoOperation = ImGuizmo::SCALE;
 		if (ImGui::RadioButton("Universal", mCurrentGizmoOperation == ImGuizmo::UNIVERSAL))
 			mCurrentGizmoOperation = ImGuizmo::UNIVERSAL;
+
+		// For UI editing, decompose temporarily
 		float matrixTranslation[3], matrixRotation[3], matrixScale[3];
 		ImGuizmo::DecomposeMatrixToComponents(matrix, matrixTranslation, matrixRotation, matrixScale);
-		ImGui::InputFloat3("Tr", matrixTranslation);
-		ImGui::InputFloat3("Rt", matrixRotation);
-		ImGui::InputFloat3("Sc", matrixScale);
+		ImGui::DragFloat3("Tr", matrixTranslation, 0.01f);
+		ImGui::DragFloat3("Rt", matrixRotation, 0.01f);
+		ImGui::DragFloat3("Sc", matrixScale, 0.01f);
 		ImGuizmo::RecomposeMatrixFromComponents(matrixTranslation, matrixRotation, matrixScale, matrix);
 
 		if (mCurrentGizmoOperation != ImGuizmo::SCALE)
@@ -101,17 +97,18 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 	ImGuiIO& io = ImGui::GetIO();
 	float viewManipulateRight = io.DisplaySize.x;
 	float viewManipulateTop = 0;
+	bool isWindowHovered = false;
 	static ImGuiWindowFlags gizmoWindowFlags = 0;
 
 	/*
-		for the "useWindow" case we just render to a gui area, 
+		for the "useWindow" case we just render to a gui area,
 		otherwise to fake full screen transparent window
 
-		note that for both cases we make sure gizmo being 
-		rendered is aligned to our texture scene using 
-        imgui  "cursor" screen positions
+		note that for both cases we make sure gizmo being
+		rendered is aligned to our texture scene using
+		imgui  "cursor" screen positions
 	*/
-// TODO: this shouldn't be handled here I think
+	// TODO: this shouldn't be handled here I think
 	SImResourceInfo info;
 	info.textureID = params.sceneTexDescIx;
 	info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER;
@@ -128,17 +125,17 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 		ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
 		ImVec2 windowPos = ImGui::GetWindowPos();
 		ImVec2 cursorPos = ImGui::GetCursorScreenPos();
+		isWindowHovered = ImGui::IsWindowHovered();
 
 		ImGui::Image(info, contentRegionSize);
 		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
-		retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y};
-		retval.isGizmoWindowHovered = ImGui::IsWindowHovered();
+		retval.sceneResolution = { contentRegionSize.x,contentRegionSize.y };
 
 		viewManipulateRight = cursorPos.x + contentRegionSize.x;
 		viewManipulateTop = cursorPos.y;
 
 		ImGuiWindow* window = ImGui::GetCurrentWindow();
-		gizmoWindowFlags = (ImGui::IsWindowHovered() && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0);
+		gizmoWindowFlags = (isWindowHovered && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0);
 	}
 	else
 	{
@@ -149,21 +146,45 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 
 		ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
 		ImVec2 cursorPos = ImGui::GetCursorScreenPos();
+		isWindowHovered = ImGui::IsWindowHovered();
 
 		ImGui::Image(info, contentRegionSize);
 		ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
-		retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y};
-		retval.isGizmoWindowHovered = ImGui::IsWindowHovered();
+		retval.sceneResolution = { contentRegionSize.x,contentRegionSize.y };
 
 		viewManipulateRight = cursorPos.x + contentRegionSize.x;
 		viewManipulateTop = cursorPos.y;
 	}
 
+	// Standard Manipulate gizmo - let ImGuizmo modify the matrix directly
 	ImGuizmo::Manipulate(cameraView, cameraProjection, mCurrentGizmoOperation, mCurrentGizmoMode, matrix, NULL, useSnap ? &snap[0] : NULL, boundSizing ? bounds : NULL, boundSizingSnap ? boundsSnap : NULL);
-	retval.isGizmoBeingUsed = ImGuizmo::IsOver() || (ImGuizmo::IsUsing() && ImGui::IsMouseDown(ImGuiMouseButton_Left));
 
-	if(params.enableViewManipulate)
-		ImGuizmo::ViewManipulate(cameraView, params.camDistance, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010);
+	retval.allowCameraMovement = isWindowHovered && !ImGuizmo::IsUsing();
+
+	// ViewManipulate for rotating the view
+	if (params.enableViewManipulate)
+	{
+		// Store original translation and scale before ViewManipulate
+		// Decompose original matrix
+		nbl::hlsl::float32_t3 translation, rotation, scale;
+		ImGuizmo::DecomposeMatrixToComponents(matrix, &translation.x, &rotation.x, &scale.x);
+
+		float temp[16];
+		nbl::hlsl::float32_t3 baseTranslation(0.0f);
+		nbl::hlsl::float32_t3 baseScale(1.0f);
+		ImGuizmo::RecomposeMatrixFromComponents(&baseTranslation.x, &rotation.x, &baseScale.x, temp);
+		// Manipulate rotation only
+		ImGuizmo::ViewManipulate(temp, 1.0f, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010);
+
+		// Extract rotation from manipulated temp
+		nbl::hlsl::float32_t3 newRot;
+		ImGuizmo::DecomposeMatrixToComponents(temp, &baseTranslation.x, &newRot.x, &baseScale.x);
+
+		// Recompose original matrix with new rotation but keep translation & scale
+		ImGuizmo::RecomposeMatrixFromComponents(&translation.x, &newRot.x, &scale.x, matrix);
+
+		retval.allowCameraMovement &= isWindowHovered && !ImGuizmo::IsUsingViewManipulate();
+	}
 
 	ImGui::End();
 	ImGui::PopStyleColor();
@@ -171,4 +192,4 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti
 	return retval;
 }
 
-#endif // __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__
\ No newline at end of file
+#endif // _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index e9266520d..1c52547af 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -211,7 +211,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					return shader;
 				};
 
-			auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources());
 			ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get());
 			if (!fsTriProtoPPln)
 				return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
@@ -232,17 +231,73 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				.size = sizeof(PushConstants)
 			} };
 
-			auto visualizationLayout = m_device->createPipelineLayout(
-				ranges,
-				nullptr,
-				nullptr,
-				nullptr,
-				nullptr
+			nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
+				{
+					.binding = 0,
+					.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+					.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+					.stageFlags = ShaderStage::ESS_FRAGMENT,
+					.count = 1
+				}
+			};
+			smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout(bindings);
+			if (!dsLayout)
+				logFail("Failed to create a Descriptor Layout!\n");
+
+
+			auto visualizationLayout = m_device->createPipelineLayout(ranges
+#if DEBUG_DATA
+				, dsLayout
+#endif
 			);
 			m_visualizationPipeline = fsTriProtoPPln.createPipeline(fragSpec, visualizationLayout.get(), m_solidAngleRenderpass.get());
 			if (!m_visualizationPipeline)
 				return logFail("Could not create Graphics Pipeline!");
 
+			// Allocate the memory
+#if DEBUG_DATA
+			{
+				constexpr size_t BufferSize = sizeof(ResultData);
+
+				nbl::video::IGPUBuffer::SCreationParams params = {};
+				params.size = BufferSize;
+				params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;
+				m_outputStorageBuffer = m_device->createBuffer(std::move(params));
+				if (!m_outputStorageBuffer)
+					logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
+
+				m_outputStorageBuffer->setObjectDebugName("ResultData output buffer");
+
+				nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputStorageBuffer->getMemoryReqs();
+				reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+				m_allocation = m_device->allocate(reqs, m_outputStorageBuffer.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
+				if (!m_allocation.isValid())
+					logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+				assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get());
+				smart_refctd_ptr<nbl::video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 });
+
+				m_ds = pool->createDescriptorSet(std::move(dsLayout));
+				{
+					IGPUDescriptorSet::SDescriptorInfo info[1];
+					info[0].desc = smart_refctd_ptr(m_outputStorageBuffer);
+					info[0].info.buffer = { .offset = 0,.size = BufferSize };
+					IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
+						{.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info}
+					};
+					m_device->updateDescriptorSets(writes, {});
+				}
+			}
+
+			if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ))
+				logFail("Failed to map the Device Memory!\n");
+
+			// if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
+			const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize());
+			if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memoryRange);
+#endif
 		}
 
 		// Create ImGUI
@@ -336,6 +391,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} };
 		if (m_solidAngleViewFramebuffer)
 		{
+#if DEBUG_DATA
+			asset::SBufferRange<IGPUBuffer> range
+			{
+				.offset = 0,
+				.size = m_outputStorageBuffer->getSize(),
+				.buffer = m_outputStorageBuffer
+			};
+			cb->fillBuffer(range, 0u);
+#endif
 			auto creationParams = m_solidAngleViewFramebuffer->getCreationParameters();
 			cb->beginDebugMarker("Draw Circle View Frame");
 			{
@@ -361,11 +425,17 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				auto pipeline = m_visualizationPipeline;
 				cb->bindGraphicsPipeline(pipeline.get());
 				cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc);
-				//cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 3, 1, &ds);
+				cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get());
 				ext::FullScreenTriangle::recordDrawCall(cb);
 			}
 			cb->endRenderPass();
 			cb->endDebugMarker();
+
+#if DEBUG_DATA
+			m_device->waitIdle();
+			std::memcpy(&m_GPUOutResulData, static_cast<ResultData*>(m_allocation.memory->getMappedPointer()), sizeof(ResultData));
+			m_device->waitIdle();
+#endif
 		}
 		// draw main view
 		if (m_mainViewFramebuffer)
@@ -557,6 +627,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				{
 					if (interface.move)
 						camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl
+					else
+						camera.mouseKeysUp();
 
 					for (const auto& e : events) // here capture
 					{
@@ -713,6 +785,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		cb->setViewport(0u, 1u, &viewport);
 	}
 
+#if DEBUG_DATA
+	~SolidAngleVisualizer() override
+	{
+		m_allocation.memory->unmap();
+	}
+#endif
+
 	// Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers
 	constexpr static inline uint32_t MaxFramesInFlight = 3u;
 	constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT;
@@ -721,13 +800,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 	// we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes
 	constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight;
 
-	constexpr static inline float32_t4x4 OBBModelMatrixDefault
-	{
-		1.0f, 0.0f, 0.0f, 0.0f,
-		0.0f, 1.0f, 0.0f, 0.0f,
-		0.0f, 0.0f, 1.0f, 0.0f,
-		0.0f, 0.0f, 3.0f, 1.0f
-	};
+	static inline ResultData m_GPUOutResulData;
 	//
 	smart_refctd_ptr<CGeometryCreatorScene> m_scene;
 	smart_refctd_ptr<IGPURenderpass> m_solidAngleRenderpass;
@@ -737,6 +810,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 	smart_refctd_ptr<IGPUFramebuffer> m_mainViewFramebuffer;
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_visualizationPipeline;
 	//
+	nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {};
+	smart_refctd_ptr<IGPUBuffer> m_outputStorageBuffer;
+	smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_ds = nullptr;
 	smart_refctd_ptr<ISemaphore> m_semaphore;
 	uint64_t m_realFrameIx = 0;
 	std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
@@ -794,7 +870,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			//	transformParams.useWindow = true;
 
 			ImGui::Text("Camera");
-			bool viewDirty = false;
 
 			if (ImGui::RadioButton("LH", isLH))
 				isLH = true;
@@ -827,13 +902,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 			ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f);
 			ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f);
 
-			viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f);
 
-			if (viewDirty || firstFrame)
+			if (firstFrame)
 			{
 				camera.setPosition(cameraIntialPosition);
 				camera.setTarget(cameraInitialTarget);
-				camera.setBackupUpVector(cameraInitialUp);
 				camera.setUpVector(cameraInitialUp);
 
 				camera.recomputeViewMatrix();
@@ -909,45 +982,35 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 			if (ImGui::IsKeyPressed(ImGuiKey_End))
 			{
-				m_OBBModelMatrix = OBBModelMatrixDefault;
+				m_TRS = TRS{};
 			}
 
-			static struct
 			{
-				float32_t4x4 view, projection, model;
-			} imguizmoM16InOut;
+				static struct
+				{
+					float32_t4x4 view, projection, model;
+				} imguizmoM16InOut;
 
-			ImGuizmo::SetID(0u);
+				ImGuizmo::SetID(0u);
 
-			// TODO: camera will return hlsl::float32_tMxN 
-			auto view = *reinterpret_cast<const float32_t3x4*>(camera.getViewMatrix().pointer());
-			imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view));
+				// TODO: camera will return hlsl::float32_tMxN 
+				auto view = *reinterpret_cast<const float32_t3x4*>(camera.getViewMatrix().pointer());
+				imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view));
 
-			// TODO: camera will return hlsl::float32_tMxN 
-			imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast<const float32_t4x4*>(camera.getProjectionMatrix().pointer()));
-			imguizmoM16InOut.model = m_OBBModelMatrix;
+				// TODO: camera will return hlsl::float32_tMxN 
+				imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast<const float32_t4x4*>(camera.getProjectionMatrix().pointer()));
+				ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]);
 
-			{
 				if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates
 					imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/	
 
 				transformParams.editTransformDecomposition = true;
 				mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams);
+				move = mainViewTransformReturnInfo.allowCameraMovement;
 
-				// TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used
-				move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed);
-
+				ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x);
+				ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]);
 			}
-
-			// to Nabla + update camera & model matrices
-			// TODO: make it more nicely, extract:
-			// - Position by computing inverse of the view matrix and grabbing its translation
-			// - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position
-			// But then set the view matrix this way anyway, because up-vector may not be compatible
-			//const auto& view = camera.getViewMatrix();
-			//const_cast<core::matrix3x4SIMD&>(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok)
-			m_OBBModelMatrix = imguizmoM16InOut.model;
-
 			// object meta display
 			//{
 			//	ImGui::Begin("Object");
@@ -964,12 +1027,193 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 				ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
 				solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast<uint16_t>(contentRegionSize.x), static_cast<uint16_t>(contentRegionSize.y));
-				solidAngleViewTransformReturnInfo.isGizmoBeingUsed = false; // not used in this view
-				solidAngleViewTransformReturnInfo.isGizmoWindowHovered = false; // not used in this view
+				solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view
 				ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize);
 				ImGui::End();
 			}
 
+			// Show data coming from GPU
+#if DEBUG_DATA
+			{
+				if (ImGui::Begin("Result Data"))
+				{
+					auto drawColorField = [&](const char* fieldName, uint32_t index)
+						{
+							ImGui::Text("%s: %u", fieldName, index);
+
+							if (index >= 27)
+							{
+								ImGui::SameLine();
+								ImGui::Text("<invalid>");
+								return;
+							}
+
+							const auto& c = colorLUT[index]; // uses the combined LUT we made earlier
+
+							ImGui::SameLine();
+
+							// Color preview button
+							ImGui::ColorButton(
+								fieldName,
+								ImVec4(c.r, c.g, c.b, 1.0f),
+								0,
+								ImVec2(20, 20)
+							);
+
+							ImGui::SameLine();
+							ImGui::Text("%s", colorNames[index]);
+						};
+
+					// Vertices
+					if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen))
+					{
+						for (uint32_t i = 0; i < 6; ++i)
+						{
+							if (i < m_GPUOutResulData.silhouetteVertexCount)
+							{
+								ImGui::Text("corners[%u]", i);
+								ImGui::SameLine();
+								drawColorField(":", m_GPUOutResulData.vertices[i]);
+								ImGui::SameLine();
+								static const float32_t3 constCorners[8] = {
+									float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1,  1, -1), float32_t3(1,  1, -1),
+									float32_t3(-1, -1,  1), float32_t3(1, -1,  1), float32_t3(-1,  1,  1), float32_t3(1,  1,  1)
+								};
+								float32_t3 vertexLocation = constCorners[m_GPUOutResulData.vertices[i]];
+								ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z);
+							}
+							else
+							{
+								ImGui::Text("corners[%u] ::  ", i);
+								ImGui::SameLine();
+								ImGui::ColorButton(
+									"<unused>",
+									ImVec4(0.0f, 0.0f, 0.0f, 0.0f),
+									0,
+									ImVec2(20, 20)
+								);
+								ImGui::SameLine();
+								ImGui::Text("<unused>");
+
+							}
+
+						}
+					}
+
+					if (ImGui::CollapsingHeader("Color LUT Map"))
+					{
+						for (int i = 0; i < 27; i++)
+							drawColorField(" ", i);
+					}
+
+					ImGui::Separator();
+
+					// Silhouette info
+					drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex);
+
+					ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount);
+					ImGui::Text("silhouette Clipped VertexCount: %u", m_GPUOutResulData.clippedVertexCount);
+					ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false");
+
+					{
+						float32_t3 xAxis = m_OBBModelMatrix[0].xyz;
+						float32_t3 yAxis = m_OBBModelMatrix[1].xyz;
+						float32_t3 zAxis = m_OBBModelMatrix[2].xyz;
+
+						float32_t3 nx = normalize(xAxis);
+						float32_t3 ny = normalize(yAxis);
+						float32_t3 nz = normalize(zAxis);
+
+						const float epsilon = 1e-4;
+						bool hasSkew = false;
+						if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon)
+							hasSkew = true;
+						ImGui::Text("Matrix Has Skew: %s", hasSkew ? "true" : "false");
+					}
+
+					static bool modalShown = false;
+					static uint32_t lastSilhouetteIndex = ~0u;
+
+					// Reset modal flag if silhouette configuration changed
+					if (m_GPUOutResulData.silhouetteIndex != lastSilhouetteIndex)
+					{
+						modalShown = false;
+						lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex;
+					}
+
+					if (!m_GPUOutResulData.edgeVisibilityMismatch)
+					{
+						// Reset flag when mismatch is cleared
+						modalShown = false;
+					}
+					if (m_GPUOutResulData.edgeVisibilityMismatch && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care
+					{
+						// Open modal popup only once per configuration
+						ImGui::OpenPopup("Edge Visibility Mismatch Warning");
+						modalShown = true;
+					}
+
+					// Modal popup
+					if (ImGui::BeginPopupModal("Edge Visibility Mismatch Warning", NULL, ImGuiWindowFlags_AlwaysAutoResize))
+					{
+						ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!");
+						ImGui::Separator();
+
+						ImGui::Text("The silhouette lookup table (LUT) does not match the computed edge visibility.");
+						ImGui::Text("This indicates the pre-computed silhouette data may be incorrect.");
+						ImGui::Spacing();
+
+						// Show configuration info
+						ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex);
+						ImGui::TextWrapped("Region: (%d, %d, %d)",
+							m_GPUOutResulData.region.x,
+							m_GPUOutResulData.region.y,
+							m_GPUOutResulData.region.z);
+						ImGui::Spacing();
+
+						ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch);
+
+						// Show which specific vertices are mismatched
+						ImGui::Text("Vertices involved in mismatched edges:");
+						ImGui::Indent();
+						for (int i = 0; i < 8; i++)
+						{
+							if (m_GPUOutResulData.edgeVisibilityMismatch & (1u << i))
+							{
+								ImGui::BulletText("Vertex %d", i);
+							}
+						}
+						ImGui::Unindent();
+						ImGui::Spacing();
+
+						if (ImGui::Button("OK", ImVec2(120, 0)))
+						{
+							ImGui::CloseCurrentPopup();
+						}
+
+						ImGui::EndPopup();
+					}
+
+					ImGui::Separator();
+
+					// Region (uint32_t3)
+					ImGui::Text("region: (%u, %u, %u)",
+						m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z);
+
+					ImGui::Separator();
+
+					// Silhouette mask printed in binary
+					char buf[33];
+					for (int i = 0; i < 32; i++)
+						buf[i] = (m_GPUOutResulData.silhouette & (1u << (31 - i))) ? '1' : '0';
+					buf[32] = '\0';
+
+					ImGui::Text("silhouette: 0x%08X", m_GPUOutResulData.silhouette);
+					ImGui::Text("binary: %s", buf);
+				}
+				ImGui::End();
+			}
+#endif
 			// view matrices editor
 			{
 				ImGui::Begin("Matrices");
@@ -995,6 +1239,32 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 							ImGui::Separator();
 					};
 
+				static RandomSampler rng(69); // Initialize RNG with seed
+				if (ImGui::Button("Randomize Translation"))
+				{
+					m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
+				}
+				ImGui::SameLine();
+
+				if (ImGui::Button("Randomize Rotation"))
+				{
+					m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f));
+				}
+				ImGui::SameLine();
+
+				if (ImGui::Button("Randomize Scale"))
+				{
+					m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+				}
+
+				ImGui::SameLine();
+				if (ImGui::Button("Randomize All"))
+				{
+					m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
+					m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f));
+					m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+				}
+
 				addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]);
 				addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, camera.getViewMatrix().pointer());
 				addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, camera.getProjectionMatrix().pointer(), false);
@@ -1071,6 +1341,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				ImGui::End();
 			}
 			ImGui::End();
+
+			ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &m_OBBModelMatrix[0][0]);
 		}
 
 		smart_refctd_ptr<ext::imgui::UI> imGUI;
@@ -1085,15 +1357,22 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 		};
 		SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = { SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value };
 		//
-		Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD());
+		Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, core::matrix4SIMD(), 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f));
 		// mutables
-		float32_t4x4 m_OBBModelMatrix = OBBModelMatrixDefault;
+		struct TRS // Source of truth
+		{
+			float32_t3 translation{ 0.0f, 0.0f, 3.0f };
+			float32_t3 rotation{ 0.0f };  // MUST stay orthonormal
+			float32_t3 scale{ 1.0f };
+		} m_TRS;
+		float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS
 
 		//std::string_view objectName;
 		TransformRequestParams transformParams;
 		TransformReturnInfo mainViewTransformReturnInfo;
 		TransformReturnInfo solidAngleViewTransformReturnInfo;
 
+
 		const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f };
 		const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f };
 		const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f };
diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp
index e5f077e46..c61f93333 100644
--- a/common/include/nbl/examples/cameras/CCamera.hpp
+++ b/common/include/nbl/examples/cameras/CCamera.hpp
@@ -302,6 +302,11 @@ class Camera
 		lastVirtualUpTimeStamp = nextPresentationTimeStamp;
 	}
 
+	// TODO: temporary but a good fix for the camera events when mouse stops dragging gizmo
+	void mouseKeysUp()
+	{
+		mouseDown = false;
+	}
 private:
 
 	inline void initDefaultKeysMap() { mapKeysToWASD(); }

From 2e306fc96bfae85a9669ad552751cece33d1b383 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Thu, 18 Dec 2025 01:10:56 +0300
Subject: [PATCH 55/57] better (still not perfect) manual inverse of rotation
 matrix

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 22 ++++++-------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index cd291dbd2..bf58e3231 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -228,21 +228,13 @@ void computeCubeGeo()
 	float3x3 upper3x3 = (float3x3)pc.modelMatrix;
 
 #if 1
-	// Compute reciprocal scales
-	float3 rcpScales = rsqrt(float3(
-		dot(upper3x3[0], upper3x3[0]),
-		dot(upper3x3[1], upper3x3[1]),
-		dot(upper3x3[2], upper3x3[2])
-	));
-
-	// Build inverse-rotation-only matrix
-	float3x3 invRot;
-	invRot[0] = upper3x3[0] * rcpScales.x;
-	invRot[1] = upper3x3[1] * rcpScales.y;
-	invRot[2] = upper3x3[2] * rcpScales.z;
-
-	// Project center into OBB local space
-	float3 normalizedProj = mul(invRot, obbCenter);
+float3 rcpScales = rsqrt(float3(
+    dot(upper3x3[0], upper3x3[0]),
+    dot(upper3x3[1], upper3x3[1]),
+    dot(upper3x3[2], upper3x3[2])
+));
+
+float3 normalizedProj = mul(transpose(upper3x3), obbCenter) * rcpScales;
 #else
 	float3 normalizedProj = mul(inverse(upper3x3), obbCenter);
 #endif

From 12486d4670f0453722351814996d91f198a16749 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Thu, 18 Dec 2025 02:24:41 +0300
Subject: [PATCH 56/57] Fixed faster inverse of rotation matrix, thanks Matt!

---
 .../hlsl/SolidAngleVis.frag.hlsl              | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index bf58e3231..01d166aac 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -223,21 +223,20 @@ void computeCubeGeo()
 
 	computeCubeGeo();
 
-	float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz;
+    float4x3 columnModel = transpose(pc.modelMatrix);
 
-	float3x3 upper3x3 = (float3x3)pc.modelMatrix;
+	float3 obbCenter = columnModel[3].xyz;
 
-#if 1
-float3 rcpScales = rsqrt(float3(
-    dot(upper3x3[0], upper3x3[0]),
-    dot(upper3x3[1], upper3x3[1]),
-    dot(upper3x3[2], upper3x3[2])
-));
+	float3x3 upper3x3 = (float3x3)columnModel;
+
+    float3 rcpScales = rcp(float3(
+        dot(upper3x3[0], upper3x3[0]),
+        dot(upper3x3[1], upper3x3[1]),
+        dot(upper3x3[2], upper3x3[2])
+    ));
+
+    float3 normalizedProj = mul(upper3x3, obbCenter) * rcpScales;
 
-float3 normalizedProj = mul(transpose(upper3x3), obbCenter) * rcpScales;
-#else
-	float3 normalizedProj = mul(inverse(upper3x3), obbCenter);
-#endif
 	int3 region = int3(
 		normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1),
 		normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1),

From 1961a898fd0a91c8e4d5c1a3fcb02df9142e8388 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Sat, 20 Dec 2025 10:18:48 +0300
Subject: [PATCH 57/57] Fast clipping, less branches, also

- More debug data going to imgui
- Little bit of shader code refactoring
- "Revert to last" button to go back to last random transformation of the OBB
- Added getVertexZNeg() and getVertex() preprocessor branches for faster versions
---
 .../app_resources/hlsl/Drawing.hlsl           | 122 ++--
 .../hlsl/SolidAngleVis.frag.hlsl              | 639 ++++++++++--------
 .../app_resources/hlsl/common.hlsl            |  42 +-
 72_SolidAngleVisualizer/main.cpp              |  90 ++-
 4 files changed, 532 insertions(+), 361 deletions(-)

diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
index c3cb5befa..f3f1b4e96 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl
@@ -16,79 +16,124 @@ float2 sphereToCircle(float3 spherePoint)
     }
 }
 
-float4 drawGreatCircleArc(float3 fragPos, float3 points[2], int visibility, float aaWidth)
+float drawGreatCircleArc(float3 fragPos, float3 points[2], float aaWidth, float width = 0.01f)
 {
-    if (visibility == 0) return float4(0,0,0,0);
-    
     float3 v0 = normalize(points[0]);
     float3 v1 = normalize(points[1]);
     float3 p = normalize(fragPos);
-    
+
     float3 arcNormal = normalize(cross(v0, v1));
     float dist = abs(dot(p, arcNormal));
-    
+
     float dotMid = dot(v0, v1);
     bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid);
-    
-    if (!onArc) return float4(0,0,0,0);
-    
+
+    if (!onArc)
+        return 0.0f;
+
     float avgDepth = (length(points[0]) + length(points[1])) * 0.5f;
     float depthScale = 3.0f / avgDepth;
-    
-    float baseWidth = (visibility == 1) ? 0.01f : 0.005f;
-    float width = min(baseWidth * depthScale, 0.02f);
-    
+
+    width = min(width * depthScale, 0.02f);
     float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist);
-    
-    float4 edgeColor = (visibility == 1) ? 
-        float4(0.0f, 0.5f, 1.0f, 1.0f) :
-        float4(1.0f, 0.0f, 0.0f, 1.0f);
-    
-    float intensity = (visibility == 1) ? 1.0f : 0.5f;
-    return edgeColor * alpha * intensity;
+
+    return alpha;
 }
 
 float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth)
 {
-    float4 color = float4(0,0,0,0);
+    float4 color = 0;
     float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1);
-    
+
+    NBL_UNROLL
     for (int i = 0; i < 12; i++)
     {
-        if ((silEdgeMask & (1u << i)) == 0)
+        // skip silhouette edges
+        if (silEdgeMask & (1u << i))
+            continue;
+
+        int2 edge = allEdges[i];
+
+        float3 v0 = normalize(getVertex(edge.x));
+        float3 v1 = normalize(getVertex(edge.y));
+
+        bool neg0 = v0.z < 0.0f;
+        bool neg1 = v1.z < 0.0f;
+
+        // fully hidden
+        if (neg0 && neg1)
+            continue;
+
+        float3 p0 = v0;
+        float3 p1 = v1;
+
+        // clip if needed
+        if (neg0 ^ neg1)
         {
-            int2 edge = allEdges[i];
-            float3 edgePoints[2] = { corners[edge.x], corners[edge.y] };
-            float4 edgeContribution = drawGreatCircleArc(spherePos, edgePoints, 1, aaWidth);
-            color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a);
+            float t = v0.z / (v0.z - v1.z);
+            float3 clip = normalize(lerp(v0, v1, t));
+
+            p0 = neg0 ? clip : v0;
+            p1 = neg1 ? clip : v1;
         }
+
+        float3 pts[2] = {p0, p1};
+        float4 c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.005f);
+        color += float4(hiddenEdgeColor * c.a, c.a);
     }
+
     return color;
 }
 
 float4 drawCorners(float3 spherePos, float2 p, float aaWidth)
 {
-    float4 color = float4(0,0,0,0);
+    float4 color = 0;
+
+    float dotSize = 0.02f;
+    float innerDotSize = dotSize * 0.5f;
+
     for (int i = 0; i < 8; i++)
     {
-        float3 corner3D = normalize(corners[i]);
+        float3 corner3D = normalize(getVertex(i));
         float2 cornerPos = sphereToCircle(corner3D);
+
         float dist = length(p - cornerPos);
-        float dotSize = 0.02f;
-        float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist);
-        if (dotAlpha > 0.0f)
+
+        // outer dot
+        float outerAlpha = 1.0f - smoothstep(dotSize - aaWidth,
+                                             dotSize + aaWidth,
+                                             dist);
+
+        if (outerAlpha <= 0.0f)
+            continue;
+
+        float3 dotColor = colorLUT[i];
+        color += float4(dotColor * outerAlpha, outerAlpha);
+
+        // -------------------------------------------------
+        // inner black dot for hidden corners
+        // -------------------------------------------------
+        if (corner3D.z < 0.0f)
         {
-            float3 dotColor = colorLUT[i];
-            color += float4(dotColor * dotAlpha, dotAlpha);
+            float innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth,
+                                                 innerDotSize + aaWidth,
+                                                 dist);
+
+            // ensure it stays inside the outer dot
+            innerAlpha *= outerAlpha;
+
+            float3 innerColor = float3(0.0, 0.0, 0.0);
+            color -= float4(innerAlpha.xxx, 0.0f);
         }
     }
+
     return color;
 }
 
 float4 drawRing(float2 p, float aaWidth)
 {
     float positionLength = length(p);
-    float ringWidth = 0.002f;
+    float ringWidth = 0.003f;
     float ringDistance = abs(positionLength - CIRCLE_RADIUS);
     float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance);
     return ringAlpha * float4(1, 1, 1, 1);
@@ -114,10 +159,12 @@ int getEdgeVisibility(int edgeIdx)
     bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2);
 
     // Silhouette: exactly one face visible
-    if (visible1 != visible2) return 1;
+    if (visible1 != visible2)
+        return 1;
 
     // Inner edge: both faces visible
-    if (visible1 && visible2) return 2;
+    if (visible1 && visible2)
+        return 2;
 
     // Hidden edge: both faces hidden
     return 0;
@@ -162,11 +209,10 @@ void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSil
             }
         }
     }
-    
+
     // Simple Write (assuming all fragments calculate the same result)
     InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator);
 }
 #endif
 
-
 #endif // _DEBUG_HLSL_
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
index 01d166aac..d7ceed943 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl
@@ -15,351 +15,438 @@ static const float CIRCLE_RADIUS = 0.5f;
 // --- Geometry Utils ---
 
 static const float3 constCorners[8] = {
-	float3(-1, -1, -1), float3(1, -1, -1), float3(-1,  1, -1), float3(1,  1, -1),
-	float3(-1, -1,  1), float3(1, -1,  1), float3(-1,  1,  1), float3(1,  1,  1)
-};
+    float3(-1, -1, -1), float3(1, -1, -1), float3(-1, 1, -1), float3(1, 1, -1),
+    float3(-1, -1, 1), float3(1, -1, 1), float3(-1, 1, 1), float3(1, 1, 1)};
 
 static const int2 allEdges[12] = {
-	{0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis
-	{0, 2}, {1, 3}, {4, 6}, {5, 7}, // Y axis
-	{0, 4}, {1, 5}, {2, 6}, {3, 7}  // Z axis
+    {0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis
+    {0, 2},
+    {1, 3},
+    {4, 6},
+    {5, 7}, // Y axis
+    {0, 4},
+    {1, 5},
+    {2, 6},
+    {3, 7} // Z axis
 };
 
 // Adjacency of edges to faces
 // Corrected Adjacency of edges to faces
 static const int2 edgeToFaces[12] = {
-	// Edge Index:  | allEdges[i]  | Shared Faces: 
-
-	/* 0 (0-1) */   {4, 0},         // Y- (4) and Z- (0)
-	/* 1 (2-3) */   {5, 0},         // Y+ (5) and Z- (0)
-	/* 2 (4-5) */   {4, 1},         // Y- (4) and Z+ (1)
-	/* 3 (6-7) */   {5, 1},         // Y+ (5) and Z+ (1)
-
-	/* 4 (0-2) */   {2, 0},         // X- (2) and Z- (0)
-	/* 5 (1-3) */   {3, 0},         // X+ (3) and Z- (0)
-	/* 6 (4-6) */   {2, 1},         // X- (2) and Z+ (1)
-	/* 7 (5-7) */   {3, 1},         // X+ (3) and Z+ (1)
-
-	/* 8 (0-4) */   {2, 4},         // X- (2) and Y- (4)
-	/* 9 (1-5) */   {3, 4},         // X+ (3) and Y- (4)
-	/* 10 (2-6) */  {2, 5},         // X- (2) and Y+ (5)
-	/* 11 (3-7) */  {3, 5}          // X+ (3) and Y+ (5)
+    // Edge Index:  | allEdges[i]  | Shared Faces:
+
+    /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0)
+    /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0)
+    /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1)
+    /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1)
+
+    /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0)
+    /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0)
+    /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1)
+    /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1)
+
+    /* 8 (0-4) */ {2, 4},  // X- (2) and Y- (4)
+    /* 9 (1-5) */ {3, 4},  // X+ (3) and Y- (4)
+    /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5)
+    /* 11 (3-7) */ {3, 5}  // X+ (3) and Y+ (5)
 };
 static float3 corners[8];
 static float3 faceCenters[6] = {
-	float3(0,0,0), float3(0,0,0), float3(0,0,0),
-	float3(0,0,0), float3(0,0,0), float3(0,0,0)
-};
+    float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0),
+    float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0)};
 
 static const float3 localNormals[6] = {
-	float3(0, 0, -1), // Face 0 (Z-)
-	float3(0, 0, 1),  // Face 1 (Z+)
-	float3(-1, 0, 0), // Face 2 (X-)
-	float3(1, 0, 0),  // Face 3 (X+)
-	float3(0, -1, 0), // Face 4 (Y-)
-	float3(0, 1, 0)   // Face 5 (Y+)
+    float3(0, 0, -1), // Face 0 (Z-)
+    float3(0, 0, 1),  // Face 1 (Z+)
+    float3(-1, 0, 0), // Face 2 (X-)
+    float3(1, 0, 0),  // Face 3 (X+)
+    float3(0, -1, 0), // Face 4 (Y-)
+    float3(0, 1, 0)   // Face 5 (Y+)
 };
 
-
 // TODO: unused, remove later
 // Vertices are ordered CCW relative to the camera view.
 static const int silhouettes[27][7] = {
-	{6, 1, 3, 2, 6, 4, 5},      // 0: Black
-	{6, 2, 6, 4, 5, 7, 3},      // 1: White 
-	{6, 0, 4, 5, 7, 3, 2},      // 2: Gray 
-	{6, 1, 3, 7, 6, 4, 5,},     // 3: Red 
-	{4, 4, 5, 7, 6, -1, -1},    // 4: Green 
-	{6, 0, 4, 5, 7, 6, 2},      // 5: Blue 
-	{6, 0, 1, 3, 7, 6, 4},      // 6: Yellow 
-	{6, 0, 1, 5, 7, 6, 4},      // 7: Magenta              
-	{6, 0, 1, 5, 7, 6, 2},      // 8: Cyan 
-	{6, 1, 3, 2, 6, 7, 5},      // 9: Orange
-	{4, 2, 6, 7, 3, -1, -1},    // 10: Light Orange
-	{6, 0, 4, 6, 7, 3, 2},      // 11: Dark Orange
-	{4, 1, 3, 7, 5, -1, -1},    // 12: Pink
-	{6, 0, 4, 6, 7, 3, 2},      // 13: Light Pink
-	{4, 0, 4, 6, 2, -1, -1},    // 14: Deep Rose
-	{6, 0, 1, 3, 7, 5, 4},      // 15: Purple
-	{4, 0, 1, 5, 4, -1, -1},    // 16: Light Purple
-	{6, 0, 1, 5, 4, 6, 2},      // 17: Indigo
-	{6, 0, 2, 6, 7, 5, 1},      // 18: Dark Green
-	{6, 0, 2, 6, 7, 3, 1},      // 19: Lime
-	{6, 0, 4, 6, 7, 3, 1},      // 20: Forest Green
-	{6, 0, 2, 3, 7, 5, 1},      // 21: Navy
-	{4, 0, 2, 3, 1, -1, -1},    // 22: Sky Blue
-	{6, 0, 4, 6, 2, 3, 1},      // 23: Teal
-	{6, 0, 2, 3, 7, 5, 4},      // 24: Brown
-	{6, 0, 2, 3, 1, 5, 4},      // 25: Tan/Beige
-	{6, 1, 5, 4, 6, 2, 3}       // 26: Dark Brown
+    {6, 1, 3, 2, 6, 4, 5},   // 0: Black
+    {6, 2, 6, 4, 5, 7, 3},   // 1: White
+    {6, 0, 4, 5, 7, 3, 2},   // 2: Gray
+    {6, 1, 3, 7, 6, 4, 5},   // 3: Red
+    {4, 4, 5, 7, 6, -1, -1}, // 4: Green
+    {6, 0, 4, 5, 7, 6, 2},   // 5: Blue
+    {6, 0, 1, 3, 7, 6, 4},   // 6: Yellow
+    {6, 0, 1, 5, 7, 6, 4},   // 7: Magenta
+    {6, 0, 1, 5, 7, 6, 2},   // 8: Cyan
+    {6, 1, 3, 2, 6, 7, 5},   // 9: Orange
+    {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange
+    {6, 0, 4, 6, 7, 3, 2},   // 11: Dark Orange
+    {4, 1, 3, 7, 5, -1, -1}, // 12: Pink
+    {6, 0, 4, 6, 7, 3, 2},   // 13: Light Pink
+    {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose
+    {6, 0, 1, 3, 7, 5, 4},   // 15: Purple
+    {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple
+    {6, 0, 1, 5, 4, 6, 2},   // 17: Indigo
+    {6, 0, 2, 6, 7, 5, 1},   // 18: Dark Green
+    {6, 0, 2, 6, 7, 3, 1},   // 19: Lime
+    {6, 0, 4, 6, 7, 3, 1},   // 20: Forest Green
+    {6, 0, 2, 3, 7, 5, 1},   // 21: Navy
+    {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue
+    {6, 0, 4, 6, 2, 3, 1},   // 23: Teal
+    {6, 0, 2, 3, 7, 5, 4},   // 24: Brown
+    {6, 0, 2, 3, 1, 5, 4},   // 25: Tan/Beige
+    {6, 1, 5, 4, 6, 2, 3}    // 26: Dark Brown
 };
 
 // Binary packed silhouettes
 static const uint32_t binSilhouettes[27] = {
-	0b11000000000000101100110010011001,
-	0b11000000000000011111101100110010,
-	0b11000000000000010011111101100000,
-	0b11000000000000101100110111011001,
-	0b10000000000000000000110111101100,
-	0b11000000000000010110111101100000,
-	0b11000000000000100110111011001000,
-	0b11000000000000100110111101001000,
-	0b11000000000000010110111101001000,
-	0b11000000000000101111110010011001,
-	0b10000000000000000000011111110010,
-	0b11000000000000010011111110100000,
-	0b10000000000000000000101111011001,
-	0b11000000000000010011111110100000,
-	0b10000000000000000000010110100000,
-	0b11000000000000100101111011001000,
-	0b10000000000000000000100101001000,
-	0b11000000000000010110100101001000,
-	0b11000000000000001101111110010000,
-	0b11000000000000001011111110010000,
-	0b11000000000000001011111110100000,
-	0b11000000000000001101111011010000,
-	0b10000000000000000000001011010000,
-	0b11000000000000001011010110100000,
-	0b11000000000000100101111011010000,
-	0b11000000000000100101001011010000,
-	0b11000000000000011010110100101001,
+    0b11000000000000101100110010011001,
+    0b11000000000000011111101100110010,
+    0b11000000000000010011111101100000,
+    0b11000000000000101100110111011001,
+    0b10000000000000000000110111101100,
+    0b11000000000000010110111101100000,
+    0b11000000000000100110111011001000,
+    0b11000000000000100110111101001000,
+    0b11000000000000010110111101001000,
+    0b11000000000000101111110010011001,
+    0b10000000000000000000011111110010,
+    0b11000000000000010011111110100000,
+    0b10000000000000000000101111011001,
+    0b11000000000000010011111110100000,
+    0b10000000000000000000010110100000,
+    0b11000000000000100101111011001000,
+    0b10000000000000000000100101001000,
+    0b11000000000000010110100101001000,
+    0b11000000000000001101111110010000,
+    0b11000000000000001011111110010000,
+    0b11000000000000001011111110100000,
+    0b11000000000000001101111011010000,
+    0b10000000000000000000001011010000,
+    0b11000000000000001011010110100000,
+    0b11000000000000100101111011010000,
+    0b11000000000000100101001011010000,
+    0b11000000000000011010110100101001,
 };
 
 int getSilhouetteVertex(uint32_t packedSil, int index)
 {
-	return (packedSil >> (3 * index)) & 0x7;
+    return (packedSil >> (3 * index)) & 0x7;
 }
 
 // Get silhouette size
 int getSilhouetteSize(uint32_t sil)
 {
-	return (sil >> 29) & 0x7;
-
+    return (sil >> 29) & 0x7;
 }
 
 // Check if vertex has negative z
 bool getVertexZNeg(int vertexIdx)
 {
-	return normalize(corners[vertexIdx]).z < 0.0f;
+#if FAST
+    float3 localPos = float3(
+        (vertexIdx & 1) ? 1.0f : -1.0f,
+        (vertexIdx & 2) ? 1.0f : -1.0f,
+        (vertexIdx & 4) ? 1.0f : -1.0f);
+
+    float transformedZ = dot(pc.modelMatrix[2].xyz, localPos) + pc.modelMatrix[2].w;
+    return transformedZ < 0.0f;
+#else
+    return corners[vertexIdx].z < 0.0f;
+#endif
 }
 
-#include "Drawing.hlsl"
+float3 getVertex(int vertexIdx)
+{
+#if FAST
+    // Reconstruct local cube corner from index bits
+    float sx = (vertexIdx & 1) ? 1.0f : -1.0f;
+    float sy = (vertexIdx & 2) ? 1.0f : -1.0f;
+    float sz = (vertexIdx & 4) ? 1.0f : -1.0f;
+
+    float4x3 model = transpose(pc.modelMatrix);
+
+    // Transform to world
+    // Full position, not just Z like getVertexZNeg
+    return model[0].xyz * sx +
+           model[1].xyz * sy +
+           model[2].xyz * sz +
+           model[3].xyz;
+    // return mul(pc.modelMatrix, float4(sx, sy, sz, 1.0f));
+#else
+    return corners[vertexIdx];
+#endif
+}
 
+#include "Drawing.hlsl"
 
-void setDebugData(uint32_t sil, int3 region, int configIndex, uint32_t clippedVertexCount)
+void setDebugData(uint32_t sil, int3 region, int configIndex)
 {
 #if DEBUG_DATA
-	DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil));
-	DebugDataBuffer[0].region = uint3(region);
-	DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex);
-	DebugDataBuffer[0].clippedVertexCount = clippedVertexCount;
-	for (int i = 0; i < 6; i++)
-	{
-		DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i));
-	}
-	DebugDataBuffer[0].silhouette = sil;
+    DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil));
+    DebugDataBuffer[0].region = uint3(region);
+    DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex);
+    for (int i = 0; i < 6; i++)
+    {
+        DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i));
+    }
+    DebugDataBuffer[0].silhouette = sil;
 #endif
 }
 
 float2 toCircleSpace(float2 uv)
 {
-	float2 p = uv * 2.0f - 1.0f;
-	float aspect = pc.viewport.z / pc.viewport.w;
-	p.x *= aspect;
-	return p;
+    float2 p = uv * 2.0f - 1.0f;
+    float aspect = pc.viewport.z / pc.viewport.w;
+    p.x *= aspect;
+    return p;
 }
 
-uint32_t packSilhouette(const int s[7]) 
+uint32_t packSilhouette(const int s[7])
 {
-	uint32_t packed = 0;
-	int size = s[0] & 0x7; // 3 bits for size
-
-	// Pack vertices LSB-first (vertex1 in lowest 3 bits above size)
-	for (int i = 1; i <= 6; ++i) {
-		int v = s[i];
-		if (v < 0) v = 0; // replace unused vertices with 0
-		packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1)
-	}
-
-	// Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices)
-	packed |= (size & 0x7) << 29;
-
-	return packed;
+    uint32_t packed = 0;
+    int size = s[0] & 0x7; // 3 bits for size
+
+    // Pack vertices LSB-first (vertex1 in lowest 3 bits above size)
+    for (int i = 1; i <= 6; ++i)
+    {
+        int v = s[i];
+        if (v < 0)
+            v = 0;                            // replace unused vertices with 0
+        packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1)
+    }
+
+    // Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices)
+    packed |= (size & 0x7) << 29;
+
+    return packed;
 }
 
 void computeCubeGeo()
 {
-	for (int i = 0; i < 8; i++)
-	for (int i = 0; i < 8; i++)
-	{
-		float3 localPos = constCorners[i];
-		float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
-		corners[i] = worldPos.xyz;
-		faceCenters[i / 4] += worldPos / 4.0f;
-		faceCenters[2 + i % 2] += worldPos / 4.0f;
-		faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f;
-	}
+    for (int i = 0; i < 8; i++)
+    {
+        float3 localPos = constCorners[i];
+        float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz;
+        corners[i] = worldPos.xyz;
+        faceCenters[i / 4] += worldPos / 4.0f;
+        faceCenters[2 + i % 2] += worldPos / 4.0f;
+        faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f;
+    }
+}
+
+// Helper to draw an edge with proper color mapping
+float4 drawEdge(int originalEdgeIdx, float3 pts[2], float3 spherePos, float aaWidth, float width = 0.01f)
+{
+    float4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width);
+    return float4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a);
+};
+
+float4 drawSilhouette(uint32_t vertexCount, uint32_t sil, float3 spherePos, float aaWidth)
+{
+    float4 color = 0;
+
+    // Build clip mask (z < 0)
+    uint32_t clipMask = 0u;
+    NBL_UNROLL
+    for (int i = 0; i < 4; i++)
+        clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i;
+
+    if (vertexCount == 6)
+    {
+        NBL_UNROLL
+        for (int i = 4; i < 6; i++)
+            clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i;
+    }
+
+    int clipCount = countbits(clipMask);
+
+    // Early exit if fully clipped
+    if (clipCount == vertexCount)
+        return color;
+
+    // No clipping needed - fast path
+    if (clipCount == 0)
+    {
+        for (int i = 0; i < vertexCount; i++)
+        {
+            int i0 = i;
+            int i1 = (i + 1) % vertexCount;
+
+            float3 v0 = getVertex(getSilhouetteVertex(sil, i0));
+            float3 v1 = getVertex(getSilhouetteVertex(sil, i1));
+            float3 pts[2] = {v0, v1};
+
+            color += drawEdge(i1, pts, spherePos, aaWidth);
+        }
+        return color;
+    }
+
+    // Rotate clip mask so positives come first
+    uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u);
+    bool wrapAround = ((clipMask & 1u) != 0u) &&
+                      ((clipMask & (1u << (vertexCount - 1))) != 0u);
+    int rotateAmount = wrapAround
+                           ? firstbitlow(invertedMask)   // -> First POSITIVE
+                           : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative
+
+    uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount);
+    uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3);
+
+    int positiveCount = vertexCount - clipCount;
+
+    // ALWAYS compute both clip points
+    int lastPosIdx = positiveCount - 1;
+    int firstNegIdx = positiveCount;
+    float3 vLastPos = getVertex(getSilhouetteVertex(rotatedSil, lastPosIdx));
+    float3 vFirstNeg = getVertex(getSilhouetteVertex(rotatedSil, firstNegIdx));
+    float t = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+    float3 clipA = lerp(vLastPos, vFirstNeg, t);
+
+    float3 vLastNeg = getVertex(getSilhouetteVertex(rotatedSil, vertexCount - 1));
+    float3 vFirstPos = getVertex(getSilhouetteVertex(rotatedSil, 0));
+    t = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+    float3 clipB = lerp(vLastNeg, vFirstPos, t);
+
+    // Draw positive edges
+    NBL_UNROLL
+    for (int i = 0; i < positiveCount; i++)
+    {
+
+        float3 v0 = getVertex(getSilhouetteVertex(rotatedSil, i));
+        bool useClipA = (i == positiveCount - 1);
+        float3 v1 = useClipA ? clipA : getVertex(getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount));
+
+        float3 pts[2] = {v0, v1};
+        color += drawEdge(i + 1, pts, spherePos, aaWidth);
+    }
+
+    // NP edge
+    if (clipCount > 0 && clipCount < vertexCount)
+    {
+        float3 vFirst = getVertex(getSilhouetteVertex(rotatedSil, 0));
+        float3 npPts[2] = {clipB, vFirst};
+        color += drawEdge(0, npPts, spherePos, aaWidth);
+    }
+
+    // Horizon arc
+    if (clipCount > 0 && clipCount < vertexCount)
+    {
+        float3 arcPts[2] = {clipA, clipB};
+        color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f);
+    }
+
+#if DEBUG_DATA
+    DebugDataBuffer[0].clipMask = clipMask;
+    DebugDataBuffer[0].clipCount = clipCount;
+    {
+        int transitions = 0;
+        for (int i = 0; i < vertexCount; i++)
+        {
+            bool a = (rotatedClipMask >> i) & 1u;
+            bool b = (rotatedClipMask >> ((i + 1) % vertexCount)) & 1u;
+            if (a != b)
+                transitions++;
+        }
+        // transitions must be 0 or 2
+        DebugDataBuffer[0].MoreThanTwoBitTransitions = transitions > 2;
+        DebugDataBuffer[0].rotatedClipMask = rotatedClipMask;
+        DebugDataBuffer[0].rotateAmount = rotateAmount;
+        DebugDataBuffer[0].positiveVertCount = positiveCount;
+        DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround;
+        DebugDataBuffer[0].rotatedSil = rotatedSil;
+    }
+#endif
+    return color;
 }
 
 [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
 {
-	float4 color = float4(0, 0, 0, 0);
-	float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y)));
-	float2 p = toCircleSpace(vx.uv);
+    float4 color = float4(0, 0, 0, 0);
+    for (int i = 0; i < 1; i++)
+    {
+
+        float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y)));
+        float2 p = toCircleSpace(vx.uv);
 
-	float2 normalized = p / CIRCLE_RADIUS;
-	float r2 = dot(normalized, normalized);
+        float2 normalized = p / CIRCLE_RADIUS;
+        float r2 = dot(normalized, normalized);
 
-	float3 spherePos;
-	if (r2 <= 1.0f)
-	{
-		spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2));
-	}
-	else
-	{
-		float uv2Plus1 = r2 + 1.0f;
-		spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1;
-	}
-	spherePos = normalize(spherePos);
+        float3 spherePos;
+        if (r2 <= 1.0f)
+        {
+            spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2));
+        }
+        else
+        {
+            float uv2Plus1 = r2 + 1.0f;
+            spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1;
+        }
+        spherePos = normalize(spherePos);
 
-	computeCubeGeo();
+        computeCubeGeo();
 
-    float4x3 columnModel = transpose(pc.modelMatrix);
+        float4x3 columnModel = transpose(pc.modelMatrix);
 
-	float3 obbCenter = columnModel[3].xyz;
+        float3 obbCenter = columnModel[3].xyz;
 
-	float3x3 upper3x3 = (float3x3)columnModel;
+        float3x3 upper3x3 = (float3x3)columnModel;
 
-    float3 rcpScales = rcp(float3(
-        dot(upper3x3[0], upper3x3[0]),
-        dot(upper3x3[1], upper3x3[1]),
-        dot(upper3x3[2], upper3x3[2])
-    ));
+        float3 rcpSqScales = rcp(float3(
+            dot(upper3x3[0], upper3x3[0]),
+            dot(upper3x3[1], upper3x3[1]),
+            dot(upper3x3[2], upper3x3[2])));
 
-    float3 normalizedProj = mul(upper3x3, obbCenter) * rcpScales;
+        float3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales;
 
-	int3 region = int3(
-		normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1),
-		normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1),
-		normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1)
-	);
-	int configIndex = region.x + region.y * 3 + region.z * 9;
+        int3 region = int3(
+            normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1),
+            normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1),
+            normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1));
 
-	// uint32_t sil = packSilhouette(silhouettes[configIndex]);
-	uint32_t sil = binSilhouettes[configIndex];
+        int configIndex = region.x + region.y * 3 + region.z * 9;
 
-	int vertexCount = getSilhouetteSize(sil);
-	bool longSilhouette = (vertexCount == 6);
-	uint32_t silEdgeMask = 0;
+        // uint32_t sil = packSilhouette(silhouettes[configIndex]);
+        uint32_t sil = binSilhouettes[configIndex];
+
+        int vertexCount = getSilhouetteSize(sil);
+        uint32_t silEdgeMask = 0;
 
 #if DEBUG_DATA
-	{
-		for (int i = 0; i < vertexCount; i++)
-		{
-			int vIdx = i % vertexCount;
-			int v1Idx = (i + 1) % vertexCount;
-
-			int v0Corner = getSilhouetteVertex(sil, vIdx);
-			int v1Corner = getSilhouetteVertex(sil, v1Idx);
-			// Mark edge as part of silhouette 
-			for (int e = 0; e < 12; e++)
-			{
-				int2 edge = allEdges[e];
-				if ((edge.x == v0Corner && edge.y == v1Corner) ||
-				(edge.x == v1Corner && edge.y == v0Corner))
-				{
-					silEdgeMask |= (1u << e);
-				}
-			}
-		}
-		validateEdgeVisibility(sil, vertexCount, silEdgeMask);
-	}
+        {
+            for (int i = 0; i < vertexCount; i++)
+            {
+                int vIdx = i % vertexCount;
+                int v1Idx = (i + 1) % vertexCount;
+
+                int v0Corner = getSilhouetteVertex(sil, vIdx);
+                int v1Corner = getSilhouetteVertex(sil, v1Idx);
+                // Mark edge as part of silhouette
+                for (int e = 0; e < 12; e++)
+                {
+                    int2 edge = allEdges[e];
+                    if ((edge.x == v0Corner && edge.y == v1Corner) ||
+                        (edge.x == v1Corner && edge.y == v0Corner))
+                    {
+                        silEdgeMask |= (1u << e);
+                    }
+                }
+            }
+            validateEdgeVisibility(sil, vertexCount, silEdgeMask);
+        }
 #endif
-	// Build clip mask for vertices below horizon (z < 0)
-	uint32_t clipMask = 0u;
-	NBL_UNROLL
-	for (int i = 0; i < 6; i++)
-	{
-		if (i >= vertexCount) break;
-		clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i;
-	}
-
-	int clipCount = countbits(clipMask);
-
-	// Total clipped vertices
-	int clippedVertCount = vertexCount + (clipMask != 0u ? (2 - clipCount) : 0);
-
-	// Find rotation amount to place positive vertices first
-	int rotateAmount = 0;
-	if (clipMask != 0u)
-	{
-		uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u);
-		bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask >> (vertexCount - 1)) & 1u);
-
-		rotateAmount = wrapAround ?
-			((firstbithigh(invertedMask) + 1) % vertexCount) :
-			firstbitlow(clipMask);
-	}
-
-	// Rotate silhouette bits
-	uint32_t vertexBits = sil & 0x1FFFFFFF;
-	uint32_t rotatedVertexBits = rotr(vertexBits, rotateAmount * 3, vertexCount * 3);
-	uint32_t rotatedSil = (sil & 0xE0000000) | rotatedVertexBits;
-
-	// Rotate the clip mask to match
-	uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount);
-
-	// Draw clipped silhouette edges
-	for (int i = 0; i < clippedVertCount; i++)
-	{
-		int nextI = (i + 1) % clippedVertCount;
-
-		int vIdx = i % vertexCount;
-		int v1Idx = nextI % vertexCount;
-
-		// Extract clip bits directly
-		bool v0Clipped = (rotatedClipMask >> vIdx) & 1u;
-		bool v1Clipped = (rotatedClipMask >> v1Idx) & 1u;
-
-		// Skip if both clipped
-		if (v0Clipped && v1Clipped) continue;
-
-		int v0Corner = getSilhouetteVertex(rotatedSil, vIdx);
-		int v1Corner = getSilhouetteVertex(rotatedSil, v1Idx);
-
-		float3 v0 = normalize(corners[v0Corner]);
-		float3 v1 = normalize(corners[v1Corner]);
-
-		float3 points[2] = { corners[v0Corner], corners[v1Corner] };
-
-		// Clip using bit state
-		if (v0Clipped)
-		{
-			float t = v0.z / (v0.z - v1.z);
-			points[0] = normalize(lerp(corners[v0Corner], corners[v1Corner], t));
-		}
-		else if (v1Clipped)
-		{
-			float t = v0.z / (v0.z - v1.z);
-			points[1] = normalize(lerp(corners[v0Corner], corners[v1Corner], t));
-		}
-
-		// Draw edge
-		float4 edgeContribution = drawGreatCircleArc(spherePos, points, 1, aaWidth);
-		color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a);
-
-	}
-
-
-	setDebugData(sil, region, configIndex, clippedVertCount);
-
-	color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth);
-	color += drawCorners(spherePos, p, aaWidth);
-	color += drawRing(p, aaWidth);
-
-	if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f)))
-	{
-		return float4(colorLUT[configIndex], 1.0f);
-	}
-
-	return color;
+
+        uint32_t positiveCount = 0;
+        color += drawSilhouette(vertexCount, sil, spherePos, aaWidth);
+        setDebugData(sil, region, configIndex);
+
+        color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth);
+        color += drawCorners(spherePos, p, aaWidth);
+        color += drawRing(p, aaWidth);
+
+        if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f)))
+        {
+            return float4(colorLUT[configIndex], 1.0f);
+        }
+    }
+
+    return color;
 }
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
index 3c87a48bc..c8532e796 100644
--- a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
+++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
@@ -3,6 +3,7 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 
 #define DEBUG_DATA 1
+#define FAST 1
 
 namespace nbl
 {
@@ -13,12 +14,19 @@ namespace nbl
         {
             uint32_t3 region;
             uint32_t silhouetteIndex;
-            
+
             uint32_t silhouetteVertexCount;
             uint32_t silhouette;
-            uint32_t clippedVertexCount;
+            uint32_t positiveVertCount;
             uint32_t edgeVisibilityMismatch;
 
+            uint32_t clipMask;
+            uint32_t clipCount;
+            uint32_t rotatedSil;
+            uint32_t wrapAround;
+            uint32_t rotatedClipMask;
+            uint32_t rotateAmount;
+            uint32_t MoreThanTwoBitTransitions;
             uint32_t vertices[6];
         };
 
@@ -29,24 +37,22 @@ namespace nbl
         };
 
         static const float32_t3 colorLUT[27] = {
-            float32_t3(0, 0, 0), 		float32_t3(1, 1, 1), 		float32_t3(0.5, 0.5, 0.5),
-            float32_t3(1, 0, 0), 		float32_t3(0, 1, 0), 		float32_t3(0, 0, 1),
-            float32_t3(1, 1, 0), 		float32_t3(1, 0, 1), 		float32_t3(0, 1, 1),
-            float32_t3(1, 0.5, 0), 		float32_t3(1, 0.65, 0), 	float32_t3(0.8, 0.4, 0),
-            float32_t3(1, 0.4, 0.7), 	float32_t3(1, 0.75, 0.8), 	float32_t3(0.7, 0.1, 0.3),
-            float32_t3(0.5, 0, 0.5), 	float32_t3(0.6, 0.4, 0.8), 	float32_t3(0.3, 0, 0.5),
-            float32_t3(0, 0.5, 0), 		float32_t3(0.5, 1, 0), 		float32_t3(0, 0.5, 0.25),
-            float32_t3(0, 0, 0.5), 		float32_t3(0.3, 0.7, 1), 	float32_t3(0, 0.4, 0.6),
-            float32_t3(0.6, 0.4, 0.2), 	float32_t3(0.8, 0.7, 0.3), 	float32_t3(0.4, 0.3, 0.1)
-        };
+            float32_t3(0, 0, 0), float32_t3(1, 1, 1), float32_t3(0.5, 0.5, 0.5),
+            float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1),
+            float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1),
+            float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0),
+            float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3),
+            float32_t3(0.5, 0, 0.5), float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5),
+            float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25),
+            float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6),
+            float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1)};
 
 #ifndef __HLSL_VERSION
-		static const char* colorNames[27] = {"Black",
-			"White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan",
-			"Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple",
-			"Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown",
-			"Tan/Beige", "Dark Brown"
-		};
+        static const char *colorNames[27] = {"Black",
+                                             "White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan",
+                                             "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple",
+                                             "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown",
+                                             "Tan/Beige", "Dark Brown"};
 #endif // __HLSL_VERSION
     }
 }
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
index 1c52547af..64f4cb100 100644
--- a/72_SolidAngleVisualizer/main.cpp
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -475,13 +475,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 				instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex;
 				m_renderer->render(cb, viewParams); // draw the cube/OBB
 
-				// TODO: a better way to get identity matrix
-				float32_t3x4 origin = {
-					1.0f,0.0f,0.0f,0.0f,
-					0.0f,1.0f,0.0f,0.0f,
-					0.0f,0.0f,1.0f,0.0f
-				};
-				memcpy(&instance.world, &origin, sizeof(instance.world));
+				instance.world = float32_t3x4(1.0f);
 				instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk
 				m_renderer->render(cb, viewParams);
 			}
@@ -1112,8 +1106,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex);
 
 					ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount);
-					ImGui::Text("silhouette Clipped VertexCount: %u", m_GPUOutResulData.clippedVertexCount);
+					ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount);
 					ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false");
+					ImGui::Text("More Than Two Bit Transitions: %s", m_GPUOutResulData.MoreThanTwoBitTransitions ? "true" : "false");
 
 					{
 						float32_t3 xAxis = m_OBBModelMatrix[0].xyz;
@@ -1141,12 +1136,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 						lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex;
 					}
 
-					if (!m_GPUOutResulData.edgeVisibilityMismatch)
+					if (!m_GPUOutResulData.edgeVisibilityMismatch || !m_GPUOutResulData.MoreThanTwoBitTransitions)
 					{
 						// Reset flag when mismatch is cleared
 						modalShown = false;
 					}
-					if (m_GPUOutResulData.edgeVisibilityMismatch && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care
+					if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.MoreThanTwoBitTransitions) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care
 					{
 						// Open modal popup only once per configuration
 						ImGui::OpenPopup("Edge Visibility Mismatch Warning");
@@ -1165,10 +1160,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 
 						// Show configuration info
 						ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex);
-						ImGui::TextWrapped("Region: (%d, %d, %d)",
-							m_GPUOutResulData.region.x,
-							m_GPUOutResulData.region.y,
-							m_GPUOutResulData.region.z);
+						ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z);
 						ImGui::Spacing();
 
 						ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch);
@@ -1203,13 +1195,26 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					ImGui::Separator();
 
 					// Silhouette mask printed in binary
-					char buf[33];
-					for (int i = 0; i < 32; i++)
-						buf[i] = (m_GPUOutResulData.silhouette & (1u << (31 - i))) ? '1' : '0';
-					buf[32] = '\0';
 
-					ImGui::Text("silhouette: 0x%08X", m_GPUOutResulData.silhouette);
-					ImGui::Text("binary: %s", buf);
+
+					auto printBin = [](uint32_t bin, const char* name)
+						{
+							char buf[33];
+							for (int i = 0; i < 32; i++)
+								buf[i] = (bin & (1u << (31 - i))) ? '1' : '0';
+							buf[32] = '\0';
+							ImGui::Text("%s: 0x%08X", name, bin);
+							ImGui::Text("binary: 0b%s", buf);
+							ImGui::Separator();
+						};
+					printBin(m_GPUOutResulData.silhouette, "Silhouette");
+					printBin(m_GPUOutResulData.rotatedSil, "rotatedSilhouette");
+
+					printBin(m_GPUOutResulData.clipCount, "clipCount");
+					printBin(m_GPUOutResulData.clipMask, "clipMask");
+					printBin(m_GPUOutResulData.rotatedClipMask, "rotatedClipMask");
+					printBin(m_GPUOutResulData.rotateAmount, "rotateAmount");
+					printBin(m_GPUOutResulData.wrapAround, "wrapAround");
 				}
 				ImGui::End();
 			}
@@ -1240,29 +1245,56 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR
 					};
 
 				static RandomSampler rng(69); // Initialize RNG with seed
+
+				// Helper function to check if cube intersects unit sphere at origin
+				auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool {
+					float cubeRadius = glm::length(scale) * 0.5f;
+					float distanceToCenter = glm::length(translation);
+					return (distanceToCenter - cubeRadius) > 1.0f;
+				};
+
+				static TRS lastTRS = {};
 				if (ImGui::Button("Randomize Translation"))
 				{
-					m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
+					lastTRS = m_TRS; // Backup before randomizing
+					int attempts = 0;
+					do {
+						m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
+						attempts++;
+					} while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100);
 				}
 				ImGui::SameLine();
-
 				if (ImGui::Button("Randomize Rotation"))
 				{
+					lastTRS = m_TRS; // Backup before randomizing
 					m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f));
 				}
 				ImGui::SameLine();
-
 				if (ImGui::Button("Randomize Scale"))
 				{
-					m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+					lastTRS = m_TRS; // Backup before randomizing
+					int attempts = 0;
+					do {
+						m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+						attempts++;
+					} while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100);
 				}
-
-				ImGui::SameLine();
+				//ImGui::SameLine();
 				if (ImGui::Button("Randomize All"))
 				{
-					m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
-					m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f));
-					m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+					lastTRS = m_TRS; // Backup before randomizing
+					int attempts = 0;
+					do {
+						m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f));
+						m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f));
+						m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f));
+						attempts++;
+					} while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100);
+				}
+				ImGui::SameLine();
+				if (ImGui::Button("Revert to Last"))
+				{
+					m_TRS = lastTRS; // Restore backed-up TRS
 				}
 
 				addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]);