halide · shoaibkamil · Dec 15, 2025 · Dec 15, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.clang-tidy b/.clang-tidy
@@ -161,7 +161,7 @@ Checks: >
     -readability-use-std-min-max,
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
-ExcludeHeaderFilterRegex: '\.fbs\.h|common/cmdline\.h|/mini_\w*\.h'
+ExcludeHeaderFilterRegex: '\.fbs\.h|common/cmdline\.h|mini_[^/]*\.h'
 FormatStyle: 'file'
 CheckOptions:
     -   key: modernize-use-default-member-init.UseAssignment

diff --git a/Makefile b/Makefile
@@ -896,7 +896,8 @@ RUNTIME_CPP_COMPONENTS = \
   trace_helper \
   tracing \
   wasm_cpu_features \
-  webgpu_dawn \
+  webgpu_dawn_arm \
+  webgpu_dawn_x86 \
   webgpu_emscripten \
   windows_aarch64_cpu_features_arm \
   windows_clock \
@@ -1094,6 +1095,8 @@ RUNTIME_TRIPLE_WIN_GENERIC_64 = "x86_64-unknown-windows-unknown"
 
 RUNTIME_TRIPLE_WEBGPU_32 = "wasm32-unknown-unknown-unknown"
 RUNTIME_TRIPLE_WEBGPU_64 = "wasm64-unknown-unknown-unknown"
+RUNTIME_TRIPLE_WEBGPU_ARM_64 = "aarch64-unknown-unknown-unknown"
+RUNTIME_TRIPLE_WEBGPU_X86_64 = "x86_64-unknown-unknown-unknown"
 
 # `-fno-threadsafe-statics` is very important here (note that it allows us to use a 'modern' C++
 # standard but still skip threadsafe guards for static initialization in our runtime code)
@@ -1147,6 +1150,22 @@ $(BUILD_DIR)/initmod.windows_%_64.ll: $(SRC_DIR)/runtime/windows_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WIN_GENERIC_64) -fshort-wchar -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/windows_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.windows_$*_64.d
 
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_32.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_32.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_64.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_64.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_32.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_32.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_64.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_64.d
+
 $(BUILD_DIR)/initmod.webgpu_%_32.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WEBGPU_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_32.d
@@ -1155,6 +1174,22 @@ $(BUILD_DIR)/initmod.webgpu_%_64.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_64.d
 
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_32_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_32_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_arm_64_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_ARM_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_arm.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_arm_64_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_32_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_32_debug.d
+
+$(BUILD_DIR)/initmod.webgpu_dawn_x86_64_debug.ll: $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp
+	@mkdir -p $(@D)
+	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m64 -target $(RUNTIME_TRIPLE_WEBGPU_X86_64) -DCOMPILING_HALIDE_RUNTIME -DBITS_64 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_dawn_x86.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_dawn_x86_64_debug.d
+
 $(BUILD_DIR)/initmod.webgpu_%_32_debug.ll: $(SRC_DIR)/runtime/webgpu_%.cpp
 	@mkdir -p $(@D)
 	$(CLANG) $(CXX_WARNING_FLAGS) -g -DDEBUG_RUNTIME $(RUNTIME_CXX_FLAGS) -m32 -target $(RUNTIME_TRIPLE_WEBGPU_32) -DCOMPILING_HALIDE_RUNTIME -DBITS_32 -emit-llvm -S $(SRC_DIR)/runtime/webgpu_$*.cpp -o $@ -MMD -MP -MF $(BUILD_DIR)/initmod.webgpu_$*_32_debug.d

diff --git a/doc/WebGPU.md b/doc/WebGPU.md
@@ -27,15 +27,16 @@ device codegen may be required before it becomes profitable to use.
 
 ## Running with WebAssembly via Emscripten: `HL_TARGET=wasm-32-wasmrt-webgpu`
 
-> _Tested with top-of-tree Emscripten as of 2023-02-23, against Chrome v113._
+> _Tested with Emscripten 5.0.0 (a7c5deabd7c88ba1c38ebe988112256775f944c6)_
+> _and Node.js 25.5.0._
 
 Halide can generate WebGPU code that can be integrated with WASM code using
 Emscripten.
 
 When invoking `emcc` to link Halide-generated objects, include these flags:
-`-s USE_WEBGPU=1 -s ASYNCIFY`.
+`--use-port=emdawnwebgpu -s JSPI`.
 
-Tests that use AOT compilation can be run using a native WebGPU implementation
+Tests that use AOT compilation can be run using a WebGPU implementation
 that has Node.js bindings, such as [Dawn](https://dawn.googlesource.com/dawn/).
 You must set an environment variable named `HL_WEBGPU_NODE_BINDINGS` that
 has an absolute path to the bindings to run these tests, e.g. `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`.
@@ -47,17 +48,15 @@ JIT compilation is not supported when using WebGPU with WASM.
 
 ## Running natively: `HL_TARGET=host-webgpu`
 
-> _Tested with top-of-tree Dawn as of 2023-11-27 [commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4]._
+> _Tested with Dawn release branch chromium/7698 (536c572aba)_
 
 For testing purposes, Halide can also target native WebGPU libraries, such as
 [Dawn](https://dawn.googlesource.com/dawn/) or
 [wgpu](https://github.com/gfx-rs/wgpu).
 This is currently the only path that can run the JIT correctness tests.
 See [below](#setting-up-dawn) for instructions on building Dawn.
 
-> Note that as of 2023-11-27, wgpu is not supported due to
-> [lacking `override` support for WGSL](https://github.com/gfx-rs/wgpu/issues/1762)
-> which we require > in order to set GPU block sizes.
+> Note that as of 2026-02-17, wgpu is not supported because it lacks timeout support in `WaitAny`.
 
 When targeting WebGPU with a native target, Halide defaults to looking for a
 build of Dawn (with several common names and suffixes); you can override this
@@ -120,9 +119,44 @@ The recommended method for updating `mini_webgpu.h` is to copy the
 `gen/include/dawn/webgpu.h` file from the Dawn build directory, then:
 - Restore the `// clang-format {off,on}` lines.
 - Comment out the `#include <std*>` lines.
-- Remove the `void` parameter from the `WGPUProc` declaration.
-
-This guarantees a version of the WebGPU header that is compatible with Dawn.
-When the native API eventually stabilizes, it should be possible to obtain a
-header from the `webgpu-native` GitHub organization that will be compatible
-with Dawn, wgpu, and Emscripten.
+- Insert the following block, which defines the macros that the system headers would normally provide:
+```c
+// BEGIN Halide-specific changes
+//
+// For the Halide runtime, we can't include these headers,
+// so we define NULL, SIZE_MAX, and integer limit macros here.
+// #include <stdint.h>
+// #include <stddef.h>
+// #include <math.h>
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL nullptr
+#else
+#define NULL ((void*)0)
+#endif
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (~(uint32_t)0)
+#endif
+
+#ifndef UINT64_MAX
+#define UINT64_MAX (~(uint64_t)0)
+#endif
+
+// This _should_ be correct on all platforms we support, but needs checking.
+#ifndef UINT32_C
+#define UINT32_C(x) ((uint32_t)(x))
+#endif
+
+// END Halide-specific changes
+
+```
+
+This guarantees a version of the WebGPU header that is compatible with how
+Halide builds the runtime.
diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
@@ -175,10 +175,6 @@ DECLARE_CPP_INITMOD(timer_profiler)
 DECLARE_CPP_INITMOD(to_string)
 DECLARE_CPP_INITMOD(trace_helper)
 DECLARE_CPP_INITMOD(tracing)
-// TODO(https://github.com/halide/Halide/issues/7248)
-// DECLARE_CPP_INITMOD(webgpu)
-DECLARE_CPP_INITMOD(webgpu_dawn)
-DECLARE_CPP_INITMOD(webgpu_emscripten)
 DECLARE_CPP_INITMOD(windows_clock)
 DECLARE_CPP_INITMOD(windows_cuda)
 DECLARE_CPP_INITMOD(windows_get_symbol)
@@ -284,6 +280,38 @@ DECLARE_NO_INITMOD(vulkan)
 DECLARE_NO_INITMOD(windows_vulkan)
 #endif  // WITH_VULKAN
 
+#ifdef WITH_WEBGPU
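+// TODO(https://github.com/halide/Halide/issues/7248): generic `webgpu` initmod is still disabled. NOTE(review): the *_debug declarations below look redundant (callers pass `debug` to the non-_debug getters) -- confirm against DECLARE_CPP_INITMOD.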
+#ifdef WITH_X86
+DECLARE_CPP_INITMOD(webgpu_dawn_x86)
+DECLARE_CPP_INITMOD(webgpu_dawn_x86_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_x86)
+DECLARE_NO_INITMOD(webgpu_dawn_x86_debug)
+#endif
+#ifdef WITH_ARM
+DECLARE_CPP_INITMOD(webgpu_dawn_arm)
+DECLARE_CPP_INITMOD(webgpu_dawn_arm_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_arm)
+DECLARE_NO_INITMOD(webgpu_dawn_arm_debug)
+#endif
+#ifdef WITH_WEBASSEMBLY
+DECLARE_CPP_INITMOD(webgpu_emscripten)
+DECLARE_CPP_INITMOD(webgpu_emscripten_debug)
+#else
+DECLARE_NO_INITMOD(webgpu_emscripten)
+DECLARE_NO_INITMOD(webgpu_emscripten_debug)
+#endif
+#else
+DECLARE_NO_INITMOD(webgpu_dawn_x86)
+DECLARE_NO_INITMOD(webgpu_dawn_arm)
+DECLARE_NO_INITMOD(webgpu_emscripten)
+DECLARE_NO_INITMOD(webgpu_dawn_x86_debug)
+DECLARE_NO_INITMOD(webgpu_dawn_arm_debug)
+DECLARE_NO_INITMOD(webgpu_emscripten_debug)
+#endif  // WITH_WEBGPU
+
 #ifdef WITH_X86
 DECLARE_LL_INITMOD(x86_amx)
 DECLARE_LL_INITMOD(x86_avx512)
@@ -1340,7 +1368,14 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
                 if (t.os == Target::WebAssemblyRuntime) {
                     modules.push_back(get_initmod_webgpu_emscripten(c, bits_64, debug));
                 } else {
-                    modules.push_back(get_initmod_webgpu_dawn(c, bits_64, debug));
+                    user_assert(bits_64) << "Native WebGPU target only available on 64-bit targets for now.\n";
+                    if (t.arch == Target::X86) {
+                        modules.push_back(get_initmod_webgpu_dawn_x86(c, bits_64, debug));
+                    } else if (t.arch == Target::ARM) {
+                        modules.push_back(get_initmod_webgpu_dawn_arm(c, bits_64, debug));
+                    } else {
+                        user_error << "WebGPU can only be used on X86 or ARM architectures.\n";
+                    }
                 }
             }
         }

diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
@@ -88,7 +88,8 @@ set(RUNTIME_CPP
     wasm_cpu_features
     # TODO(https://github.com/halide/Halide/issues/7248)
     # webgpu
-    webgpu_dawn
+    webgpu_dawn_arm
+    webgpu_dawn_x86
     webgpu_emscripten
     windows_aarch64_cpu_features_arm
     windows_clock
@@ -232,13 +233,29 @@ foreach (i IN LISTS RUNTIME_CPP)
                     set(TARGET "x86_64-unknown-windows-unknown")
                 endif ()
             endif ()
-        elseif (i MATCHES "webgpu")
+        elseif (i MATCHES "webgpu_emscripten")
+            # for WASM, we need to set a wasm target rather than a generic or native target
             if (j EQUAL 32)
-                # wasm32 will fail for some i386 builds, but i386 won't
                 set(TARGET "wasm32-unknown-unknown-unknown")
             else ()
                 set(TARGET "wasm64-unknown-unknown-unknown")
             endif ()
+        elseif (i MATCHES "webgpu_dawn_x86$")
+            if (j EQUAL 32)
+                # explicit generic 32-bit triple (the default branch below also uses generic triples)
+                set(TARGET "i386-unknown-unknown-unknown")
+            else ()
+                # due to struct passing, we need to use the correct arch
+                set(TARGET "x86_64-unknown-unknown-unknown")
+            endif ()
+        elseif (i MATCHES "webgpu_dawn_arm$")
+            if (j EQUAL 32 AND "ARM" IN_LIST Halide_LLVM_COMPONENTS)
+                # generic 32-bit triple. NOTE(review): if ARM is not in Halide_LLVM_COMPONENTS, 32-bit falls through to aarch64 below -- confirm intended
+                set(TARGET "i386-unknown-unknown-unknown")
+            else ()
+                # due to struct passing, we need to use the correct arch
+                set(TARGET "aarch64-unknown-unknown-unknown")
+            endif ()
         else ()
             # don't be fooled: these are just generic 32/64-bit targets for our purposes here
             if (j EQUAL 32)