Merged

Changes from all commits (35 commits)
476df8e  global bool (Phylliida, Oct 24, 2025)
6d85b94  reworked circular to global flag (Phylliida, Oct 24, 2025)
0092711  cleaner implementation of tiling support in sd cpp (Phylliida, Oct 24, 2025)
ee0e82a  cleaned rope (Phylliida, Oct 24, 2025)
cbb261d  working simplified but still need wraps (Phylliida, Oct 24, 2025)
8d7f679  Further clean of rope (Phylliida, Oct 24, 2025)
4f2db1b  resolve flux conflict (Phylliida, Oct 24, 2025)
e6fb4e8  switch to pad op circular only (Dec 10, 2025)
00c92ef  Set ggml to most recent (Dec 12, 2025)
144c278  Revert ggml temp (Dec 12, 2025)
5975e1e  Merge branch 'master' into tiling-support (Phylliida, Dec 12, 2025)
247d67f  Update ggml to most recent (Dec 12, 2025)
686a208  Revert unneded flux change (Dec 12, 2025)
15076b0  move circular flag to the GGMLRunnerContext (Dec 12, 2025)
bf28347  Pass through circular param in all places where conv is called (Dec 12, 2025)
5f2de58  fix of constant and minor cleanup (Dec 12, 2025)
d7d8da1  Added back --circular option (Dec 12, 2025)
822f9a5  Conv2d circular in vae and various models (Dec 12, 2025)
8e829ed  Fix temporal padding for qwen image and other vaes (Dec 12, 2025)
4054e3c  Z Image circular tiling (Dec 13, 2025)
4b87268  x and y axis seamless only (Dec 13, 2025)
935f980  First attempt at chroma seamless x and y (Dec 13, 2025)
820fb6b  refactor into pure x and y, almost there (Dec 13, 2025)
32e1b75  Fix crash on chroma (Dec 13, 2025)
dc6e887  Refactor into cleaner variable choices (Dec 13, 2025)
665190f  Removed redundant set_circular_enabled (Dec 13, 2025)
145e178  Sync ggml (Dec 16, 2025)
29b9d2f  Merge branch 'master' into tiling-support (leejet, Dec 21, 2025)
d1743b8  simplify circular parameter (leejet, Dec 21, 2025)
251f8e5  format code (leejet, Dec 21, 2025)
7b71c7c  no need to perform circular pad on the clip (leejet, Dec 21, 2025)
bd39034  simplify circular_axes setting (leejet, Dec 21, 2025)
d727d4a  unify function naming (leejet, Dec 21, 2025)
c3cf58f  remove unnecessary member variables (leejet, Dec 21, 2025)
a788c84  simplify rope (leejet, Dec 21, 2025)
2 changes: 1 addition & 1 deletion common.hpp

@@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock {
         if (vae_downsample) {
             auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
 
-            x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
+            x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             x = conv->forward(ctx, x);
         } else {
             auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
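A note on what this change does, since it is the core of the PR: circular padding fills the appended cells from the opposite edge of the tensor instead of with zeros, so convolutions see the image as wrapping around, which is what makes the output tile seamlessly. A minimal standalone sketch of the idea (an illustration only, not ggml's implementation):

```cpp
#include <cstdio>

int main() {
    // ggml_pad-style padding appends `pad` cells at the end of a dimension.
    // With zero padding the new cells are 0; with circular padding they wrap
    // around to the start, so the two edges of the image stay continuous.
    const int n = 5, pad = 2;
    int src[n] = {1, 2, 3, 4, 5};
    int zero_padded[n + pad], circ_padded[n + pad];
    for (int i = 0; i < n + pad; ++i) {
        zero_padded[i] = (i < n) ? src[i] : 0;  // 1 2 3 4 5 0 0
        circ_padded[i] = src[i % n];            // 1 2 3 4 5 1 2
    }
    for (int v : circ_padded) printf("%d ", v);
    printf("\n");
    return 0;
}
```

Presumably ggml_ext_pad reduces to the old ggml_pad behavior when both circular flags are false, which is why it can replace it unconditionally here.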
6 changes: 3 additions & 3 deletions denoiser.hpp

@@ -366,18 +366,18 @@ struct KLOptimalScheduler : SigmaScheduler {
 
         for (uint32_t i = 0; i < n; ++i) {
             // t goes from 0.0 to 1.0
-            float t = static_cast<float>(i) / static_cast<float>(n-1);
+            float t = static_cast<float>(i) / static_cast<float>(n - 1);
 
             // Interpolate in the angle domain
             float angle = t * alpha_min + (1.0f - t) * alpha_max;
 
             // Convert back to sigma
             sigmas.push_back(std::tan(angle));
-            }
+        }
 
         // Append the final zero to sigma
         sigmas.push_back(0.0f);
 
         return sigmas;
     }
 };
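The hunk above is formatting only, but it is the heart of the KL-optimal schedule: sigmas are interpolated linearly in the angle domain and mapped back through tan. A self-contained sketch of the same computation (assuming, as in the usual KL-optimal formulation, that alpha_min and alpha_max are atan(sigma_min) and atan(sigma_max); those definitions sit outside this hunk):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Sketch of the schedule shown in the diff above; not copied from the file.
std::vector<float> kl_optimal_sigmas(uint32_t n, float sigma_min, float sigma_max) {
    std::vector<float> sigmas;
    sigmas.reserve(n + 1);
    float alpha_min = std::atan(sigma_min);  // assumed definition
    float alpha_max = std::atan(sigma_max);  // assumed definition
    for (uint32_t i = 0; i < n; ++i) {
        // t runs from 0.0 at the first (highest-sigma) step to 1.0 at the last
        float t = static_cast<float>(i) / static_cast<float>(n - 1);
        // interpolate in the angle domain, then convert back to sigma
        float angle = t * alpha_min + (1.0f - t) * alpha_max;
        sigmas.push_back(std::tan(angle));
    }
    sigmas.push_back(0.0f);  // append the final zero
    return sigmas;
}
```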
29 changes: 27 additions & 2 deletions diffusion_model.hpp

@@ -37,8 +37,9 @@ struct DiffusionModel {
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size() = 0;
     virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
-    virtual int64_t get_adm_in_channels() = 0;
-    virtual void set_flash_attn_enabled(bool enabled) = 0;
+    virtual int64_t get_adm_in_channels()                            = 0;
+    virtual void set_flash_attn_enabled(bool enabled)                = 0;
+    virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
 
 struct UNetModel : public DiffusionModel {
@@ -87,6 +88,10 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        unet.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
@@ -148,6 +153,10 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        mmdit.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
@@ -210,6 +219,10 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        flux.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
@@ -277,6 +290,10 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        wan.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
@@ -343,6 +360,10 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        qwen_image.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
@@ -406,6 +427,10 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }
 
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        z_image.set_circular_axes(circular_x, circular_y);
+    }
+
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output = nullptr,
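Each wrapper forwards the new hook to its backbone, so callers configure tiling once through the DiffusionModel interface regardless of architecture. A hypothetical call-site sketch (the helper name and wiring are assumptions; the `circular || circular_x` combination mirrors the CLI handling later in this diff):

```cpp
// Hypothetical helper, not part of the PR: map the three user-facing flags
// onto the per-axis virtual hook added above.
void configure_seamless(DiffusionModel* model,
                        bool circular, bool circular_x, bool circular_y) {
    // "--circular" wraps both axes; "--circularx"/"--circulary" wrap one.
    model->set_circular_axes(circular || circular_x, circular || circular_y);
}
```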
21 changes: 21 additions & 0 deletions examples/common/common.hpp

@@ -449,6 +449,10 @@ struct SDContextParams {
     bool diffusion_conv_direct = false;
     bool vae_conv_direct = false;
 
+    bool circular = false;
+    bool circular_x = false;
+    bool circular_y = false;
+
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask = false;
     int chroma_t5_mask_pad = 1;
@@ -605,6 +609,18 @@ struct SDContextParams {
          "--vae-conv-direct",
          "use ggml_conv2d_direct in the vae model",
          true, &vae_conv_direct},
+        {"",
+         "--circular",
+         "enable circular padding for convolutions",
+         true, &circular},
+        {"",
+         "--circularx",
+         "enable circular RoPE wrapping on x-axis (width) only",
+         true, &circular_x},
+        {"",
+         "--circulary",
+         "enable circular RoPE wrapping on y-axis (height) only",
+         true, &circular_y},
         {"",
          "--chroma-disable-dit-mask",
          "disable dit mask for chroma",
@@ -868,6 +884,9 @@ struct SDContextParams {
            << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
            << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
            << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
+           << " circular: " << (circular ? "true" : "false") << ",\n"
+           << " circular_x: " << (circular_x ? "true" : "false") << ",\n"
+           << " circular_y: " << (circular_y ? "true" : "false") << ",\n"
            << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n"
            << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n"
            << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n"
@@ -928,6 +947,8 @@ struct SDContextParams {
         taesd_preview,
         diffusion_conv_direct,
         vae_conv_direct,
+        circular || circular_x,
+        circular || circular_y,
         force_sdxl_vae_conv_scale,
         chroma_use_dit_mask,
         chroma_use_t5_mask,
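Worth noting for users: `--circular` is shorthand for enabling both axes. The struct stores the three flags verbatim and only collapses them into per-axis values at the point where they are handed to the context, as the last hunk shows. Spelled out as a sketch:

```cpp
// Equivalent of the two constructor arguments in the hunk above:
bool circular_x_enabled = circular || circular_x;  // wrap along width
bool circular_y_enabled = circular || circular_y;  // wrap along height
```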
28 changes: 15 additions & 13 deletions flux.hpp

@@ -860,14 +860,14 @@ namespace Flux {
             }
         }
 
-        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                               struct ggml_tensor* x) {
             int64_t W = x->ne[0];
             int64_t H = x->ne[1];
 
             int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
             int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
 
@@ -893,11 +893,11 @@ namespace Flux {
             return x;
         }
 
-        struct ggml_tensor* process_img(struct ggml_context* ctx,
+        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x) {
             // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
             x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx, x);
+            x = patchify(ctx->ggml_ctx, x);
             return x;
         }
 
@@ -1076,7 +1076,7 @@ namespace Flux {
             int pad_h = (patch_size - H % patch_size) % patch_size;
             int pad_w = (patch_size - W % patch_size) % patch_size;
 
-            auto img = pad_to_patch_size(ctx->ggml_ctx, x);
+            auto img = pad_to_patch_size(ctx, x);
             auto orig_img = img;
 
             if (params.chroma_radiance_params.use_patch_size_32) {
@@ -1150,16 +1150,16 @@ namespace Flux {
             int pad_h = (patch_size - H % patch_size) % patch_size;
             int pad_w = (patch_size - W % patch_size) % patch_size;
 
-            auto img = process_img(ctx->ggml_ctx, x);
+            auto img = process_img(ctx, x);
             uint64_t img_tokens = img->ne[1];
 
             if (params.version == VERSION_FLUX_FILL) {
                 GGML_ASSERT(c_concat != nullptr);
                 ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                 ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
 
-                masked = process_img(ctx->ggml_ctx, masked);
-                mask = process_img(ctx->ggml_ctx, mask);
+                masked = process_img(ctx, masked);
+                mask = process_img(ctx, mask);
 
                 img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
             } else if (params.version == VERSION_FLEX_2) {
@@ -1168,21 +1168,21 @@
                 ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
                 ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
 
-                masked = process_img(ctx->ggml_ctx, masked);
-                mask = process_img(ctx->ggml_ctx, mask);
-                control = process_img(ctx->ggml_ctx, control);
+                masked = process_img(ctx, masked);
+                mask = process_img(ctx, mask);
+                control = process_img(ctx, control);
 
                 img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
             } else if (params.version == VERSION_FLUX_CONTROLS) {
                 GGML_ASSERT(c_concat != nullptr);
 
-                auto control = process_img(ctx->ggml_ctx, c_concat);
+                auto control = process_img(ctx, c_concat);
                 img = ggml_concat(ctx->ggml_ctx, img, control, 0);
             }
 
             if (ref_latents.size() > 0) {
                 for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx->ggml_ctx, ref);
+                    ref = process_img(ctx, ref);
                     img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                 }
             }
@@ -1472,6 +1472,8 @@ namespace Flux {
                                          increase_ref_index,
                                          flux_params.ref_index_scale,
                                          flux_params.theta,
+                                         circular_y_enabled,
+                                         circular_x_enabled,
                                          flux_params.axes_dim);
             int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
             // LOG_DEBUG("pos_len %d", pos_len);
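Padding alone is not enough for a DiT-style model like Flux: positions enter through RoPE, so the last hunk also threads the per-axis circular flags into the positional-embedding call (note the y-then-x order in the argument list). One common way to make RoPE wrap, sketched here as an assumption rather than a description of this repo's exact scheme, is to quantize each frequency so a full traversal of the axis is a whole number of rotations; then the phase at position `axis_len` equals the phase at position 0 and attention sees the axis as a closed loop:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Conceptual sketch only; the PR's actual rope changes live outside this hunk.
std::vector<float> circular_rope_angles(int pos, int axis_len,
                                        const std::vector<float>& base_freqs) {
    std::vector<float> angles;
    angles.reserve(base_freqs.size());
    const float kTwoPi = 6.2831853071795864769f;
    for (float freq : base_freqs) {
        // Round rotations-per-traversal to an integer so that
        // angle(axis_len) == angle(0) (mod 2*pi).
        float turns = std::max(1.0f, std::round(freq * axis_len / kTwoPi));
        angles.push_back(kTwoPi * turns * static_cast<float>(pos) / axis_len);
    }
    return angles;
}
```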
2 changes: 1 addition & 1 deletion ggml (submodule pointer update)