From 86f2ba55f2e4a814fdb58ab30bb4afa76fb2e41d Mon Sep 17 00:00:00 2001
From: Scott Myron
Date: Thu, 15 Jan 2026 19:12:41 -0600
Subject: [PATCH 1/2] Use __builtin_memcpy, if available, to copy overlapping
 byte ranges in copy_remaining_bytes to avoid a branch to MEMCPY. Additionally
 use a space as padding byte instead of an 'X' so it can be represented
 directly on AArch64 with a single instruction.

---
 ext/json/ext/generator/generator.c | 36 +++++++++++++++++++++++++++---
 ext/json/ext/simd/simd.h           |  2 +-
 test/json/json_generator_test.rb   | 16 +++++++++++++
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index 27b2e353..f17a2a72 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -297,13 +297,43 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
     char *s = (buf->ptr + buf->len);
 
     // Pad the buffer with dummy characters that won't need escaping.
-    // This seem wateful at first sight, but memset of vector length is very fast.
-    memset(s, 'X', vec_len);
+    // This seem wasteful at first sight, but memset of vector length is very fast.
+    // This is a space as it can be directly represented as an immediate on AArch64.
+    memset(s, ' ', vec_len);
 
     // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
     // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-    MEMCPY(s, search->ptr, char, len);
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+
+#ifdef RBIMPL_ASSERT_OR_ASSUME
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+#endif
+
+    if (vec_len == 16 && len >= 4) {
+        // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
+        // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+        // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+        // position in both copies.
+
+        // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+        // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+        // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+        // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+        // plus two loads and stores generated when using __builtin_memcpy.
+        if (len >= 8) {
+            __builtin_memcpy(s, search->ptr, 8);
+            __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
+        } else {
+            __builtin_memcpy(s, search->ptr, 4);
+            __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
+        }
+    } else {
+        MEMCPY(s, search->ptr, char, len);
+    }
+#else
+    MEMCPY(s, search->ptr, char, len);
+#endif
+
     return s;
 }
 
diff --git a/ext/json/ext/simd/simd.h b/ext/json/ext/simd/simd.h
index f8e5ee18..84f6135a 100644
--- a/ext/json/ext/simd/simd.h
+++ b/ext/json/ext/simd/simd.h
@@ -58,7 +58,7 @@ static inline int trailing_zeros(int input)
 
 #ifdef JSON_ENABLE_SIMD
 
-#define SIMD_MINIMUM_THRESHOLD 6
+#define SIMD_MINIMUM_THRESHOLD 4
 
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb
index 9f8b35de..d7c4173e 100755
--- a/test/json/json_generator_test.rb
+++ b/test/json/json_generator_test.rb
@@ -650,6 +650,22 @@ def test_backslash
     json = '"\\nabc"'
     assert_equal json, generate(data)
     #
+    data = "\n"
+    json = '"\\n"'
+    assert_equal json, generate(data)
+    #
+    (0..16).each do |i|
+      data = ('a' * i) + "\n"
+      json = '"' + ('a' * i) + '\\n"'
+      assert_equal json, generate(data)
+    end
+    #
+    (0..16).each do |i|
+      data = "\n" + ('a' * i)
+      json = '"' + '\\n' + ('a' * i) + '"'
+      assert_equal json, generate(data)
+    end
+    #
     data = ["'"]
     json = '["\\\'"]'
     assert_equal '["\'"]', generate(data)

From be60b0788625c167cf16ac4f01e4598bbd556f0b Mon Sep 17 00:00:00 2001
From: Jean Boussier
Date: Fri, 16 Jan 2026 18:43:08 +0100
Subject: [PATCH 2/2] Extract json_fast_memcpy16 for readability

---
 ext/json/ext/generator/generator.c | 34 ++++++------------------------
 ext/json/ext/json.h                |  4 ++++
 ext/json/ext/simd/simd.h           | 27 ++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index f17a2a72..dbba99c4 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -288,6 +288,8 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len)
 
 ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len)
 {
+    RBIMPL_ASSERT_OR_ASSUME(len < vec_len);
+
     // Flush the buffer so everything up until the last 'len' characters are unflushed.
     search_flush(search);
 
@@ -303,37 +305,13 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
 
     // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
     // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
-
-#ifdef RBIMPL_ASSERT_OR_ASSUME
-    RBIMPL_ASSERT_OR_ASSUME(len < 16);
-#endif
-
-    if (vec_len == 16 && len >= 4) {
-        // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
-        // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
-        // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
-        // position in both copies.
-
-        // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
-        // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
-        // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
-        // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
-        // plus two loads and stores generated when using __builtin_memcpy.
-        if (len >= 8) {
-            __builtin_memcpy(s, search->ptr, 8);
-            __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
-        } else {
-            __builtin_memcpy(s, search->ptr, 4);
-            __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
-        }
+    if (vec_len == 16) {
+        RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
+        json_fast_memcpy16(s, search->ptr, len);
     } else {
         MEMCPY(s, search->ptr, char, len);
     }
-
+
     return s;
 }
 
diff --git a/ext/json/ext/json.h b/ext/json/ext/json.h
index 28efa04c..9379d7ae 100644
--- a/ext/json/ext/json.h
+++ b/ext/json/ext/json.h
@@ -5,6 +5,10 @@
 #include "ruby/encoding.h"
 #include <stdbool.h>
 
+#ifndef RBIMPL_ASSERT_OR_ASSUME
+# define RBIMPL_ASSERT_OR_ASSUME(x)
+#endif
+
 #if defined(RUBY_DEBUG) && RUBY_DEBUG
 # define JSON_ASSERT RUBY_ASSERT
 #else
diff --git a/ext/json/ext/simd/simd.h b/ext/json/ext/simd/simd.h
index 84f6135a..3bb86acd 100644
--- a/ext/json/ext/simd/simd.h
+++ b/ext/json/ext/simd/simd.h
@@ -60,6 +60,33 @@ static inline int trailing_zeros(int input)
 
 #define SIMD_MINIMUM_THRESHOLD 4
 
+ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
+{
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
+    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+    // position in both copies.
+
+    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+    // plus two loads and stores generated when using __builtin_memcpy.
+    if (len >= 8) {
+        __builtin_memcpy(dst, src, 8);
+        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
+    } else {
+        __builtin_memcpy(dst, src, 4);
+        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
+    }
+#else
+    MEMCPY(dst, src, char, len);
+#endif
+}
+
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>