Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,43 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
char *s = (buf->ptr + buf->len);

// Pad the buffer with dummy characters that won't need escaping.
// This seem wateful at first sight, but memset of vector length is very fast.
memset(s, 'X', vec_len);
// This seem wasteful at first sight, but memset of vector length is very fast.
// This is a space as it can be directly represented as an immediate on AArch64.
memset(s, ' ', vec_len);

// Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
// to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
MEMCPY(s, search->ptr, char, len);
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)

#ifdef RBIMPL_ASSERT_OR_ASSUME
RBIMPL_ASSERT_OR_ASSUME(len < 16);
#endif

if (vec_len == 16 && len >= 4) {
// If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
// These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
// the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
// position in both copies.

// Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
// generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
// when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
// select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
// plus two loads and stores generated when using __builtin_memcpy.
if (len >= 8) {
__builtin_memcpy(s, search->ptr, 8);
__builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
} else {
__builtin_memcpy(s, search->ptr, 4);
__builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
}
Comment on lines +323 to +329
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this makes sense, but I'd extract it to some sort of fast_memcpy16 sort of static inline helper to make it easier to grasp, and with the fallback in it for when __builtin_memcpy isn't available.

Could even be in simd.h.

} else {
MEMCPY(s, search->ptr, char, len);
}
#else
MEMCPY(s, search->ptr, char, len);
#endif

return s;
}

Expand Down
2 changes: 1 addition & 1 deletion ext/json/ext/simd/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ static inline int trailing_zeros(int input)

#ifdef JSON_ENABLE_SIMD

#define SIMD_MINIMUM_THRESHOLD 6
#define SIMD_MINIMUM_THRESHOLD 4

#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
Expand Down
16 changes: 16 additions & 0 deletions test/json/json_generator_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,22 @@ def test_backslash
json = '"\\nabc"'
assert_equal json, generate(data)
#
data = "\n"
json = '"\\n"'
assert_equal json, generate(data)
#
(0..16).each do |i|
data = ('a' * i) + "\n"
json = '"' + ('a' * i) + '\\n"'
assert_equal json, generate(data)
end
#
(0..16).each do |i|
data = "\n" + ('a' * i)
json = '"' + '\\n' + ('a' * i) + '"'
assert_equal json, generate(data)
end
#
data = ["'"]
json = '["\\\'"]'
assert_equal '["\'"]', generate(data)
Expand Down