From 0a25425e9e48b8cf4b97c046a89959acc4e2ac8d Mon Sep 17 00:00:00 2001
From: Derek Miller <derek@finitelabs.com>
Date: Tue, 27 Jan 2026 09:04:54 -0600
Subject: [PATCH 1/2] Add benchmarking infrastructure and benchmark functions
 to all modules

- Add bitn.utils.benchmark module with benchmark_op() utility
- Add benchmark() function to bit16, bit32, and bit64 modules
- Add run_benchmarks.sh script for running benchmarks
- Add make bench and make bench-<module> targets
- Update README.md and CLAUDE.md with benchmarking documentation

Each module benchmarks bitwise operations, shifts, rotates, arithmetic,
and byte conversions with 1M iterations per operation.
---
 CLAUDE.md                    |  35 +++++++-
 Makefile                     |  55 +++++++++---
 README.md                    |  55 ++++++++++--
 run_benchmarks.sh            | 161 +++++++++++++++++++++++++++++++++++
 run_benchmarks_matrix.sh     |  72 ++++++++++++++++
 run_tests_matrix.sh          |  12 ++-
 src/bitn/bit16.lua           |  87 +++++++++++++++++++
 src/bitn/bit32.lua           |  87 +++++++++++++++++++
 src/bitn/bit64.lua           | 135 +++++++++++++++++++++++++++++
 src/bitn/utils/benchmark.lua |  33 +++++++
 src/bitn/utils/init.lua      |   7 ++
 11 files changed, 716 insertions(+), 23 deletions(-)
 create mode 100755 run_benchmarks.sh
 create mode 100755 run_benchmarks_matrix.sh
 create mode 100644 src/bitn/utils/benchmark.lua
 create mode 100644 src/bitn/utils/init.lua
diff --git a/CLAUDE.md b/CLAUDE.md
index 5760f01..736fe3d 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -8,7 +8,10 @@ lua-bitn/
 │   ├── init.lua      # Module aggregator, exports bit16/bit32/bit64
 │   ├── bit16.lua     # 16-bit bitwise operations
 │   ├── bit32.lua     # 32-bit bitwise operations
-│   └── bit64.lua     # 64-bit bitwise operations (uses {high, low} pairs)
+│   ├── bit64.lua     # 64-bit bitwise operations (uses {high, low} pairs)
+│   └── utils/
+│       ├── init.lua      # Utils module aggregator
+│       └── benchmark.lua # Benchmarking utilities
 ├── tests/
 │   ├── test_bit16.lua    # 16-bit test vectors
 │   ├── test_bit32.lua    # 32-bit test vectors
@@ -18,6 +21,7 @@ lua-bitn/
 │   └── release.yml   # Release automation
 ├── run_tests.sh      # Main test runner
 ├── run_tests_matrix.sh   # Multi-version test runner
+├── run_benchmarks.sh # Benchmark runner
 └── Makefile          # Build automation
 ```
 
@@ -33,6 +37,12 @@ make test-bit32
 # Run across Lua versions
 make test-matrix
 
+# Run benchmarks
+make bench
+
+# Run specific module benchmark
+make bench-bit32
+
 # Format code
 make format
 
@@ -80,6 +90,29 @@ local test_vectors = {
 
 Run with: `./run_tests.sh` or `make test`
 
+## Benchmarking
+
+Each module includes a `benchmark()` function that measures performance of all
+operations. Benchmarks use the `bitn.utils.benchmark` module for consistent
+timing and output formatting.
+
+```bash
+# Run all benchmarks (uses LuaJIT by default for best performance)
+./run_benchmarks.sh   # or: make bench
+
+# Run with specific Lua version
+LUA_BINARY=lua5.4 ./run_benchmarks.sh
+
+# Run specific module
+./run_benchmarks.sh bit32
+make bench-bit64
+```
+
+The benchmark utility performs:
+- 3 warmup iterations before timing
+- Configurable iteration count (default: 100; the bit16/bit32/bit64 benchmarks use 1000000)
+- Reports ms/op and ops/sec metrics
+
 ## Building
 
 The build process uses `amalg` to create a single-file distribution:
diff --git a/Makefile b/Makefile
index 32b2ca1..51f3f24 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,9 @@
 # Luarocks path for amalg and other tools
 LUAROCKS_PATH := $(shell luarocks path --lr-path 2>/dev/null)
 
+# Lua path for local modules (src, vendor)
+LUA_PATH_LOCAL := ./?.lua;./?/init.lua;./src/?.lua;./src/?/init.lua;./vendor/?.lua;$(LUAROCKS_PATH)
+
 # Default target
 .PHONY: all
 all: format lint test build
@@ -25,11 +28,31 @@ test-matrix-%:
 test-%:
 	./run_tests.sh $*
 
+# Run benchmarks
+.PHONY: bench
+bench:
+	./run_benchmarks.sh
+
+# Run bench matrix
+.PHONY: bench-matrix
+bench-matrix:
+	./run_benchmarks_matrix.sh
+
+# Run specific bench suite for bench matrix
+.PHONY: bench-matrix-%
+bench-matrix-%:
+	./run_benchmarks_matrix.sh $*
+
+# Run specific benchmark suite
+.PHONY: bench-%
+bench-%:
+	./run_benchmarks.sh $*
+
 build/amalg.cache: src/bitn/init.lua
 	@echo "Generating amalgamation cache..."
 	@mkdir -p build
 	@if command -v amalg.lua >/dev/null 2>&1; then \
-		LUA_PATH="./?.lua;./?/init.lua;./src/?.lua;./src/?/init.lua;$(LUAROCKS_PATH)" lua -lamalg src/bitn/init.lua && mv amalg.cache build || exit 1; \
+		LUA_PATH="$(LUA_PATH_LOCAL)" lua -lamalg src/bitn/init.lua && mv amalg.cache build || exit 1; \
 		echo "Generated amalg.cache"; \
 	else \
 		echo "Error: amalg not found."; \
@@ -43,7 +66,7 @@ build/amalg.cache: src/bitn/init.lua
 build: build/amalg.cache
 	@echo "Building single-file distribution..."
 	@if command -v amalg.lua >/dev/null 2>&1; then \
-		LUA_PATH="./?.lua;./?/init.lua;./src/?.lua;./src/?/init.lua;$(LUAROCKS_PATH)" amalg.lua -o build/bitn.lua -C ./build/amalg.cache || exit 1;\
+		LUA_PATH="$(LUA_PATH_LOCAL)" amalg.lua -o build/bitn.lua -C ./build/amalg.cache || exit 1;\
 		echo "Built build/bitn.lua"; \
 		VERSION=$$(git describe --exact-match --tags 2>/dev/null || echo "dev"); \
 		if [ "$$VERSION" != "dev" ]; then \
@@ -144,20 +167,28 @@ help:
 	@echo "Lua bitN Library - Makefile targets"
 	@echo ""
 	@echo "Testing:"
-	@echo "  make test              - Run all tests"
-	@echo "  make test-<name>       - Run specific test (e.g., make test-bit32)"
-	@echo "  make test-matrix       - Run test matrix across Lua versions"
+	@echo "  make test               - Run all tests"
+	@echo "  make test-<name>        - Run specific test (e.g., make test-bit32)"
+	@echo "  make test-matrix        - Run tests across all Lua versions"
+	@echo "  make test-matrix-<name> - Run specific test across all Lua versions"
+	@echo ""
+	@echo "Benchmarking:"
+	@echo "  make bench               - Run all benchmarks"
+	@echo "  make bench-<name>        - Run specific benchmark (e.g., make bench-bit32)"
+	@echo "  make bench-matrix        - Run benchmarks across all Lua versions"
+	@echo "  make bench-matrix-<name> - Run specific benchmark across all Lua versions"
 	@echo ""
 	@echo "Building:"
-	@echo "  make build             - Build single-file distribution"
+	@echo "  make build              - Build single-file distribution"
 	@echo ""
 	@echo "Code Quality:"
-	@echo "  make format            - Format all code (Lua)"
-	@echo "  make format-check      - Check code formatting"
-	@echo "  make lint              - Lint code with luacheck"
+	@echo "  make check              - Run format-check and lint"
+	@echo "  make format             - Format code with stylua"
+	@echo "  make format-check       - Check code formatting"
+	@echo "  make lint               - Lint code with luacheck"
 	@echo ""
 	@echo "Setup:"
-	@echo "  make install-deps      - Install all development dependencies"
-	@echo "  make clean             - Remove generated files"
+	@echo "  make install-deps       - Install development dependencies"
+	@echo "  make clean              - Remove generated files"
 	@echo ""
-	@echo "  make help              - Show this help"
+	@echo "  make help               - Show this help"
diff --git a/README.md b/README.md
index eb6d184..3edaf69 100644
--- a/README.md
+++ b/README.md
@@ -68,22 +68,59 @@ local xored = bit64.bxor(
 
 Example: `0x123456789ABCDEF0` is represented as `{0x12345678, 0x9ABCDEF0}`
 
-## Testing
+## Development
 
-Run the test suite:
+### Setup
 
 ```bash
-# Run all tests with default Lua interpreter
-./run_tests.sh
+# Install development dependencies (stylua, luacheck, amalg)
+make install-deps
+```
+
+### Testing
 
-# Run with specific Lua version
+```bash
+make test                # Run all tests
+make test-bit32          # Run specific module tests
+make test-matrix         # Run tests across all Lua versions
+make test-matrix-bit32   # Run specific module across all Lua versions
+
+# Or use scripts directly with custom Lua binary
 LUA_BINARY=lua5.1 ./run_tests.sh
+```
+
+### Benchmarking
+
+```bash
+make bench               # Run all benchmarks
+make bench-bit32         # Run specific module benchmark
+make bench-matrix        # Run benchmarks across all Lua versions
+make bench-matrix-bit64  # Run specific module across all Lua versions
+
+# Or use scripts directly with custom Lua binary
+LUA_BINARY=lua5.4 ./run_benchmarks.sh
+```
 
-# Run specific module
-./run_tests.sh bit32
+### Code Quality
 
-# Run test matrix across all Lua versions
-./run_tests_matrix.sh
+```bash
+make check               # Run format check and lint
+make format              # Format code with stylua
+make format-check        # Check formatting without modifying
+make lint                # Run luacheck
+```
+
+### Building
+
+```bash
+make build               # Build single-file distribution (build/bitn.lua)
+make clean               # Remove generated files
+```
+
+### Help
+
+```bash
+make help                # Show all available targets
 ```
 
 ## Current Limitations
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
new file mode 100755
index 0000000..5427916
--- /dev/null
+++ b/run_benchmarks.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+# Lua bitN Library Benchmark Runner
+# Runs performance benchmarks for bit operation modules
+#
+# Usage: ./run_benchmarks.sh [module_names...]
+#
+# Examples:
+#   ./run_benchmarks.sh                   # Run all benchmarks
+#   ./run_benchmarks.sh bit32 bit64       # Run only bit32 and bit64 benchmarks
+#
+# Available modules: bit16, bit32, bit64
+
+set -e  # Exit on any error
+
+echo "============================================="
+echo "⚡ Lua bitN Library - Benchmark Runner"
+echo "============================================="
+echo
+
+# Colors for output
+green='\033[0;32m'
+red='\033[0;31m'
+blue='\033[0;34m'
+nc='\033[0m' # No Color
+
+# Track overall results
+completed_modules=()
+failed_modules=()
+
+lua_binary="${LUA_BINARY:-luajit}"  # Use luajit by default, can be overridden
+
+# Check that the lua binary is available, then print its version banner
+if ! command -v "$lua_binary" &> /dev/null; then
+    echo -e "${red}❌ Error: '$lua_binary' command not found.${nc}"
+    exit 1
+fi
+echo "$("$lua_binary" -v)"
+echo
+
+# Get script directory
+script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+# Add repository root to Lua's package path
+lua_path="$script_dir/?.lua;$script_dir/?/init.lua;$script_dir/src/?.lua;$script_dir/src/?/init.lua;$LUA_PATH"
+
+# Parse command line arguments to determine which modules to run
+default_modules=("bit16" "bit32" "bit64")
+all_modules=("bit16" "bit32" "bit64")
+modules_to_run=("$@")
+
+# Validate modules if specified
+if [ ${#modules_to_run[@]} -gt 0 ] && [ "${modules_to_run[0]}" != "all" ]; then
+    for module in "${modules_to_run[@]}"; do
+        valid=0
+        for valid_module in "${all_modules[@]}"; do
+            if [ "$module" = "$valid_module" ]; then
+                valid=1
+                break
+            fi
+        done
+        if [ $valid -eq 0 ]; then
+            echo -e "${red}❌ Error: Unknown module '$module' or benchmark not implemented${nc}"
+            echo "Available modules: ${all_modules[*]}"
+            exit 1
+        fi
+    done
+fi
+
+if [ ${#modules_to_run[@]} -eq 0 ]; then
+    # No arguments provided, run all benchmarks
+    modules_to_run=("${default_modules[@]}")
+    echo "Running default benchmarks: ${modules_to_run[*]}"
+elif [ "${modules_to_run[0]}" = "all" ]; then
+    modules_to_run=("${all_modules[@]}")
+    echo "Running all benchmarks: ${modules_to_run[*]}"
+else
+    echo "Running specified benchmarks: ${modules_to_run[*]}"
+fi
+echo
+
+# Function to check if a module should be run
+should_run_module() {
+    local module_key="$1"
+    for module in "${modules_to_run[@]}"; do
+        if [ "$module" = "$module_key" ]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+# Function to run a benchmark and capture result
+run_benchmark() {
+    local module_name="$1"
+    local module_key="$2"
+    local lua_command="$3"
+
+    if ! should_run_module "$module_key"; then
+        return
+    fi
+
+    echo "---------------------------------------------"
+    echo -e "${blue}Benchmarking $module_name...${nc}"
+    echo "---------------------------------------------"
+
+    if LUA_PATH="$lua_path" "$lua_binary" -e "$lua_command" 2>&1; then
+        echo -e "\n${green}✅ $module_name: BENCHMARK COMPLETED${nc}"
+        completed_modules+=("$module_name")
+    else
+        echo -e "\n${red}❌ $module_name: BENCHMARK FAILED${nc}"
+        failed_modules+=("$module_name")
+    fi
+
+    echo
+}
+
+run_module_benchmark() {
+  local module_name="$1"
+  local module_key="$2"
+  local lua_module="$3"
+  run_benchmark "$module_name" "$module_key" "
+    require('$lua_module').benchmark()
+  "
+}
+
+# Run benchmarks
+run_module_benchmark "16-bit Operations" "bit16" "bitn.bit16"
+run_module_benchmark "32-bit Operations" "bit32" "bitn.bit32"
+run_module_benchmark "64-bit Operations" "bit64" "bitn.bit64"
+
+completed_count=${#completed_modules[@]}
+failed_count=${#failed_modules[@]}
+total_count=$((completed_count + failed_count))
+
+# A single module needs no summary, but its failure must still fail the run
+if [ $total_count -eq 1 ]; then
+    exit "$failed_count"
+fi
+
+# Summary
+echo "============================================="
+echo "📊 BENCHMARK SUMMARY"
+echo "============================================="
+
+if [ ${#failed_modules[@]} -eq 0 ]; then
+    echo -e "${green}🎉 ALL BENCHMARKS COMPLETED: $completed_count/$total_count${nc}"
+    echo
+    echo "Completed benchmarks:"
+    for module in "${completed_modules[@]}"; do
+        echo "• $module: ✅ COMPLETE"
+    done
+else
+    echo -e "${red}⚠️  SOME BENCHMARKS FAILED: $failed_count/$total_count${nc}"
+    echo
+    echo "Failed benchmarks:"
+    for module in "${failed_modules[@]}"; do
+        echo "• $module: ❌ FAILED"
+    done
+    exit 1
+fi
diff --git a/run_benchmarks_matrix.sh b/run_benchmarks_matrix.sh
new file mode 100755
index 0000000..c12ee16
--- /dev/null
+++ b/run_benchmarks_matrix.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Lua bitN Library - Benchmark Matrix Runner
+# Runs benchmarks across multiple Lua versions using luaenv
+#
+# Usage: ./run_benchmarks_matrix.sh [module_names...]
+#
+# Examples:
+#   ./run_benchmarks_matrix.sh                   # Run all benchmarks on all versions
+#   ./run_benchmarks_matrix.sh bit32             # Run bit32 benchmarks on all versions
+#   ./run_benchmarks_matrix.sh bit32 bit64       # Run specific benchmarks on all versions
+
+# List of luaenv versions to benchmark
+LUA_VERSIONS=("5.1.5" "5.2.4" "5.3.6" "5.4.8" "luajit-2.1-dev")
+
+# Colors for output
+green='\033[0;32m'
+yellow='\033[1;33m'
+red='\033[0;31m'
+nc='\033[0m' # No Color
+
+luaenv_binary="${LUAENV_BINARY:-luaenv}"
+
+# Get script directory
+script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+if ! command -v "$luaenv_binary" &> /dev/null; then
+    echo -e "${red}❌ Error: $luaenv_binary command not found.${nc}"
+    exit 1
+fi
+
+if [ ! -d "$($luaenv_binary prefix)/../../plugins/luaenv-luarocks" ]; then
+    echo -e "${red}❌ Error: luaenv-luarocks plugin not found. Please install it first.${nc}"
+    exit 1
+fi
+
+# Track overall results
+failed_versions=()
+passed_versions=()
+
+for lua_version in "${LUA_VERSIONS[@]}"; do
+    echo -e "${yellow}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~${nc}"
+    echo -e "${yellow}Running benchmarks with $lua_version${nc}"
+    echo -e "${yellow}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~${nc}"
+    echo
+
+    "$luaenv_binary" install -s $lua_version
+    lua_prefix="$($luaenv_binary prefix $lua_version)"
+    lua_binary="$lua_prefix/bin/lua"
+
+    # Run the benchmarks and pass all arguments
+    if ! LUA_BINARY="$lua_binary" "$script_dir/run_benchmarks.sh" "$@"; then
+        failed_versions+=("$lua_version")
+    else
+        passed_versions+=("$lua_version")
+    fi
+done
+
+# Final summary
+echo "============================================="
+echo "📊 Matrix Benchmark Summary"
+echo "============================================="
+
+if [ ${#failed_versions[@]} -eq 0 ]; then
+    echo -e "${green}✅ All LUA VERSIONS COMPLETED:${nc}"
+    printf '%s\n' "${passed_versions[@]}"
+    exit 0
+else
+    echo -e "${red}💥 SOME LUA VERSIONS FAILED:${nc}"
+    printf '%s\n' "${failed_versions[@]}"
+    exit 1
+fi
diff --git a/run_tests_matrix.sh b/run_tests_matrix.sh
index 4d81e76..cbe4642 100755
--- a/run_tests_matrix.sh
+++ b/run_tests_matrix.sh
@@ -1,5 +1,15 @@
 #!/bin/bash
 
+# Lua bitN Library - Test Matrix Runner
+# Runs tests across multiple Lua versions using luaenv
+#
+# Usage: ./run_tests_matrix.sh [module_names...]
+#
+# Examples:
+#   ./run_tests_matrix.sh                   # Run all tests on all versions
+#   ./run_tests_matrix.sh bit32             # Run bit32 tests on all versions
+#   ./run_tests_matrix.sh bit32 bit64       # Run specific tests on all versions
+
 # List of luaenv versions to test
 LUA_VERSIONS=("5.1.5" "5.2.4" "5.3.6" "5.4.8" "luajit-2.1-dev")
 
@@ -9,7 +19,7 @@ yellow='\033[1;33m'
 red='\033[0;31m'
 nc='\033[0m' # No Color
 
-luaenv_binary="${LUAENV_BINARY:-luaenv}"  # Use luaenv by default, can be overridden
+luaenv_binary="${LUAENV_BINARY:-luaenv}"
 
 # Get script directory
 script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
diff --git a/src/bitn/bit16.lua b/src/bitn/bit16.lua
index 8646d66..c66e90f 100644
--- a/src/bitn/bit16.lua
+++ b/src/bitn/bit16.lua
@@ -442,4 +442,91 @@ function bit16.selftest()
   return passed == total
 end
 
+--------------------------------------------------------------------------------
+-- Benchmarking
+--------------------------------------------------------------------------------
+
+local benchmark_op = require("bitn.utils.benchmark").benchmark_op
+
+--- Run performance benchmarks for 16-bit operations.
+function bit16.benchmark()
+  local iterations = 1000000
+
+  print("16-bit Bitwise Operations:")
+
+  -- Test values
+  local a, b = 0xAAAA, 0x5555
+
+  benchmark_op("band", function()
+    bit16.band(a, b)
+  end, iterations)
+
+  benchmark_op("bor", function()
+    bit16.bor(a, b)
+  end, iterations)
+
+  benchmark_op("bxor", function()
+    bit16.bxor(a, b)
+  end, iterations)
+
+  benchmark_op("bnot", function()
+    bit16.bnot(a)
+  end, iterations)
+
+  print("\n16-bit Shift Operations:")
+
+  benchmark_op("lshift", function()
+    bit16.lshift(a, 4)
+  end, iterations)
+
+  benchmark_op("rshift", function()
+    bit16.rshift(a, 4)
+  end, iterations)
+
+  benchmark_op("arshift", function()
+    bit16.arshift(0x8000, 4)
+  end, iterations)
+
+  print("\n16-bit Rotate Operations:")
+
+  benchmark_op("rol", function()
+    bit16.rol(a, 4)
+  end, iterations)
+
+  benchmark_op("ror", function()
+    bit16.ror(a, 4)
+  end, iterations)
+
+  print("\n16-bit Arithmetic:")
+
+  benchmark_op("add", function()
+    bit16.add(a, b)
+  end, iterations)
+
+  benchmark_op("mask", function()
+    bit16.mask(0x12345)
+  end, iterations)
+
+  print("\n16-bit Byte Conversions:")
+
+  local bytes_be = bit16.u16_to_be_bytes(0x1234)
+  local bytes_le = bit16.u16_to_le_bytes(0x1234)
+
+  benchmark_op("u16_to_be_bytes", function()
+    bit16.u16_to_be_bytes(0x1234)
+  end, iterations)
+
+  benchmark_op("u16_to_le_bytes", function()
+    bit16.u16_to_le_bytes(0x1234)
+  end, iterations)
+
+  benchmark_op("be_bytes_to_u16", function()
+    bit16.be_bytes_to_u16(bytes_be)
+  end, iterations)
+
+  benchmark_op("le_bytes_to_u16", function()
+    bit16.le_bytes_to_u16(bytes_le)
+  end, iterations)
+end
+
 return bit16
diff --git a/src/bitn/bit32.lua b/src/bitn/bit32.lua
index 5f3589c..0650d14 100644
--- a/src/bitn/bit32.lua
+++ b/src/bitn/bit32.lua
@@ -494,4 +494,91 @@ function bit32.selftest()
   return passed == total
 end
 
+--------------------------------------------------------------------------------
+-- Benchmarking
+--------------------------------------------------------------------------------
+
+local benchmark_op = require("bitn.utils.benchmark").benchmark_op
+
+--- Run performance benchmarks for 32-bit operations.
+function bit32.benchmark()
+  local iterations = 1000000
+
+  print("32-bit Bitwise Operations:")
+
+  -- Test values
+  local a, b = 0xAAAAAAAA, 0x55555555
+
+  benchmark_op("band", function()
+    bit32.band(a, b)
+  end, iterations)
+
+  benchmark_op("bor", function()
+    bit32.bor(a, b)
+  end, iterations)
+
+  benchmark_op("bxor", function()
+    bit32.bxor(a, b)
+  end, iterations)
+
+  benchmark_op("bnot", function()
+    bit32.bnot(a)
+  end, iterations)
+
+  print("\n32-bit Shift Operations:")
+
+  benchmark_op("lshift", function()
+    bit32.lshift(a, 8)
+  end, iterations)
+
+  benchmark_op("rshift", function()
+    bit32.rshift(a, 8)
+  end, iterations)
+
+  benchmark_op("arshift", function()
+    bit32.arshift(0x80000000, 8)
+  end, iterations)
+
+  print("\n32-bit Rotate Operations:")
+
+  benchmark_op("rol", function()
+    bit32.rol(a, 8)
+  end, iterations)
+
+  benchmark_op("ror", function()
+    bit32.ror(a, 8)
+  end, iterations)
+
+  print("\n32-bit Arithmetic:")
+
+  benchmark_op("add", function()
+    bit32.add(a, b)
+  end, iterations)
+
+  benchmark_op("mask", function()
+    bit32.mask(0x123456789)
+  end, iterations)
+
+  print("\n32-bit Byte Conversions:")
+
+  local bytes_be = bit32.u32_to_be_bytes(0x12345678)
+  local bytes_le = bit32.u32_to_le_bytes(0x12345678)
+
+  benchmark_op("u32_to_be_bytes", function()
+    bit32.u32_to_be_bytes(0x12345678)
+  end, iterations)
+
+  benchmark_op("u32_to_le_bytes", function()
+    bit32.u32_to_le_bytes(0x12345678)
+  end, iterations)
+
+  benchmark_op("be_bytes_to_u32", function()
+    bit32.be_bytes_to_u32(bytes_be)
+  end, iterations)
+
+  benchmark_op("le_bytes_to_u32", function()
+    bit32.le_bytes_to_u32(bytes_le)
+  end, iterations)
+end
+
 return bit32
diff --git a/src/bitn/bit64.lua b/src/bitn/bit64.lua
index cef30c6..34eb794 100644
--- a/src/bitn/bit64.lua
+++ b/src/bitn/bit64.lua
@@ -933,4 +933,139 @@ function bit64.selftest()
   return passed == total
 end
 
+--------------------------------------------------------------------------------
+-- Benchmarking
+--------------------------------------------------------------------------------
+
+local benchmark_op = require("bitn.utils.benchmark").benchmark_op
+
+--- Run performance benchmarks for 64-bit operations.
+function bit64.benchmark()
+  local iterations = 1000000
+
+  print("64-bit Bitwise Operations:")
+
+  -- Test values
+  local a = bit64.new(0xAAAAAAAA, 0x55555555)
+  local b = bit64.new(0x55555555, 0xAAAAAAAA)
+
+  benchmark_op("band", function()
+    bit64.band(a, b)
+  end, iterations)
+
+  benchmark_op("bor", function()
+    bit64.bor(a, b)
+  end, iterations)
+
+  benchmark_op("bxor", function()
+    bit64.bxor(a, b)
+  end, iterations)
+
+  benchmark_op("bnot", function()
+    bit64.bnot(a)
+  end, iterations)
+
+  print("\n64-bit Shift Operations:")
+
+  benchmark_op("lshift (small)", function()
+    bit64.lshift(a, 8)
+  end, iterations)
+
+  benchmark_op("lshift (large)", function()
+    bit64.lshift(a, 40)
+  end, iterations)
+
+  benchmark_op("rshift (small)", function()
+    bit64.rshift(a, 8)
+  end, iterations)
+
+  benchmark_op("rshift (large)", function()
+    bit64.rshift(a, 40)
+  end, iterations)
+
+  benchmark_op("arshift", function()
+    bit64.arshift(a, 8) -- a's top bit is set, so this still sign-extends; no allocation in the timed call
+  end, iterations)
+
+  print("\n64-bit Rotate Operations:")
+
+  benchmark_op("rol (small)", function()
+    bit64.rol(a, 8)
+  end, iterations)
+
+  benchmark_op("rol (large)", function()
+    bit64.rol(a, 40)
+  end, iterations)
+
+  benchmark_op("ror (small)", function()
+    bit64.ror(a, 8)
+  end, iterations)
+
+  benchmark_op("ror (large)", function()
+    bit64.ror(a, 40)
+  end, iterations)
+
+  print("\n64-bit Arithmetic:")
+
+  benchmark_op("add", function()
+    bit64.add(a, b)
+  end, iterations)
+
+  benchmark_op("add (with carry)", function()
+    bit64.add(b, b) -- b's low word 0xAAAAAAAA doubles past 32 bits, forcing a carry; no allocation timed
+  end, iterations)
+
+  print("\n64-bit Byte Conversions:")
+
+  local val = bit64.new(0x12345678, 0x9ABCDEF0)
+  local bytes_be = bit64.u64_to_be_bytes(val)
+  local bytes_le = bit64.u64_to_le_bytes(val)
+
+  benchmark_op("u64_to_be_bytes", function()
+    bit64.u64_to_be_bytes(val)
+  end, iterations)
+
+  benchmark_op("u64_to_le_bytes", function()
+    bit64.u64_to_le_bytes(val)
+  end, iterations)
+
+  benchmark_op("be_bytes_to_u64", function()
+    bit64.be_bytes_to_u64(bytes_be)
+  end, iterations)
+
+  benchmark_op("le_bytes_to_u64", function()
+    bit64.le_bytes_to_u64(bytes_le)
+  end, iterations)
+
+  print("\n64-bit Utility Functions:")
+
+  benchmark_op("new", function()
+    bit64.new(0x12345678, 0x9ABCDEF0)
+  end, iterations)
+
+  benchmark_op("is_int64", function()
+    bit64.is_int64(a)
+  end, iterations)
+
+  benchmark_op("to_hex", function()
+    bit64.to_hex(a)
+  end, iterations)
+
+  benchmark_op("to_number", function()
+    bit64.to_number(a)
+  end, iterations)
+
+  benchmark_op("from_number", function()
+    bit64.from_number(12345678901234)
+  end, iterations)
+
+  benchmark_op("eq", function()
+    bit64.eq(a, b)
+  end, iterations)
+
+  benchmark_op("is_zero", function()
+    bit64.is_zero(a)
+  end, iterations)
+end
+
 return bit64
diff --git a/src/bitn/utils/benchmark.lua b/src/bitn/utils/benchmark.lua
new file mode 100644
index 0000000..e813bef
--- /dev/null
+++ b/src/bitn/utils/benchmark.lua
@@ -0,0 +1,33 @@
+--- @module "bitn.utils.benchmark"
+--- Common benchmarking utilities for performance testing
+local benchmark = {}
+
+--- Run a benchmarked operation with warmup and timing, then print one result line
+--- @param name string Operation name for display
+--- @param func function Function to benchmark
+--- @param iterations? integer Number of iterations (default: 100)
+--- @return number ms_per_op Milliseconds per operation
+function benchmark.benchmark_op(name, func, iterations)
+  iterations = iterations or 100
+
+  -- Warmup
+  for _ = 1, 3 do
+    func()
+  end
+
+  -- Actual benchmark (os.clock measures CPU time in seconds)
+  local start = os.clock()
+  for _ = 1, iterations do
+    func()
+  end
+  local elapsed = os.clock() - start
+
+  local per_op = (elapsed / iterations) * 1000 -- ms
+  local ops_per_sec = iterations / elapsed
+  -- Six decimals (ns resolution): a single bit op is far below the 0.001 ms that %.3f can show.
+  print(string.format("%-30s: %12.6f ms/op, %14.1f ops/sec", name, per_op, ops_per_sec))
+
+  return per_op
+end
+
+return benchmark
diff --git a/src/bitn/utils/init.lua b/src/bitn/utils/init.lua
new file mode 100644
index 0000000..9004ddb
--- /dev/null
+++ b/src/bitn/utils/init.lua
@@ -0,0 +1,7 @@
+--- @module "bitn.utils"
+--- Utility modules for bitn
+local utils = {
+  benchmark = require("bitn.utils.benchmark"),
+}
+
+return utils

From d45c42597bc2dd70e9182cdd9a383d4d57cdf81f Mon Sep 17 00:00:00 2001
From: Derek Miller <derek@finitelabs.com>
Date: Tue, 27 Jan 2026 11:30:53 -0600
Subject: [PATCH 2/2] Introduce `_compat` module for unified bitwise
 compatibility layer and optimize bit16, bit32, and bit64 modules.

- Add `_compat` module to detect and leverage native bitwise operations or fallback implementations.
- Refactor bit16, bit32, and bit64 modules to utilize `_compat` for enhanced performance, maintainability, and cross-version support.
- Consolidate all bitwise operations, shifts, and byte conversions under unified compatibility logic.
---
 CLAUDE.md            |  13 +-
 README.md            |  13 +-
 src/bitn/_compat.lua | 318 +++++++++++++++++++++++++++++++++++++++++++
 src/bitn/bit16.lua   | 141 +++++++------------
 src/bitn/bit32.lua   | 148 +++++++-------------
 src/bitn/bit64.lua   |  82 ++++++-----
 src/bitn/init.lua    |  10 +-
 7 files changed, 484 insertions(+), 241 deletions(-)
 create mode 100644 src/bitn/_compat.lua

diff --git a/CLAUDE.md b/CLAUDE.md
index 736fe3d..2281558 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,6 +6,7 @@
 lua-bitn/
 ├── src/bitn/
 │   ├── init.lua      # Module aggregator, exports bit16/bit32/bit64
+│   ├── _compat.lua   # Internal compatibility layer, feature detection
 │   ├── bit16.lua     # 16-bit bitwise operations
 │   ├── bit32.lua     # 32-bit bitwise operations
 │   ├── bit64.lua     # 64-bit bitwise operations (uses {high, low} pairs)
@@ -72,10 +73,16 @@ Each bit module (bit16, bit32, bit64) provides the same API:
 local value = {0x12345678, 0x9ABCDEF0}
 ```
 
-### Pure Lua Implementation
+### Compatibility Layer (_compat)
 
-All operations are implemented using basic Lua arithmetic to ensure
-compatibility across all Lua versions without native bit library dependencies.
+The `_compat` module provides automatic feature detection and optimized primitives:
+- **Lua 5.3+**: Uses native bitwise operators (`&`, `|`, `~`, `<<`, `>>`)
+- **Lua 5.2**: Uses built-in `bit32` library
+- **LuaJIT**: Uses `bit` library with signed-to-unsigned conversion
+- **Lua 5.1**: Falls back to pure Lua arithmetic implementation
+
+This ensures optimal performance on modern Lua while maintaining compatibility
+with older versions.
 
 ## Testing
 
diff --git a/README.md b/README.md
index 3edaf69..fc3826e 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
 # lua-bitn
 
-A pure Lua implementation of bitwise operations for 16-bit, 32-bit, and 64-bit
-integers with **zero external dependencies**. This library provides a complete,
-portable implementation that runs on Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT.
+A portable bitwise operations library for 16-bit, 32-bit, and 64-bit integers
+with **zero external dependencies**. This library provides a complete,
+cross-platform implementation that runs on Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT.
 
 ## Features
 
-- **Zero Dependencies**: Pure Lua implementation, no C extensions or external
-  libraries required
+- **Zero Dependencies**: No C extensions or external libraries required
+- **Automatic Optimization**: Uses native bit operations when available (Lua 5.2
+  bit32 library, Lua 5.3+ operators, LuaJIT bit library) with pure Lua fallback
 - **Portable**: Runs on any Lua interpreter (5.1+)
 - **Complete**: Full bitwise operations API for 16-bit, 32-bit, and 64-bit integers
 - **Byte Conversions**: Big-endian and little-endian byte string conversions
@@ -125,7 +126,7 @@ make help                # Show all available targets
 
 ## Current Limitations
 
-- Pure Lua performance is slower than native bit libraries
+- Pure Lua fallback (Lua 5.1 without LuaJIT) is slower than native bit libraries
 - No constant-time guarantees
 
 ## License
diff --git a/src/bitn/_compat.lua b/src/bitn/_compat.lua
new file mode 100644
index 0000000..ccbd1d6
--- /dev/null
+++ b/src/bitn/_compat.lua
@@ -0,0 +1,318 @@
+--- @module "bitn._compat"
+--- Internal compatibility layer for bitwise operations.
+--- Provides feature detection and optimized primitives for use by bit16/bit32/bit64.
+local _compat = {}
+
+--------------------------------------------------------------------------------
+-- Helper functions (needed by all implementations)
+--------------------------------------------------------------------------------
+
+local math_floor = math.floor -- cached as a local for the helpers below
+local math_pow = math.pow or function(x, y) -- use the ^ operator when math.pow is not defined
+  return x ^ y
+end
+
+--- Convert a signed 32-bit result to unsigned by adding 2^32 to negative values (LuaJIT returns signed values).
+--- @param n number Potentially signed 32-bit value
+--- @return number n unchanged when n >= 0, otherwise n + 2^32
+local function to_unsigned(n)
+  if n < 0 then
+    return n + 0x100000000 -- 0x100000000 == 2^32
+  end
+  return n
+end
+
+_compat.to_unsigned = to_unsigned -- also exposed on the module table
+
+-- Constants
+local MASK32 = 0xFFFFFFFF -- 2^32 - 1: ones in the low 32 bits
+
+--------------------------------------------------------------------------------
+-- Implementation 1: Native operators (Lua 5.3+)
+--------------------------------------------------------------------------------
+
+local ok, result = pcall(load, "return function(a,b) return a & b end") -- probe whether the & operator compiles here
+if ok and result then -- load neither raised nor returned nil
+  local fn = result() -- run the chunk to obtain the AND function
+  if fn then
+    -- Native operators available - wrap each operator in a function compiled via load()
+    local native_band = fn
+    local native_bor = assert(load("return function(a,b) return a | b end"))()
+    local native_bxor = assert(load("return function(a,b) return a ~ b end"))()
+    local native_bnot = assert(load("return function(a) return ~a end"))()
+    local native_lshift = assert(load("return function(a,n) return a << n end"))()
+    local native_rshift = assert(load("return function(a,n) return a >> n end"))()
+
+    _compat.has_native_ops = true
+    _compat.has_bit_lib = false
+    _compat.is_luajit = false
+
+    function _compat.impl_name() -- human-readable name of the active backend
+      return "native operators (Lua 5.3+)"
+    end
+
+    function _compat.band(a, b) -- plain a & b; no 32-bit masking is applied here
+      return native_band(a, b)
+    end
+
+    function _compat.bor(a, b) -- plain a | b; no 32-bit masking is applied here
+      return native_bor(a, b)
+    end
+
+    function _compat.bxor(a, b) -- plain a ~ b; no 32-bit masking is applied here
+      return native_bxor(a, b)
+    end
+
+    function _compat.bnot(a)
+      return native_band(native_bnot(a), MASK32) -- keep only the low 32 bits of ~a
+    end
+
+    function _compat.lshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0
+        return 0
+      end
+      return native_band(native_lshift(a, n), MASK32) -- drop bits shifted above bit 31
+    end
+
+    function _compat.rshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0
+        return 0
+      end
+      return native_rshift(native_band(a, MASK32), n) -- mask first so only the low 32 bits are shifted
+    end
+
+    function _compat.arshift(a, n)
+      a = native_band(a, MASK32)
+      local is_negative = a >= 0x80000000 -- bit 31 (the 32-bit sign bit) is set
+      if n >= 32 then
+        return is_negative and MASK32 or 0 -- every result bit equals the sign bit
+      end
+      local r = native_rshift(a, n)
+      if is_negative then
+        local fill_mask = native_lshift(MASK32, 32 - n) -- ones from bit (32 - n) upward
+        r = native_bor(r, native_band(fill_mask, MASK32)) -- after masking: ones in the top n bits
+      end
+      return native_band(r, MASK32)
+    end
+
+    return _compat -- native backend selected; the implementations below are not reached
+  end
+end
+
+--------------------------------------------------------------------------------
+-- Implementation 2: Bit library (LuaJIT or Lua 5.2)
+--------------------------------------------------------------------------------
+
+local bit_lib
+local is_luajit = false -- set to true whenever require("bit") succeeds
+
+-- Try the "bit" module first (LuaJIT's bit library)
+ok, result = pcall(require, "bit")
+if ok and result then
+  bit_lib = result
+  is_luajit = true
+else
+  -- Otherwise try Lua 5.2's global bit32 table (use rawget to avoid recursion with our module name)
+  bit_lib = rawget(_G, "bit32")
+end
+
+if bit_lib then
+  -- Bit library available - define all functions using it (assert raises if a function is missing)
+  local bit_band = assert(bit_lib.band)
+  local bit_bor = assert(bit_lib.bor)
+  local bit_bxor = assert(bit_lib.bxor)
+  local bit_bnot = assert(bit_lib.bnot)
+  local bit_lshift = assert(bit_lib.lshift)
+  local bit_rshift = assert(bit_lib.rshift)
+  local bit_arshift = assert(bit_lib.arshift)
+
+  _compat.has_native_ops = false
+  _compat.has_bit_lib = true
+  _compat.is_luajit = is_luajit
+
+  function _compat.impl_name() -- same label for both the bit and bit32 libraries
+    return "bit library"
+  end
+
+  if is_luajit then
+    -- LuaJIT returns signed integers, so every result goes through to_unsigned
+    function _compat.band(a, b)
+      return to_unsigned(bit_band(a, b))
+    end
+
+    function _compat.bor(a, b)
+      return to_unsigned(bit_bor(a, b))
+    end
+
+    function _compat.bxor(a, b)
+      return to_unsigned(bit_bxor(a, b))
+    end
+
+    function _compat.bnot(a)
+      return to_unsigned(bit_bnot(a))
+    end
+
+    function _compat.lshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0 without calling the library
+        return 0
+      end
+      return to_unsigned(bit_lshift(a, n))
+    end
+
+    function _compat.rshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0 without calling the library
+        return 0
+      end
+      return to_unsigned(bit_rshift(a, n))
+    end
+
+    function _compat.arshift(a, n)
+      a = to_unsigned(bit_band(a, MASK32)) -- unsigned form, used by the sign test below
+      if n >= 32 then
+        local is_negative = a >= 0x80000000 -- bit 31 is set
+        return is_negative and MASK32 or 0
+      end
+      return to_unsigned(bit_arshift(a, n))
+    end
+  else
+    -- Lua 5.2 bit32 library returns unsigned integers, so no to_unsigned conversion is applied
+    function _compat.band(a, b)
+      return bit_band(a, b)
+    end
+
+    function _compat.bor(a, b)
+      return bit_bor(a, b)
+    end
+
+    function _compat.bxor(a, b)
+      return bit_bxor(a, b)
+    end
+
+    function _compat.bnot(a)
+      return bit_band(bit_bnot(a), MASK32)
+    end
+
+    function _compat.lshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0 without calling the library
+        return 0
+      end
+      return bit_band(bit_lshift(a, n), MASK32)
+    end
+
+    function _compat.rshift(a, n)
+      if n >= 32 then -- a shift of 32 or more returns 0 without calling the library
+        return 0
+      end
+      return bit_rshift(bit_band(a, MASK32), n)
+    end
+
+    function _compat.arshift(a, n)
+      a = bit_band(a, MASK32)
+      if n >= 32 then
+        local is_negative = a >= 0x80000000 -- bit 31 is set
+        return is_negative and MASK32 or 0
+      end
+      return bit_band(bit_arshift(a, n), MASK32)
+    end
+  end
+
+  return _compat -- library backend selected; the pure Lua fallback below is not reached
+end
+
+--------------------------------------------------------------------------------
+-- Implementation 3: Pure Lua fallback
+--------------------------------------------------------------------------------
+
+_compat.has_native_ops = false -- reached only when neither earlier implementation returned
+_compat.has_bit_lib = false
+_compat.is_luajit = false
+
+function _compat.impl_name() -- human-readable name of the active backend
+  return "pure Lua"
+end
+
+function _compat.band(a, b) -- arithmetic AND: test the lowest bit of each operand, then halve both
+  local r = 0
+  local bit_val = 1 -- weight (power of two) of the bit position being examined
+  for _ = 0, 31 do -- at most 32 bit positions
+    if (a % 2 == 1) and (b % 2 == 1) then -- lowest bit set in both operands
+      r = r + bit_val
+    end
+    a = math_floor(a / 2)
+    b = math_floor(b / 2)
+    bit_val = bit_val * 2
+    if a == 0 and b == 0 then -- both operands exhausted, stop early
+      break
+    end
+  end
+  return r
+end
+
+function _compat.bor(a, b) -- arithmetic OR: test the lowest bit of each operand, then halve both
+  local r = 0
+  local bit_val = 1 -- weight (power of two) of the bit position being examined
+  for _ = 0, 31 do -- at most 32 bit positions
+    if (a % 2 == 1) or (b % 2 == 1) then -- lowest bit set in at least one operand
+      r = r + bit_val
+    end
+    a = math_floor(a / 2)
+    b = math_floor(b / 2)
+    bit_val = bit_val * 2
+    if a == 0 and b == 0 then -- both operands exhausted, stop early
+      break
+    end
+  end
+  return r
+end
+
+function _compat.bxor(a, b) -- arithmetic XOR: compare the lowest bit of each operand, then halve both
+  local r = 0
+  local bit_val = 1 -- weight (power of two) of the bit position being examined
+  for _ = 0, 31 do -- at most 32 bit positions
+    if (a % 2) ~= (b % 2) then -- lowest bits differ
+      r = r + bit_val
+    end
+    a = math_floor(a / 2)
+    b = math_floor(b / 2)
+    bit_val = bit_val * 2
+    if a == 0 and b == 0 then -- both operands exhausted, stop early
+      break
+    end
+  end
+  return r
+end
+
+function _compat.bnot(a) -- 32-bit NOT computed by subtraction from all ones
+  return MASK32 - (math_floor(a) % 0x100000000) -- reduce a modulo 2^32 before subtracting
+end
+
+function _compat.lshift(a, n) -- left shift done as multiplication by 2^n
+  if n >= 32 then -- a shift of 32 or more returns 0
+    return 0
+  end
+  return math_floor((a * math_pow(2, n)) % 0x100000000) -- reduce modulo 2^32 to keep 32 bits
+end
+
+function _compat.rshift(a, n) -- logical right shift done as floor division by 2^n
+  if n >= 32 then -- a shift of 32 or more returns 0
+    return 0
+  end
+  a = math_floor(a) % 0x100000000 -- normalize into [0, 2^32)
+  return math_floor(a / math_pow(2, n))
+end
+
+function _compat.arshift(a, n) -- arithmetic right shift: vacated high bits are filled with bit 31
+  a = math_floor(a) % 0x100000000 -- normalize into [0, 2^32)
+  local is_negative = a >= 0x80000000 -- bit 31 is set
+  if n >= 32 then
+    return is_negative and MASK32 or 0 -- every result bit equals the sign bit
+  end
+  local r = math_floor(a / math_pow(2, n)) -- logical shift first
+  if is_negative then
+    local fill_mask = MASK32 - (math_pow(2, 32 - n) - 1) -- ones in the top n bits
+    r = _compat.bor(r, fill_mask)
+  end
+  return r
+end
+
+return _compat
diff --git a/src/bitn/bit16.lua b/src/bitn/bit16.lua
index c66e90f..842a05c 100644
--- a/src/bitn/bit16.lua
+++ b/src/bitn/bit16.lua
@@ -1,19 +1,35 @@
 --- @module "bitn.bit16"
---- Pure Lua 16-bit bitwise operations library.
+--- 16-bit bitwise operations library.
 --- This module provides a complete, version-agnostic implementation of 16-bit
---- bitwise operations that works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT
---- without depending on any built-in bit libraries.
---- @class bit16
+--- bitwise operations that works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT.
+--- Uses native bit operations where available for optimal performance.
 local bit16 = {}
 
+local _compat = require("bitn._compat")
+
+-- Cache methods as locals for faster access
+local compat_band = _compat.band
+local compat_bor = _compat.bor
+local compat_bxor = _compat.bxor
+local compat_bnot = _compat.bnot
+local compat_lshift = _compat.lshift
+local compat_rshift = _compat.rshift
+local impl_name = _compat.impl_name
+
 -- 16-bit mask constant
 local MASK16 = 0xFFFF
 
+local math_floor = math.floor
+
+--------------------------------------------------------------------------------
+-- Core operations
+--------------------------------------------------------------------------------
+
 --- Ensure value fits in 16-bit unsigned integer.
 --- @param n number Input value
 --- @return integer result 16-bit unsigned integer (0 to 0xFFFF)
 function bit16.mask(n)
-  return math.floor(n % 0x10000)
+  return compat_band(math_floor(n), MASK16)
 end
 
 --- Bitwise AND operation.
@@ -21,26 +37,7 @@ end
 --- @param b integer Second operand (16-bit)
 --- @return integer result Result of a AND b
 function bit16.band(a, b)
-  a = bit16.mask(a)
-  b = bit16.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 15 do
-    if (a % 2 == 1) and (b % 2 == 1) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_band(a, MASK16), compat_band(b, MASK16))
 end
 
 --- Bitwise OR operation.
@@ -48,26 +45,7 @@ end
 --- @param b integer Second operand (16-bit)
 --- @return integer result Result of a OR b
 function bit16.bor(a, b)
-  a = bit16.mask(a)
-  b = bit16.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 15 do
-    if (a % 2 == 1) or (b % 2 == 1) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_bor(a, b), MASK16)
 end
 
 --- Bitwise XOR operation.
@@ -75,33 +53,14 @@ end
 --- @param b integer Second operand (16-bit)
 --- @return integer result Result of a XOR b
 function bit16.bxor(a, b)
-  a = bit16.mask(a)
-  b = bit16.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 15 do
-    if (a % 2) ~= (b % 2) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_bxor(a, b), MASK16)
 end
 
 --- Bitwise NOT operation.
 --- @param a integer Operand (16-bit)
 --- @return integer result Result of NOT a
 function bit16.bnot(a)
-  return bit16.mask(MASK16 - bit16.mask(a))
+  return compat_band(compat_bnot(a), MASK16)
 end
 
 --- Left shift operation.
@@ -113,7 +72,7 @@ function bit16.lshift(a, n)
   if n >= 16 then
     return 0
   end
-  return bit16.mask(bit16.mask(a) * math.pow(2, n))
+  return compat_band(compat_lshift(compat_band(a, MASK16), n), MASK16)
 end
 
 --- Logical right shift operation (fills with 0s).
@@ -122,11 +81,10 @@ end
 --- @return integer result Result of a >> n (logical)
 function bit16.rshift(a, n)
   assert(n >= 0, "Shift amount must be non-negative")
-  a = bit16.mask(a)
   if n >= 16 then
     return 0
   end
-  return math.floor(a / math.pow(2, n))
+  return compat_rshift(compat_band(a, MASK16), n)
 end
 
 --- Arithmetic right shift operation (sign-extending, fills with sign bit).
@@ -135,27 +93,25 @@ end
 --- @return integer result Result of a >> n with sign extension
 function bit16.arshift(a, n)
   assert(n >= 0, "Shift amount must be non-negative")
-  a = bit16.mask(a)
+  a = compat_band(a, MASK16)
 
   -- Check if sign bit is set (bit 15)
   local is_negative = a >= 0x8000
 
   if n >= 16 then
-    -- All bits shift out, result is all 1s if negative, all 0s if positive
-    return is_negative and 0xFFFF or 0
+    return is_negative and MASK16 or 0
   end
 
-  -- Perform logical right shift first
-  local result = math.floor(a / math.pow(2, n))
+  -- Perform logical right shift
+  local result = compat_rshift(a, n)
 
   -- If original was negative, fill high bits with 1s
   if is_negative then
-    -- Create mask for high bits that need to be 1
-    local fill_mask = MASK16 - (math.floor(2 ^ (16 - n)) - 1)
-    result = bit16.bor(result, fill_mask)
+    local fill_mask = compat_band(compat_lshift(MASK16, 16 - n), MASK16)
+    result = compat_bor(result, fill_mask)
   end
 
-  return result
+  return compat_band(result, MASK16)
 end
 
 --- Left rotate operation.
@@ -164,8 +120,8 @@ end
 --- @return integer result Result of rotating x left by n positions
 function bit16.rol(x, n)
   n = n % 16
-  x = bit16.mask(x)
-  return bit16.mask(bit16.lshift(x, n) + bit16.rshift(x, 16 - n))
+  x = compat_band(x, MASK16)
+  return compat_band(compat_bor(compat_lshift(x, n), compat_rshift(x, 16 - n)), MASK16)
 end
 
 --- Right rotate operation.
@@ -174,8 +130,8 @@ end
 --- @return integer result Result of rotating x right by n positions
 function bit16.ror(x, n)
   n = n % 16
-  x = bit16.mask(x)
-  return bit16.mask(bit16.rshift(x, n) + bit16.lshift(x, 16 - n))
+  x = compat_band(x, MASK16)
+  return compat_band(compat_bor(compat_rshift(x, n), compat_lshift(x, 16 - n)), MASK16)
 end
 
 --- 16-bit addition with overflow handling.
@@ -183,27 +139,30 @@ end
 --- @param b integer Second operand (16-bit)
 --- @return integer result Result of (a + b) mod 2^16
 function bit16.add(a, b)
-  return bit16.mask(bit16.mask(a) + bit16.mask(b))
+  return compat_band(compat_band(a, MASK16) + compat_band(b, MASK16), MASK16)
 end
 
 --------------------------------------------------------------------------------
 -- Byte conversion functions
 --------------------------------------------------------------------------------
 
+local string_char = string.char
+local string_byte = string.byte
+
 --- Convert 16-bit unsigned integer to 2 bytes (big-endian).
 --- @param n integer 16-bit unsigned integer
 --- @return string bytes 2-byte string in big-endian order
 function bit16.u16_to_be_bytes(n)
-  n = bit16.mask(n)
-  return string.char(math.floor(n / 256), n % 256)
+  n = compat_band(n, MASK16)
+  return string_char(math_floor(n / 256), n % 256)
 end
 
 --- Convert 16-bit unsigned integer to 2 bytes (little-endian).
 --- @param n integer 16-bit unsigned integer
 --- @return string bytes 2-byte string in little-endian order
 function bit16.u16_to_le_bytes(n)
-  n = bit16.mask(n)
-  return string.char(n % 256, math.floor(n / 256))
+  n = compat_band(n, MASK16)
+  return string_char(n % 256, math_floor(n / 256))
 end
 
 --- Convert 2 bytes to 16-bit unsigned integer (big-endian).
@@ -213,7 +172,7 @@ end
 function bit16.be_bytes_to_u16(str, offset)
   offset = offset or 1
   assert(#str >= offset + 1, "Insufficient bytes for u16")
-  local b1, b2 = string.byte(str, offset, offset + 1)
+  local b1, b2 = string_byte(str, offset, offset + 1)
   return b1 * 256 + b2
 end
 
@@ -224,7 +183,7 @@ end
 function bit16.le_bytes_to_u16(str, offset)
   offset = offset or 1
   assert(#str >= offset + 1, "Insufficient bytes for u16")
-  local b1, b2 = string.byte(str, offset, offset + 1)
+  local b1, b2 = string_byte(str, offset, offset + 1)
   return b1 + b2 * 256
 end
 
@@ -239,6 +198,7 @@ local unpack_fn = unpack or table.unpack
 --- @return boolean result True if all tests pass, false otherwise
 function bit16.selftest()
   print("Running 16-bit operations test vectors...")
+  print(string.format("  Using: %s", impl_name()))
   local passed = 0
   local total = 0
 
@@ -450,9 +410,10 @@ local benchmark_op = require("bitn.utils.benchmark").benchmark_op
 
 --- Run performance benchmarks for 16-bit operations.
 function bit16.benchmark()
-  local iterations = 1000000
+  local iterations = 100000
 
   print("16-bit Bitwise Operations:")
+  print(string.format("  Implementation: %s", impl_name()))
 
   -- Test values
   local a, b = 0xAAAA, 0x5555
diff --git a/src/bitn/bit32.lua b/src/bitn/bit32.lua
index 0650d14..2c0a83b 100644
--- a/src/bitn/bit32.lua
+++ b/src/bitn/bit32.lua
@@ -1,19 +1,36 @@
 --- @module "bitn.bit32"
---- Pure Lua 32-bit bitwise operations library.
+--- 32-bit bitwise operations library.
 --- This module provides a complete, version-agnostic implementation of 32-bit
---- bitwise operations that works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT
---- without depending on any built-in bit libraries.
---- @class bit32
+--- bitwise operations that works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT.
+--- Uses native bit operations where available for optimal performance.
 local bit32 = {}
 
+local _compat = require("bitn._compat")
+
+-- Cache methods as locals for faster access
+local compat_band = _compat.band
+local compat_bor = _compat.bor
+local compat_bxor = _compat.bxor
+local compat_bnot = _compat.bnot
+local compat_lshift = _compat.lshift
+local compat_rshift = _compat.rshift
+local compat_arshift = _compat.arshift
+local impl_name = _compat.impl_name
+
 -- 32-bit mask constant
 local MASK32 = 0xFFFFFFFF
 
+local math_floor = math.floor
+
+--------------------------------------------------------------------------------
+-- Core operations
+--------------------------------------------------------------------------------
+
 --- Ensure value fits in 32-bit unsigned integer.
 --- @param n number Input value
 --- @return integer result 32-bit unsigned integer (0 to 0xFFFFFFFF)
 function bit32.mask(n)
-  return math.floor(n % 0x100000000)
+  return compat_band(math_floor(n), MASK32)
 end
 
 --- Bitwise AND operation.
@@ -21,26 +38,7 @@ end
 --- @param b integer Second operand (32-bit)
 --- @return integer result Result of a AND b
 function bit32.band(a, b)
-  a = bit32.mask(a)
-  b = bit32.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 31 do
-    if (a % 2 == 1) and (b % 2 == 1) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_band(a, MASK32), compat_band(b, MASK32))
 end
 
 --- Bitwise OR operation.
@@ -48,26 +46,7 @@ end
 --- @param b integer Second operand (32-bit)
 --- @return integer result Result of a OR b
 function bit32.bor(a, b)
-  a = bit32.mask(a)
-  b = bit32.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 31 do
-    if (a % 2 == 1) or (b % 2 == 1) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_bor(a, b), MASK32)
 end
 
 --- Bitwise XOR operation.
@@ -75,33 +54,14 @@ end
 --- @param b integer Second operand (32-bit)
 --- @return integer result Result of a XOR b
 function bit32.bxor(a, b)
-  a = bit32.mask(a)
-  b = bit32.mask(b)
-
-  local result = 0
-  local bit_val = 1
-
-  for _ = 0, 31 do
-    if (a % 2) ~= (b % 2) then
-      result = result + bit_val
-    end
-    a = math.floor(a / 2)
-    b = math.floor(b / 2)
-    bit_val = bit_val * 2
-
-    if a == 0 and b == 0 then
-      break
-    end
-  end
-
-  return result
+  return compat_band(compat_bxor(a, b), MASK32)
 end
 
 --- Bitwise NOT operation.
 --- @param a integer Operand (32-bit)
 --- @return integer result Result of NOT a
 function bit32.bnot(a)
-  return bit32.mask(MASK32 - bit32.mask(a))
+  return compat_band(compat_bnot(a), MASK32)
 end
 
 --- Left shift operation.
@@ -113,7 +73,7 @@ function bit32.lshift(a, n)
   if n >= 32 then
     return 0
   end
-  return bit32.mask(bit32.mask(a) * math.pow(2, n))
+  return compat_band(compat_lshift(compat_band(a, MASK32), n), MASK32)
 end
 
 --- Logical right shift operation (fills with 0s).
@@ -122,11 +82,10 @@ end
 --- @return integer result Result of a >> n (logical)
 function bit32.rshift(a, n)
   assert(n >= 0, "Shift amount must be non-negative")
-  a = bit32.mask(a)
   if n >= 32 then
     return 0
   end
-  return math.floor(a / math.pow(2, n))
+  return compat_rshift(compat_band(a, MASK32), n)
 end
 
 --- Arithmetic right shift operation (sign-extending, fills with sign bit).
@@ -135,27 +94,7 @@ end
 --- @return integer result Result of a >> n with sign extension
 function bit32.arshift(a, n)
   assert(n >= 0, "Shift amount must be non-negative")
-  a = bit32.mask(a)
-
-  -- Check if sign bit is set (bit 31)
-  local is_negative = a >= 0x80000000
-
-  if n >= 32 then
-    -- All bits shift out, result is all 1s if negative, all 0s if positive
-    return is_negative and 0xFFFFFFFF or 0
-  end
-
-  -- Perform logical right shift first
-  local result = math.floor(a / math.pow(2, n))
-
-  -- If original was negative, fill high bits with 1s
-  if is_negative then
-    -- Create mask for high bits that need to be 1
-    local fill_mask = MASK32 - (math.pow(2, 32 - n) - 1)
-    result = bit32.bor(result, fill_mask)
-  end
-
-  return result
+  return compat_arshift(a, n)
 end
 
 --- Left rotate operation.
@@ -164,8 +103,8 @@ end
 --- @return integer result Result of rotating x left by n positions
 function bit32.rol(x, n)
   n = n % 32
-  x = bit32.mask(x)
-  return bit32.mask(bit32.lshift(x, n) + bit32.rshift(x, 32 - n))
+  x = compat_band(x, MASK32)
+  return compat_band(compat_bor(compat_lshift(x, n), compat_rshift(x, 32 - n)), MASK32)
 end
 
 --- Right rotate operation.
@@ -174,8 +113,8 @@ end
 --- @return integer result Result of rotating x right by n positions
 function bit32.ror(x, n)
   n = n % 32
-  x = bit32.mask(x)
-  return bit32.mask(bit32.rshift(x, n) + bit32.lshift(x, 32 - n))
+  x = compat_band(x, MASK32)
+  return compat_band(compat_bor(compat_rshift(x, n), compat_lshift(x, 32 - n)), MASK32)
 end
 
 --- 32-bit addition with overflow handling.
@@ -183,27 +122,30 @@ end
 --- @param b integer Second operand (32-bit)
 --- @return integer result Result of (a + b) mod 2^32
 function bit32.add(a, b)
-  return bit32.mask(bit32.mask(a) + bit32.mask(b))
+  return compat_band(compat_band(a, MASK32) + compat_band(b, MASK32), MASK32)
 end
 
 --------------------------------------------------------------------------------
 -- Byte conversion functions
 --------------------------------------------------------------------------------
 
+local string_char = string.char
+local string_byte = string.byte
+
 --- Convert 32-bit unsigned integer to 4 bytes (big-endian).
 --- @param n integer 32-bit unsigned integer
 --- @return string bytes 4-byte string in big-endian order
 function bit32.u32_to_be_bytes(n)
-  n = bit32.mask(n)
-  return string.char(math.floor(n / 16777216) % 256, math.floor(n / 65536) % 256, math.floor(n / 256) % 256, n % 256)
+  n = compat_band(n, MASK32)
+  return string_char(math_floor(n / 16777216) % 256, math_floor(n / 65536) % 256, math_floor(n / 256) % 256, n % 256)
 end
 
 --- Convert 32-bit unsigned integer to 4 bytes (little-endian).
 --- @param n integer 32-bit unsigned integer
 --- @return string bytes 4-byte string in little-endian order
 function bit32.u32_to_le_bytes(n)
-  n = bit32.mask(n)
-  return string.char(n % 256, math.floor(n / 256) % 256, math.floor(n / 65536) % 256, math.floor(n / 16777216) % 256)
+  n = compat_band(n, MASK32)
+  return string_char(n % 256, math_floor(n / 256) % 256, math_floor(n / 65536) % 256, math_floor(n / 16777216) % 256)
 end
 
 --- Convert 4 bytes to 32-bit unsigned integer (big-endian).
@@ -213,7 +155,7 @@ end
 function bit32.be_bytes_to_u32(str, offset)
   offset = offset or 1
   assert(#str >= offset + 3, "Insufficient bytes for u32")
-  local b1, b2, b3, b4 = string.byte(str, offset, offset + 3)
+  local b1, b2, b3, b4 = string_byte(str, offset, offset + 3)
   return b1 * 16777216 + b2 * 65536 + b3 * 256 + b4
 end
 
@@ -224,7 +166,7 @@ end
 function bit32.le_bytes_to_u32(str, offset)
   offset = offset or 1
   assert(#str >= offset + 3, "Insufficient bytes for u32")
-  local b1, b2, b3, b4 = string.byte(str, offset, offset + 3)
+  local b1, b2, b3, b4 = string_byte(str, offset, offset + 3)
   return b1 + b2 * 256 + b3 * 65536 + b4 * 16777216
 end
 
@@ -239,6 +181,7 @@ local unpack_fn = unpack or table.unpack
 --- @return boolean result True if all tests pass, false otherwise
 function bit32.selftest()
   print("Running 32-bit operations test vectors...")
+  print(string.format("  Using: %s", impl_name()))
   local passed = 0
   local total = 0
 
@@ -502,9 +445,10 @@ local benchmark_op = require("bitn.utils.benchmark").benchmark_op
 
 --- Run performance benchmarks for 32-bit operations.
 function bit32.benchmark()
-  local iterations = 1000000
+  local iterations = 100000
 
   print("32-bit Bitwise Operations:")
+  print(string.format("  Implementation: %s", impl_name()))
 
   -- Test values
   local a, b = 0xAAAAAAAA, 0x55555555
diff --git a/src/bitn/bit64.lua b/src/bitn/bit64.lua
index 34eb794..066063d 100644
--- a/src/bitn/bit64.lua
+++ b/src/bitn/bit64.lua
@@ -1,13 +1,27 @@
 --- @module "bitn.bit64"
---- Pure Lua 64-bit bitwise operations library.
+--- 64-bit bitwise operations library.
 --- This module provides 64-bit bitwise operations using {high, low} pairs,
 --- where high is the upper 32 bits and low is the lower 32 bits.
---- Works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT without depending on
---- any built-in bit libraries.
---- @class bit64
+--- Works across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT.
+--- Uses native bit operations where available for optimal performance.
 local bit64 = {}
 
 local bit32 = require("bitn.bit32")
+local _compat = require("bitn._compat")
+local impl_name = _compat.impl_name
+
+-- Cache bit32 methods as locals for faster access
+local bit32_band = bit32.band
+local bit32_bor = bit32.bor
+local bit32_bxor = bit32.bxor
+local bit32_bnot = bit32.bnot
+local bit32_lshift = bit32.lshift
+local bit32_rshift = bit32.rshift
+local bit32_arshift = bit32.arshift
+local bit32_u32_to_be_bytes = bit32.u32_to_be_bytes
+local bit32_u32_to_le_bytes = bit32.u32_to_le_bytes
+local bit32_be_bytes_to_u32 = bit32.be_bytes_to_u32
+local bit32_le_bytes_to_u32 = bit32.le_bytes_to_u32
 
 -- Private metatable for Int64 type identification
 local Int64Meta = { __name = "Int64" }
@@ -43,7 +57,7 @@ end
 --- @param b Int64HighLow Second operand {high, low}
 --- @return Int64HighLow result {high, low} AND result
 function bit64.band(a, b)
-  return bit64.new(bit32.band(a[1], b[1]), bit32.band(a[2], b[2]))
+  return bit64.new(bit32_band(a[1], b[1]), bit32_band(a[2], b[2]))
 end
 
 --- Bitwise OR operation.
@@ -51,7 +65,7 @@ end
 --- @param b Int64HighLow Second operand {high, low}
 --- @return Int64HighLow result {high, low} OR result
 function bit64.bor(a, b)
-  return bit64.new(bit32.bor(a[1], b[1]), bit32.bor(a[2], b[2]))
+  return bit64.new(bit32_bor(a[1], b[1]), bit32_bor(a[2], b[2]))
 end
 
 --- Bitwise XOR operation.
@@ -59,14 +73,14 @@ end
 --- @param b Int64HighLow Second operand {high, low}
 --- @return Int64HighLow result {high, low} XOR result
 function bit64.bxor(a, b)
-  return bit64.new(bit32.bxor(a[1], b[1]), bit32.bxor(a[2], b[2]))
+  return bit64.new(bit32_bxor(a[1], b[1]), bit32_bxor(a[2], b[2]))
 end
 
 --- Bitwise NOT operation.
 --- @param a Int64HighLow Operand {high, low}
 --- @return Int64HighLow result {high, low} NOT result
 function bit64.bnot(a)
-  return bit64.new(bit32.bnot(a[1]), bit32.bnot(a[2]))
+  return bit64.new(bit32_bnot(a[1]), bit32_bnot(a[2]))
 end
 
 --------------------------------------------------------------------------------
@@ -84,11 +98,11 @@ function bit64.lshift(x, n)
     return bit64.new(0, 0)
   elseif n >= 32 then
     -- Shift by 32 or more: low becomes 0, high gets bits from low
-    return bit64.new(bit32.lshift(x[2], n - 32), 0)
+    return bit64.new(bit32_lshift(x[2], n - 32), 0)
   else
     -- Shift by less than 32
-    local new_high = bit32.bor(bit32.lshift(x[1], n), bit32.rshift(x[2], 32 - n))
-    local new_low = bit32.lshift(x[2], n)
+    local new_high = bit32_bor(bit32_lshift(x[1], n), bit32_rshift(x[2], 32 - n))
+    local new_low = bit32_lshift(x[2], n)
     return bit64.new(new_high, new_low)
   end
 end
@@ -104,11 +118,11 @@ function bit64.rshift(x, n)
     return bit64.new(0, 0)
   elseif n >= 32 then
     -- Shift by 32 or more: high becomes 0, low gets bits from high
-    return bit64.new(0, bit32.rshift(x[1], n - 32))
+    return bit64.new(0, bit32_rshift(x[1], n - 32))
   else
     -- Shift by less than 32
-    local new_low = bit32.bor(bit32.rshift(x[2], n), bit32.lshift(x[1], 32 - n))
-    local new_high = bit32.rshift(x[1], n)
+    local new_low = bit32_bor(bit32_rshift(x[2], n), bit32_lshift(x[1], 32 - n))
+    local new_high = bit32_rshift(x[1], n)
     return bit64.new(new_high, new_low)
   end
 end
@@ -123,7 +137,7 @@ function bit64.arshift(x, n)
   end
 
   -- Check sign bit (bit 31 of high word)
-  local is_negative = bit32.band(x[1], 0x80000000) ~= 0
+  local is_negative = bit32_band(x[1], 0x80000000) ~= 0
 
   if n >= 64 then
     -- All bits shift out, result is all 1s if negative, all 0s if positive
@@ -134,13 +148,13 @@ function bit64.arshift(x, n)
     end
   elseif n >= 32 then
     -- High word shifts into low, high fills with sign
-    local new_low = bit32.arshift(x[1], n - 32)
+    local new_low = bit32_arshift(x[1], n - 32)
     local new_high = is_negative and 0xFFFFFFFF or 0
     return bit64.new(new_high, new_low)
   else
     -- Shift by less than 32
-    local new_low = bit32.bor(bit32.rshift(x[2], n), bit32.lshift(x[1], 32 - n))
-    local new_high = bit32.arshift(x[1], n)
+    local new_low = bit32_bor(bit32_rshift(x[2], n), bit32_lshift(x[1], 32 - n))
+    local new_high = bit32_arshift(x[1], n)
     return bit64.new(new_high, new_low)
   end
 end
@@ -166,14 +180,14 @@ function bit64.rol(x, n)
     return bit64.new(low, high)
   elseif n < 32 then
     -- Rotate within 32-bit boundaries
-    local new_high = bit32.bor(bit32.lshift(high, n), bit32.rshift(low, 32 - n))
-    local new_low = bit32.bor(bit32.lshift(low, n), bit32.rshift(high, 32 - n))
+    local new_high = bit32_bor(bit32_lshift(high, n), bit32_rshift(low, 32 - n))
+    local new_low = bit32_bor(bit32_lshift(low, n), bit32_rshift(high, 32 - n))
     return bit64.new(new_high, new_low)
   else
     -- n > 32: rotate by (n - 32) after swapping
     n = n - 32
-    local new_high = bit32.bor(bit32.lshift(low, n), bit32.rshift(high, 32 - n))
-    local new_low = bit32.bor(bit32.lshift(high, n), bit32.rshift(low, 32 - n))
+    local new_high = bit32_bor(bit32_lshift(low, n), bit32_rshift(high, 32 - n))
+    local new_low = bit32_bor(bit32_lshift(high, n), bit32_rshift(low, 32 - n))
     return bit64.new(new_high, new_low)
   end
 end
@@ -195,14 +209,14 @@ function bit64.ror(x, n)
     return bit64.new(low, high)
   elseif n < 32 then
     -- Rotate within 32-bit boundaries
-    local new_low = bit32.bor(bit32.rshift(low, n), bit32.lshift(high, 32 - n))
-    local new_high = bit32.bor(bit32.rshift(high, n), bit32.lshift(low, 32 - n))
+    local new_low = bit32_bor(bit32_rshift(low, n), bit32_lshift(high, 32 - n))
+    local new_high = bit32_bor(bit32_rshift(high, n), bit32_lshift(low, 32 - n))
     return bit64.new(new_high, new_low)
   else
     -- n > 32: rotate by (n - 32) after swapping
     n = n - 32
-    local new_low = bit32.bor(bit32.rshift(high, n), bit32.lshift(low, 32 - n))
-    local new_high = bit32.bor(bit32.rshift(low, n), bit32.lshift(high, 32 - n))
+    local new_low = bit32_bor(bit32_rshift(high, n), bit32_lshift(low, 32 - n))
+    local new_high = bit32_bor(bit32_rshift(low, n), bit32_lshift(high, 32 - n))
     return bit64.new(new_high, new_low)
   end
 end
@@ -239,14 +253,14 @@ end
 --- @param x Int64HighLow 64-bit value {high, low}
 --- @return string bytes 8-byte string in big-endian order
 function bit64.u64_to_be_bytes(x)
-  return bit32.u32_to_be_bytes(x[1]) .. bit32.u32_to_be_bytes(x[2])
+  return bit32_u32_to_be_bytes(x[1]) .. bit32_u32_to_be_bytes(x[2])
 end
 
 --- Convert 64-bit value to 8 bytes (little-endian).
 --- @param x Int64HighLow 64-bit value {high, low}
 --- @return string bytes 8-byte string in little-endian order
 function bit64.u64_to_le_bytes(x)
-  return bit32.u32_to_le_bytes(x[2]) .. bit32.u32_to_le_bytes(x[1])
+  return bit32_u32_to_le_bytes(x[2]) .. bit32_u32_to_le_bytes(x[1])
 end
 
 --- Convert 8 bytes to 64-bit value (big-endian).
@@ -256,8 +270,8 @@ end
 function bit64.be_bytes_to_u64(str, offset)
   offset = offset or 1
   assert(#str >= offset + 7, "Insufficient bytes for u64")
-  local high = bit32.be_bytes_to_u32(str, offset)
-  local low = bit32.be_bytes_to_u32(str, offset + 4)
+  local high = bit32_be_bytes_to_u32(str, offset)
+  local low = bit32_be_bytes_to_u32(str, offset + 4)
   return bit64.new(high, low)
 end
 
@@ -268,8 +282,8 @@ end
 function bit64.le_bytes_to_u64(str, offset)
   offset = offset or 1
   assert(#str >= offset + 7, "Insufficient bytes for u64")
-  local low = bit32.le_bytes_to_u32(str, offset)
-  local high = bit32.le_bytes_to_u32(str, offset + 4)
+  local low = bit32_le_bytes_to_u32(str, offset)
+  local high = bit32_le_bytes_to_u32(str, offset + 4)
   return bit64.new(high, low)
 end
 
@@ -384,6 +398,7 @@ end
 --- @return boolean result True if all tests pass, false otherwise
 function bit64.selftest()
   print("Running 64-bit operations test vectors...")
+  print(string.format("  Using: %s", impl_name()))
   local passed = 0
   local total = 0
 
@@ -941,9 +956,10 @@ local benchmark_op = require("bitn.utils.benchmark").benchmark_op
 
 --- Run performance benchmarks for 64-bit operations.
 function bit64.benchmark()
-  local iterations = 1000000
+  local iterations = 100000
 
   print("64-bit Bitwise Operations:")
+  print(string.format("  Implementation: %s", impl_name()))
 
   -- Test values
   local a = bit64.new(0xAAAAAAAA, 0x55555555)
diff --git a/src/bitn/init.lua b/src/bitn/init.lua
index 6134acb..214300b 100644
--- a/src/bitn/init.lua
+++ b/src/bitn/init.lua
@@ -1,9 +1,9 @@
 --- @module "bitn"
---- Pure Lua bitwise operations library.
+--- Portable bitwise operations library with automatic optimization.
 --- This library provides standalone, version-agnostic implementations of
 --- bitwise operations for 16-bit, 32-bit, and 64-bit integers. It works
---- across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT without depending on any
---- built-in bit libraries.
+--- across Lua 5.1, 5.2, 5.3, 5.4, and LuaJIT with zero external dependencies.
+--- It automatically uses native bit operations when available for optimal performance.
 ---
 --- @usage
 --- local bitn = require("bitn")
@@ -18,13 +18,9 @@
 --- -- 16-bit operations
 --- local shifted = bitn.bit16.lshift(1, 8)  -- 256
 ---
---- @class bitn
 local bitn = {
-  --- @type bit16
   bit16 = require("bitn.bit16"),
-  --- @type bit32
   bit32 = require("bitn.bit32"),
-  --- @type bit64
   bit64 = require("bitn.bit64"),
 }