From e3c475698ac1c164a2d53a36689e917e9862811d Mon Sep 17 00:00:00 2001
From: Adam Korczynski
Date: Wed, 18 Feb 2026 21:10:26 +0000
Subject: [PATCH] Add module fuzzers

Signed-off-by: Adam Korczynski
---
 module-fuzzers/fuzz_crypto.cpp | 503 +++++++++++++
 module-fuzzers/fuzz_dataops.cpp | 1166 +++++++++++++++++++++++++++++++
 module-fuzzers/fuzz_decode.cpp | 1029 +++++++++++++++++++++++++++
 module-fuzzers/fuzz_helpers.h | 139 ++++
 module-fuzzers/fuzz_ioops.cpp | 1015 +++++++++++++++++++++++++++
 module-fuzzers/fuzz_parsers.cpp | 744 ++++++++++++++++++++
 module-fuzzers/fuzz_textops.cpp | 467 +++++++++++++
 7 files changed, 5063 insertions(+)
 create mode 100644 module-fuzzers/fuzz_crypto.cpp
 create mode 100644 module-fuzzers/fuzz_dataops.cpp
 create mode 100644 module-fuzzers/fuzz_decode.cpp
 create mode 100644 module-fuzzers/fuzz_helpers.h
 create mode 100644 module-fuzzers/fuzz_ioops.cpp
 create mode 100644 module-fuzzers/fuzz_parsers.cpp
 create mode 100644 module-fuzzers/fuzz_textops.cpp

diff --git a/module-fuzzers/fuzz_crypto.cpp b/module-fuzzers/fuzz_crypto.cpp
new file mode 100644
index 0000000..1ce4b9e
--- /dev/null
+++ b/module-fuzzers/fuzz_crypto.cpp
@@ -0,0 +1,503 @@
+// fuzz_crypto.cpp — Fuzzer for CPython's hash and HMAC C extension modules.
+//
+// This fuzzer exercises the following CPython C extension modules via
+// their Python API, called through the Python C API from C++:
+//
+//   _md5, _sha1, _sha2 — MD5, SHA-1, SHA-224/256/384/512
+//   _sha3 — SHA3-224/256/384/512, SHAKE-128/256
+//   _blake2 — BLAKE2b (64-byte key/16-byte salt/person),
+//             BLAKE2s (32-byte key/8-byte salt/person)
+//   _hmac — Low-level compute_md5/sha1/sha256/sha512
+//   hmac (Python module) — hmac.new(), hmac.digest(), hmac.compare_digest()
+//   hashlib (Python module) — hashlib.new(), hashlib.pbkdf2_hmac(),
+//                             hashlib.file_digest()
+//
+// The first byte of fuzz input selects one of 13 operation types. Each
+// operation consumes further bytes via FuzzedDataProvider to parameterize
+// the call (algorithm choice, key/salt/data sizes, action sequences).
+//
+// NOTE(review): the selector is read with fdp.ConsumeIntegralInRange(). With
+// LLVM's stock FuzzedDataProvider, integral values are taken from the END of
+// the input, so "first byte" may really be the last byte — confirm against
+// the FuzzedDataProvider that fuzz_helpers.h pulls in.
+//
+// Operations fall into two categories:
+//
+//   Chained — Create a hash/HMAC object, then loop up to 100 actions
+//   chosen from: .update(data), .digest(), .hexdigest(), .copy().digest(),
+//   and reading .name/.digest_size/.block_size attributes. Used for
+//   standard hashes, SHAKE (variable-length digest), BLAKE2 (keyed +
+//   variable digest_size), hmac.new(), and hashlib.new().
+//
+//   One-shot — A single function call: _hmac.compute_*(key, msg),
+//   hmac.digest(key, msg, algo), hmac.compare_digest(a, b),
+//   hashlib.file_digest(BytesIO, algo), hashlib.pbkdf2_hmac(algo, pw, salt, 1).
+//
+// All module functions and constructors are imported once during init and
+// cached as static PyObject* pointers. PyRef (RAII) prevents reference leaks.
+// PyGC_Collect() runs every 200 iterations. Max input size: 1 MB.
+
+#include "fuzz_helpers.h"
+
+// ---------------------------------------------------------------------------
+// Cached module objects, initialized once.
+// ---------------------------------------------------------------------------
+
+// Constructors of the C hash modules; filled in by init_crypto().
+static PyObject *ctor_md5, *ctor_sha1;
+static PyObject *ctor_sha224, *ctor_sha256, *ctor_sha384, *ctor_sha512;
+static PyObject *ctor_sha3_224, *ctor_sha3_256, *ctor_sha3_384, *ctor_sha3_512;
+static PyObject *ctor_shake_128, *ctor_shake_256;
+static PyObject *ctor_blake2b, *ctor_blake2s;
+
+// Fixed-digest constructors selectable by OP_HASH_CHAIN. Pointers to the
+// statics are stored because the statics are still NULL at this point.
+static PyObject **all_hash_ctors[] = {
+    &ctor_md5, &ctor_sha1, &ctor_sha224, &ctor_sha256,
+    &ctor_sha384, &ctor_sha512, &ctor_sha3_224, &ctor_sha3_256,
+    &ctor_sha3_384, &ctor_sha3_512, &ctor_blake2b, &ctor_blake2s,
+};
+static constexpr int kNumHashCtors =
+    sizeof(all_hash_ctors) / sizeof(all_hash_ctors[0]);
+
+// XOF constructors selectable by OP_SHAKE_CHAIN.
+static PyObject **shake_ctors[] = {&ctor_shake_128, &ctor_shake_256};
+static constexpr int kNumShakeCtors = 2;
+
+// _hmac.compute_* functions that were found at init time; the count may be
+// smaller than 4 (or 0) when the module or a function is unavailable.
+static PyObject *hmac_compute_funcs[4];
+static int num_hmac_compute_funcs = 0;
+
+static PyObject *hashlib_new, *hashlib_pbkdf2_hmac, *hashlib_file_digest;
+static PyObject *py_hmac_new, *py_hmac_digest, *py_hmac_compare_digest;
+static PyObject *bytesio_ctor;
+
+// digestmod names selectable by OP_PYHMAC_CHAIN.
+static const char *kHmacAlgos[] = {
+    "md5", "sha224", "sha256", "sha384", "sha512", "sha3_256", "blake2s",
+};
+static constexpr int kNumHmacAlgos =
+    sizeof(kHmacAlgos) / sizeof(kHmacAlgos[0]);
+
+// Algorithm names selectable by OP_PBKDF2.
+static const char *kPbkdf2Algos[] = {"sha1", "sha256", "sha512"};
+static constexpr int kNumPbkdf2Algos = 3;
+
+// Algorithm names selectable by OP_HASHLIB_CHAIN / OP_HASHLIB_FILE_DIGEST.
+static const char *kHashlibAlgos[] = {"md5", "sha256", "sha3_256", "sha512"};
+static constexpr int kNumHashlibAlgos = 4;
+
+// Number of fuzz iterations executed; drives the periodic PyGC_Collect().
+static unsigned long gc_counter = 0;
+
+// Set to 1 once init_crypto() has completed.
+static int initialized = 0;
+
+// One-time initialization: import every constructor/function used by the
+// operations below and cache it in the statics above. Subsequent calls
+// return immediately.
+static void init_crypto(void) {
+    if (initialized) return;
+
+    // Table of (destination static, module name, attribute name).
+    struct {
+        PyObject **dest;
+        const char *mod, *attr;
+    } inits[] = {
+        {&ctor_md5, "_md5", "md5"},
+        {&ctor_sha1, "_sha1", "sha1"},
+        {&ctor_sha224, "_sha2", "sha224"},
+        {&ctor_sha256, "_sha2", "sha256"},
+        {&ctor_sha384, "_sha2", "sha384"},
+        {&ctor_sha512, "_sha2", "sha512"},
+        {&ctor_sha3_224, "_sha3", "sha3_224"},
+        {&ctor_sha3_256, "_sha3", "sha3_256"},
+        {&ctor_sha3_384, "_sha3", "sha3_384"},
+        {&ctor_sha3_512, "_sha3", "sha3_512"},
+        {&ctor_shake_128, "_sha3", "shake_128"},
+        {&ctor_shake_256, "_sha3", "shake_256"},
+        {&ctor_blake2b, "_blake2", "blake2b"},
+        {&ctor_blake2s, "_blake2", "blake2s"},
+    };
+    // import_attr() comes from fuzz_helpers.h; its failure behavior is not
+    // visible here — TODO confirm it never returns NULL silently.
+    for (auto &i : inits)
+        *i.dest = import_attr(i.mod, i.attr);
+
+    // _hmac is treated as optional: a missing module or missing function is
+    // tolerated and simply leaves fewer entries in hmac_compute_funcs.
+    PyObject *hmac_mod = PyImport_ImportModule("_hmac");
+    if (hmac_mod) {
+        const char *names[] = {
+            "compute_md5", "compute_sha1", "compute_sha256", "compute_sha512",
+        };
+        for (auto name : names) {
+            PyObject *fn = PyObject_GetAttrString(hmac_mod, name);
+            if (fn)
+                hmac_compute_funcs[num_hmac_compute_funcs++] = fn;
+            else
+                PyErr_Clear();
+        }
+        Py_DECREF(hmac_mod);
+    } else {
+        PyErr_Clear();
+    }
+
+    hashlib_new = import_attr("hashlib", "new");
+    hashlib_pbkdf2_hmac = import_attr("hashlib", "pbkdf2_hmac");
+    hashlib_file_digest = import_attr("hashlib", "file_digest");
+    py_hmac_new = import_attr("hmac", "new");
+    py_hmac_digest = import_attr("hmac", "digest");
+    py_hmac_compare_digest = import_attr("hmac", "compare_digest");
+    bytesio_ctor = import_attr("io", "BytesIO");
+
+    // No Python error may be left pending once initialization is done.
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Chained action loop — shared by OP_HASH_CHAIN, OP_BLAKE2*_KEYED,
+// OP_BLAKE2*_VARDIGEST, OP_PYHMAC_CHAIN, and OP_HASHLIB_CHAIN.
+// (op_shake_chain runs its own loop because it passes a length to digest().)
+//
+// Takes a borrowed reference to a hash-like object and loops up to 100
+// fuzz-driven actions: .update(data), .digest(), .hexdigest(),
+// .copy().digest(), and attribute reads (.name, .digest_size, .block_size).
+// ---------------------------------------------------------------------------
+
+// h:   borrowed reference to the hash/HMAC object to drive.
+// fdp: fuzz input; one selector is consumed per action, plus a length and
+//      payload for .update().
+// CHECK() is defined in fuzz_helpers.h; it presumably returns from this
+// function when its argument is NULL — TODO confirm whether it also clears
+// the pending Python error.
+static void chain_hash_actions(PyObject *h, FuzzedDataProvider &fdp) {
+    // Stop after 100 actions or as soon as the fuzz input is exhausted.
+    for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
+        switch (fdp.ConsumeIntegralInRange(0, 4)) {
+            case 0: { // .update(data)
+                // Payload is capped at 10000 bytes and at what is left.
+                // NOTE(review): the two bounds below are int and size_t; an
+                // explicit template argument (e.g. <size_t>) looks lost.
+                std::string data = fdp.ConsumeBytesAsString(
+                    fdp.ConsumeIntegralInRange(
+                        0, std::min(fdp.remaining_bytes(), (size_t)10000)));
+                PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
+                CHECK(r);
+                break;
+            }
+            case 1: { // .digest()
+                PyRef d = PyObject_CallMethod(h, "digest", NULL);
+                CHECK(d);
+                break;
+            }
+            case 2: { // .hexdigest()
+                PyRef d = PyObject_CallMethod(h, "hexdigest", NULL);
+                CHECK(d);
+                break;
+            }
+            case 3: { // .copy().digest()
+                PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
+                CHECK(h2);
+                PyRef d = PyObject_CallMethod(h2, "digest", NULL);
+                CHECK(d);
+                break;
+            }
+            case 4: { // .name, .digest_size, .block_size
+                PyRef n = PyObject_GetAttrString(h, "name");
+                CHECK(n);
+                PyRef ds = PyObject_GetAttrString(h, "digest_size");
+                CHECK(ds);
+                PyRef bs = PyObject_GetAttrString(h, "block_size");
+                CHECK(bs);
+                break;
+            }
+        }
+    }
+    // Never leave a Python error pending for the next fuzz iteration.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations (13 ops).
+// ---------------------------------------------------------------------------
+
+// OP_HASH_CHAIN: Create a hash object from one of 12 C module constructors
+// (_md5.md5, _sha1.sha1, _sha2.sha224/256/384/512, _sha3.sha3_224/256/384/512,
+// _blake2.blake2b/s) with fuzz-chosen initial data, then run chained actions.
+// ctor: borrowed reference to the constructor picked by the dispatcher.
+static void op_hash_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
+    // Initial data (0..10000 bytes) is passed positionally as bytes.
+    std::string init = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, 10000));
+    PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_SHAKE_CHAIN: Create a SHAKE-128 or SHAKE-256 XOF object, then loop
+// up to 100 actions: .update(data), .digest(variable_length), or
+// .copy().digest(variable_length). Exercises the variable-output-length
+// code paths in _sha3.
+//
+// This op has its own loop instead of using chain_hash_actions() because
+// every digest() call here takes an explicit length argument.
+static void op_shake_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
+    std::string init = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, 10000));
+    PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
+    CHECK(h);
+    // Stop after 100 actions or as soon as the fuzz input is exhausted.
+    for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
+        switch (fdp.ConsumeIntegralInRange(0, 2)) {
+            case 0: { // .update(data)
+                // NOTE(review): the two bounds below are int and size_t; an
+                // explicit template argument (e.g. <size_t>) looks lost.
+                std::string data = fdp.ConsumeBytesAsString(
+                    fdp.ConsumeIntegralInRange(
+                        0, std::min(fdp.remaining_bytes(), (size_t)10000)));
+                PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
+                CHECK(r);
+                break;
+            }
+            case 1: { // .digest(len) with len in 1..10000
+                int len = fdp.ConsumeIntegralInRange(1, 10000);
+                PyRef d = PyObject_CallMethod(h, "digest", "i", len);
+                CHECK(d);
+                break;
+            }
+            case 2: { // .copy().digest(len) with len in 1..10000
+                PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
+                CHECK(h2);
+                int len = fdp.ConsumeIntegralInRange(1, 10000);
+                PyRef d = PyObject_CallMethod(h2, "digest", "i", len);
+                CHECK(d);
+                break;
+            }
+        }
+    }
+    // Never leave a Python error pending for the next fuzz iteration.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_BLAKE2B_KEYED / OP_BLAKE2S_KEYED: Create a BLAKE2 object with
+// fuzz-chosen key, salt, and person parameters (up to max_key/max_salt/
+// max_person bytes respectively), then run chained hash actions.
+// BLAKE2b: key<=64, salt<=16, person<=16. BLAKE2s: key<=32, salt<=8, person<=8.
+// ctor: borrowed reference to _blake2.blake2b or _blake2.blake2s.
+// max_key / max_salt / max_person: upper bounds (in bytes) for the three
+// keyword arguments, supplied by the dispatcher.
+static void op_blake2_keyed(PyObject *ctor, int max_key, int max_salt,
+                            int max_person, FuzzedDataProvider &fdp) {
+    // All fuzz-derived inputs are consumed up front, in this fixed order.
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, max_key));
+    std::string salt = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, max_salt));
+    std::string person = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, max_person));
+    std::string data = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, 10000));
+
+    // Build kwargs = {"key": ..., "salt": ..., "person": ...}.
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef k = PyBytes_FromStringAndSize(Y(key));
+    CHECK(k);
+    PyRef s = PyBytes_FromStringAndSize(Y(salt));
+    CHECK(s);
+    PyRef p = PyBytes_FromStringAndSize(Y(person));
+    CHECK(p);
+    // Return values are not checked here.
+    PyDict_SetItemString(kwargs, "key", k);
+    PyDict_SetItemString(kwargs, "salt", s);
+    PyDict_SetItemString(kwargs, "person", p);
+
+    // Call ctor(data, key=..., salt=..., person=...).
+    PyRef d = PyBytes_FromStringAndSize(Y(data));
+    CHECK(d);
+    PyRef args = PyTuple_Pack(1, (PyObject *)d);
+    CHECK(args);
+    PyRef h = PyObject_Call(ctor, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_BLAKE2B_VARDIGEST / OP_BLAKE2S_VARDIGEST: Create a BLAKE2 object with
+// a fuzz-chosen digest_size (1 to max_ds bytes), then run chained actions.
+// Exercises the variable output length code path in _blake2.
+static void op_blake2_vardigest(PyObject *ctor, int max_ds,
+                                FuzzedDataProvider &fdp) {
+    int ds = fdp.ConsumeIntegralInRange(1, max_ds);
+    std::string data = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, 10000));
+
+    // Build kwargs = {"digest_size": ds}.
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef dsobj = PyLong_FromLong(ds);
+    CHECK(dsobj);
+    PyDict_SetItemString(kwargs, "digest_size", dsobj);
+
+    // Call ctor(data, digest_size=ds).
+    PyRef d = PyBytes_FromStringAndSize(Y(data));
+    CHECK(d);
+    PyRef args = PyTuple_Pack(1, (PyObject *)d);
+    CHECK(args);
+    PyRef h = PyObject_Call(ctor, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_HMAC_COMPUTE: One-shot call to one of _hmac.compute_md5/sha1/sha256/sha512
+// with fuzz-chosen key and message.
+// These are the low-level C implementations
+// of HMAC in the _hmac module (not the Python hmac wrapper).
+//
+// func: borrowed reference to one of the cached _hmac.compute_* callables.
+static void op_hmac_compute(PyObject *func, FuzzedDataProvider &fdp) {
+    // Requested key length is 1..10000, but fewer bytes may be available;
+    // an empty key is replaced by a single NUL byte.
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(1, 10000));
+    if (key.empty()) key.push_back('\x00');
+    // Everything left in the input becomes the message.
+    std::string msg = fdp.ConsumeRemainingBytesAsString();
+    PyRef r = PyObject_CallFunction(func, "y#y#", Y(key), Y(msg));
+    // The result is discarded; only a pending error needs cleaning up.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PYHMAC_CHAIN: Create an HMAC object via hmac.new(key, digestmod=algo)
+// where algo is fuzz-chosen from {md5, sha224, sha256, sha384, sha512,
+// sha3_256, blake2s}, then run chained hash actions (update/digest/copy/etc).
+// Exercises the Python hmac module which delegates to C hash constructors.
+static void op_pyhmac_chain(const char *algo, FuzzedDataProvider &fdp) {
+    // Same key policy as op_hmac_compute: never pass an empty key.
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(1, 10000));
+    if (key.empty()) key.push_back('\x00');
+
+    // Call hmac.new(key, digestmod=algo).
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef dm = PyUnicode_FromString(algo);
+    CHECK(dm);
+    PyDict_SetItemString(kwargs, "digestmod", dm);
+    PyRef kb = PyBytes_FromStringAndSize(Y(key));
+    CHECK(kb);
+    PyRef args = PyTuple_Pack(1, (PyObject *)kb);
+    CHECK(args);
+    PyRef h = PyObject_Call(py_hmac_new, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_HMAC_DIGEST: One-shot call to hmac.digest(key, msg, "sha256").
+// Exercises the fast single-call HMAC path without creating an HMAC object.
+static void op_hmac_digest(FuzzedDataProvider &fdp) {
+    // Requested key length is 1..10000, but fewer bytes may be available;
+    // an empty key is replaced by a single NUL byte.
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(1, 10000));
+    if (key.empty()) key.push_back('\x00');
+    // Everything left in the input becomes the message.
+    std::string msg = fdp.ConsumeRemainingBytesAsString();
+    PyRef r = PyObject_CallFunction(py_hmac_digest, "y#y#s",
+                                    Y(key), Y(msg), "sha256");
+    // The result is discarded; only a pending error needs cleaning up.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_HMAC_COMPARE: Compute HMAC-SHA256 of fuzz data under the fixed key
+// b"k", then call hmac.compare_digest() against a zero-padded 32-byte buffer
+// derived from the same data. Exercises the constant-time comparison code
+// path.
+static void op_hmac_compare(FuzzedDataProvider &fdp) {
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    // The key must be a bytes object: hmac.new() raises TypeError for a str
+    // key, which would make this op bail out before compare_digest() runs.
+    // "y" builds bytes from a NUL-terminated C string.
+    PyRef h = PyObject_CallFunction(py_hmac_new, "yy#s",
+                                    "k", Y(data), "sha256");
+    CHECK(h);
+    PyRef dig = PyObject_CallMethod(h, "digest", NULL);
+    CHECK(dig);
+    // Second operand: the first (up to) 32 input bytes, zero-padded to 32 so
+    // both operands have the length of a SHA-256 digest.
+    char padded[32] = {};
+    memcpy(padded, data.data(), data.size() < 32 ? data.size() : 32);
+    PyRef padobj = PyBytes_FromStringAndSize(padded, 32);
+    CHECK(padobj);
+    PyRef r = PyObject_CallFunction(py_hmac_compare_digest, "OO",
+                                    (PyObject *)dig, (PyObject *)padobj);
+    // The result is discarded; only a pending error needs cleaning up.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_HASHLIB_CHAIN: Create a hash object via hashlib.new(algo, data,
+// usedforsecurity=False) where algo is fuzz-chosen from {md5, sha256,
+// sha3_256, sha512}, then run chained actions. Unlike OP_HASH_CHAIN which
+// uses the C module constructors directly, this goes through hashlib's
+// dispatch logic (OpenSSL vs builtin).
+// algo: one of kHashlibAlgos, chosen by the dispatcher.
+static void op_hashlib_chain(const char *algo, FuzzedDataProvider &fdp) {
+    std::string init = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(0, 10000));
+    // Call hashlib.new(algo, init, usedforsecurity=False).
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyDict_SetItemString(kwargs, "usedforsecurity", Py_False);
+    PyRef name = PyUnicode_FromString(algo);
+    CHECK(name);
+    PyRef d = PyBytes_FromStringAndSize(Y(init));
+    CHECK(d);
+    PyRef args = PyTuple_Pack(2, (PyObject *)name, (PyObject *)d);
+    CHECK(args);
+    PyRef h = PyObject_Call(hashlib_new, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_HASHLIB_FILE_DIGEST: One-shot call to hashlib.file_digest(BytesIO(data),
+// algo) with fuzz-chosen algorithm, then .hexdigest(). Exercises the
+// file-based hashing path that reads from a file-like object.
+static void op_hashlib_file_digest(const char *algo, FuzzedDataProvider &fdp) {
+    // All remaining input becomes the content of the in-memory file.
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    PyRef bio = PyObject_CallFunction(bytesio_ctor, "y#", Y(data));
+    CHECK(bio);
+    PyRef h = PyObject_CallFunction(hashlib_file_digest, "Os",
+                                    (PyObject *)bio, algo);
+    CHECK(h);
+    PyRef r = PyObject_CallMethod(h, "hexdigest", NULL);
+    // The result is discarded; only a pending error needs cleaning up.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PBKDF2: One-shot call to hashlib.pbkdf2_hmac(algo, password, salt, 1)
+// with fuzz-chosen algorithm from {sha1, sha256, sha512}. Uses 1 iteration
+// to keep execution fast while still exercising the PBKDF2 code path.
+static void op_pbkdf2(const char *algo, FuzzedDataProvider &fdp) {
+    // The salt is consumed first (never empty); the rest is the password.
+    std::string salt = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange(1, 10000));
+    if (salt.empty()) salt.push_back('\x00');
+    std::string pw = fdp.ConsumeRemainingBytesAsString();
+    // Argument order follows pbkdf2_hmac: (name, password, salt, iterations).
+    PyRef r = PyObject_CallFunction(hashlib_pbkdf2_hmac, "sy#y#i",
+                                    algo, Y(pw), Y(salt), 1);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// ---------------------------------------------------------------------------
+
+// Operation selectors. NUM_OPS is the count and must stay last; the
+// dispatcher draws a value in [0, NUM_OPS - 1].
+enum Op {
+    OP_HASH_CHAIN,
+    OP_SHAKE_CHAIN,
+    OP_BLAKE2B_KEYED,
+    OP_BLAKE2S_KEYED,
+    OP_BLAKE2B_VARDIGEST,
+    OP_BLAKE2S_VARDIGEST,
+    OP_HMAC_COMPUTE,
+    OP_PYHMAC_CHAIN,
+    OP_HMAC_DIGEST,
+    OP_HMAC_COMPARE,
+    OP_HASHLIB_CHAIN,
+    OP_HASHLIB_FILE_DIGEST,
+    OP_PBKDF2,
+    NUM_OPS
+};
+
+// libFuzzer entry point: picks one operation from the input and runs it.
+// Always returns 0. kMaxInputSize and kGcInterval are not defined in this
+// file (presumably in fuzz_helpers.h).
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    // The interpreter must already be running when the first input arrives.
+    assert(Py_IsInitialized());
+    init_crypto();
+    if (size < 1 || size > kMaxInputSize) return 0;
+    // Start every iteration without a pending Python error.
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) {
+        case OP_HASH_CHAIN: {
+            int ci = fdp.ConsumeIntegralInRange(0, kNumHashCtors - 1);
+            op_hash_chain(*all_hash_ctors[ci], fdp);
+            break;
+        }
+        case OP_SHAKE_CHAIN: {
+            int ci = fdp.ConsumeIntegralInRange(0, kNumShakeCtors - 1);
+            op_shake_chain(*shake_ctors[ci], fdp);
+            break;
+        }
+        // Limits passed below are (max key, max salt, max person) bytes.
+        case OP_BLAKE2B_KEYED:
+            op_blake2_keyed(ctor_blake2b, 64, 16, 16, fdp);
+            break;
+        case OP_BLAKE2S_KEYED:
+            op_blake2_keyed(ctor_blake2s, 32, 8, 8, fdp);
+            break;
+        // Limit passed below is the maximum digest_size in bytes.
+        case OP_BLAKE2B_VARDIGEST:
+            op_blake2_vardigest(ctor_blake2b, 64, fdp);
+            break;
+        case OP_BLAKE2S_VARDIGEST:
+            op_blake2_vardigest(ctor_blake2s, 32, fdp);
+            break;
+        case OP_HMAC_COMPUTE:
+            // Skipped entirely when no _hmac.compute_* function was found.
+            if (num_hmac_compute_funcs > 0) {
+                int fi = fdp.ConsumeIntegralInRange(
+                    0, num_hmac_compute_funcs - 1);
+                op_hmac_compute(hmac_compute_funcs[fi], fdp);
+            }
+            break;
+        case OP_PYHMAC_CHAIN: {
+            int ai = fdp.ConsumeIntegralInRange(0, kNumHmacAlgos - 1);
+            op_pyhmac_chain(kHmacAlgos[ai], fdp);
+            break;
+        }
+        case OP_HMAC_DIGEST:
+            op_hmac_digest(fdp);
+            break;
+        case OP_HMAC_COMPARE:
+            op_hmac_compare(fdp);
+            break;
+        case OP_HASHLIB_CHAIN: {
+            int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
+            op_hashlib_chain(kHashlibAlgos[ai], fdp);
+            break;
+        }
+        case OP_HASHLIB_FILE_DIGEST: {
+            int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
+            op_hashlib_file_digest(kHashlibAlgos[ai], fdp);
+            break;
+        }
+        case OP_PBKDF2: {
+            int ai = fdp.ConsumeIntegralInRange(0, kNumPbkdf2Algos - 1);
+            op_pbkdf2(kPbkdf2Algos[ai], fdp);
+            break;
+        }
+    }
+
+    // Run a full garbage collection every kGcInterval iterations.
+    if (++gc_counter % kGcInterval == 0) PyGC_Collect();
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_dataops.cpp b/module-fuzzers/fuzz_dataops.cpp
new file mode 100644
index 0000000..10afe04
--- /dev/null
+++ b/module-fuzzers/fuzz_dataops.cpp
@@ -0,0 +1,1166 @@
+// fuzz_dataops.cpp — Fuzzer for CPython's data-structure C extension modules.
+//
+// This fuzzer exercises the following CPython C extension modules via
+// their Python API, called through the Python C API from C++:
+//
+//   array — array(typecode) with frombytes, tobytes, tolist,
+//           reverse, byteswap, append, extend, pop, count,
+//           index, insert, remove, buffer_info, __sizeof__,
+//           __contains__, __iter__, slice ops, comparison,
+//           concatenation, repetition, fromlist
+//   _ctypes — c_char/c_int/c_double.from_buffer_copy(),
+//             create_string_buffer, (c_char*N).from_buffer_copy,
+//             Structure.from_buffer_copy
+//   mmap — anonymous mmap: write, find, rfind, read, readline,
+//          seek, resize, move, getitem, setitem, flush, size,
+//          tell, close, context manager
+//   _locale — strxfrm, strcoll
+//   _dbm — dbm.open, write, read, keys, delete, iteration
+//   _sqlite3 — connect(':memory:'), execute, executemany,
+//              executescript, complete_statement, create_function,
+//              create_aggregate, set_authorizer, create_collation,
+//              Row factory, blobopen, register_adapter
+//
+// The first byte of fuzz input selects one of 9 operation types. Each
+// operation consumes further bytes via FuzzedDataProvider to parameterize
+// the call (typecode, sub-operation, SQL, key/value splits).
+//
+// All module functions and class constructors are imported once during init
+// and cached as static PyObject* pointers. Two helper classes (Structure
+// subclass, Aggregate class) are defined via PyRun_String at init time.
+// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200
+// iterations. Max input size: 64 KB.
+
+#include "fuzz_helpers.h"
+
+// ---------------------------------------------------------------------------
+// Cached module objects, initialized once.
+// ---------------------------------------------------------------------------
+
+// array
+static PyObject *array_array;
+
+// ctypes
+static PyObject *ct_c_char, *ct_c_int, *ct_c_double;
+static PyObject *ct_create_string_buffer, *ct_sizeof;
+static PyObject *ct_Structure_cls;  // the _S class defined in init_dataops()
+
+// mmap
+static PyObject *mmap_mmap;
+
+// locale
+static PyObject *locale_strxfrm, *locale_strcoll;
+
+// dbm
+static PyObject *dbm_open;
+
+// sqlite3
+static PyObject *sqlite3_connect, *sqlite3_complete_statement;
+static PyObject *sqlite3_register_adapter, *sqlite3_Row;
+static long sqlite3_SQLITE_OK_val;  // value of sqlite3.SQLITE_OK as a C long
+static PyObject *sqlite3_Aggregate_cls;  // the _Agg class from init_dataops()
+
+// Number of fuzz iterations executed; drives the periodic PyGC_Collect().
+static unsigned long gc_counter = 0;
+
+// Set to 1 once init_dataops() has completed.
+static int initialized = 0;
+
+// One-time initialization: import every callable used by the operations,
+// define the two helper classes, and silence Python warnings. Subsequent
+// calls return immediately. Aborts if a helper class cannot be defined.
+static void init_dataops(void) {
+    if (initialized) return;
+
+    // array
+    array_array = import_attr("array", "array");
+
+    // ctypes
+    ct_c_char = import_attr("ctypes", "c_char");
+    ct_c_int = import_attr("ctypes", "c_int");
+    ct_c_double = import_attr("ctypes", "c_double");
+    ct_create_string_buffer = import_attr("ctypes", "create_string_buffer");
+    ct_sizeof = import_attr("ctypes", "sizeof");
+
+    // ctypes Structure subclass.
+    {
+        // Run the class definition in a fresh namespace that only has
+        // __builtins__, then keep a strong reference to the class.
+        PyObject *globals = PyDict_New();
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyObject *r = PyRun_String(
+            "import ctypes\n"
+            "class _S(ctypes.Structure):\n"
+            " _fields_ = [('a', ctypes.c_int), ('b', ctypes.c_double)]\n",
+            Py_file_input, globals, globals);
+        if (!r) { PyErr_Print(); abort(); }
+        Py_DECREF(r);
+        // PyDict_GetItemString returns a borrowed reference, hence INCREF
+        // before the namespace dict is released.
+        ct_Structure_cls = PyDict_GetItemString(globals, "_S");
+        Py_INCREF(ct_Structure_cls);
+        Py_DECREF(globals);
+    }
+
+    // mmap
+    mmap_mmap = import_attr("mmap", "mmap");
+
+    // locale
+    locale_strxfrm = import_attr("locale", "strxfrm");
+    locale_strcoll = import_attr("locale", "strcoll");
+
+    // dbm
+    dbm_open = import_attr("dbm", "open");
+
+    // sqlite3
+    sqlite3_connect = import_attr("sqlite3", "connect");
+    sqlite3_complete_statement = import_attr("sqlite3", "complete_statement");
+    sqlite3_register_adapter = import_attr("sqlite3", "register_adapter");
+    sqlite3_Row = import_attr("sqlite3", "Row");
+    {
+        // Only the integer value is kept; the Python object is released.
+        PyObject *v = import_attr("sqlite3", "SQLITE_OK");
+        sqlite3_SQLITE_OK_val = PyLong_AsLong(v);
+        Py_DECREF(v);
+    }
+
+    // Aggregate class for sqlite3.
+    {
+        // Same pattern as the Structure subclass above.
+        PyObject *globals = PyDict_New();
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyObject *r = PyRun_String(
+            "class _Agg:\n"
+            " def __init__(self): self.vals = []\n"
+            " def step(self, v): self.vals.append(v)\n"
+            " def finalize(self): return len(self.vals)\n",
+            Py_file_input, globals, globals);
+        if (!r) { PyErr_Print(); abort(); }
+        Py_DECREF(r);
+        sqlite3_Aggregate_cls = PyDict_GetItemString(globals, "_Agg");
+        Py_INCREF(sqlite3_Aggregate_cls);
+        Py_DECREF(globals);
+    }
+
+    // Suppress warnings.
+    PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
+
+    // No Python error may be left pending once initialization is done.
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Item sizes for array typecodes.
+// Returns the size in bytes of one element of array.array(tc).
+//
+// array.array stores native C values, so the sizes are taken from sizeof()
+// of the corresponding C type instead of being hard-coded. In particular
+// 'l'/'L' are C long, which is 8 bytes on LP64 platforms; assuming 4 there
+// makes frombytes() reject any buffer whose length is not a multiple of 8.
+// Unknown typecodes fall back to 1.
+static int typecode_itemsize(char tc) {
+    switch (tc) {
+        case 'b': case 'B': return (int)sizeof(char);
+        case 'H': return (int)sizeof(unsigned short);
+        case 'i': case 'I': return (int)sizeof(int);
+        case 'l': case 'L': return (int)sizeof(long);
+        case 'q': case 'Q': return (int)sizeof(long long);
+        case 'f': return (int)sizeof(float);
+        case 'd': return (int)sizeof(double);
+        default: return 1;
+    }
+}
+
+// Create an array with the given typecode and aligned data.
+//
+// data is truncated to a whole number of items; when it holds less than one
+// item it is zero-padded to exactly one, so the result is never empty.
+// Returns a new reference, or NULL on failure. A frombytes() failure is
+// cleared here; a failure to create the array or the bytes object leaves
+// the Python error set for the caller.
+static PyObject *make_array(char tc, const std::string &data) {
+    int item_sz = typecode_itemsize(tc);
+    size_t aligned_len = (data.size() / item_sz) * item_sz;
+    if (aligned_len == 0) aligned_len = item_sz;
+
+    // array.array takes the typecode as a one-character string.
+    char tc_str[2] = {tc, '\0'};
+    PyObject *arr = PyObject_CallFunction(array_array, "s", tc_str);
+    if (!arr) return NULL;
+
+    // frombytes with aligned data.
+    std::string aligned = data.substr(0, aligned_len);
+    if (aligned.size() < (size_t)item_sz) {
+        aligned.resize(item_sz, '\0');
+    }
+    PyRef pydata = PyBytes_FromStringAndSize(aligned.data(), aligned.size());
+    if (!pydata) { Py_DECREF(arr); return NULL; }
+    PyRef r = PyObject_CallMethod(arr, "frombytes", "O", (PyObject *)pydata);
+    if (!r) { PyErr_Clear(); Py_DECREF(arr); return NULL; }
+    return arr;
+}
+
+// ---------------------------------------------------------------------------
+// Operations (9 ops).
+// ---------------------------------------------------------------------------
+
+// OP_ARRAY_FROMBYTES: FDP selects typecode, creates array from aligned fuzz
+// data, then calls tobytes/tolist/reverse/byteswap. Exercises the array C
+// module's core buffer and conversion operations.
+static void op_array_frombytes(FuzzedDataProvider &fdp) {
+    // 11 typecodes, indices 0..10 (the terminating NUL is never selected).
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    // At most 10000 bytes of payload.
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef arr(make_array(tc, data));
+    CHECK(arr);
+
+    // Each call below is best-effort: its result is dropped and any error is
+    // cleared so the following calls still run.
+    {
+        PyRef r = PyObject_CallMethod(arr, "tobytes", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+    {
+        PyRef r = PyObject_CallMethod(arr, "tolist", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+    {
+        PyRef r = PyObject_CallMethod(arr, "reverse", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+    {
+        PyRef r = PyObject_CallMethod(arr, "byteswap", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+}
+
+// OP_ARRAY_METHODS: FDP selects typecode, creates array, then exercises
+// append/extend/pop/count/index/insert/remove/buffer_info/__sizeof__/
+// __contains__/len. Exercises the array C module's element ops.
+// (No iteration over the array is performed in this function.)
+static void op_array_methods(FuzzedDataProvider &fdp) {
+    // 11 typecodes, indices 0..10 (the terminating NUL is never selected).
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef arr(make_array(tc, data));
+    CHECK(arr);
+
+    // append(0)
+    {
+        PyRef zero = PyLong_FromLong(0);
+        CHECK(zero);
+        PyRef r = PyObject_CallMethod(arr, "append", "O", (PyObject *)zero);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // extend with a slice.
+    {
+        // arr[0:1] has the same typecode, which extend() requires.
+        PyRef slice = PySequence_GetSlice(arr, 0, 1);
+        if (slice) {
+            PyRef r = PyObject_CallMethod(arr, "extend", "O", (PyObject *)slice);
+            if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+            PyErr_Clear();
+        }
+    }
+
+    // pop()
+    {
+        PyRef r = PyObject_CallMethod(arr, "pop", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // count(first_element) and index(first_element)
+    {
+        PyRef first = PySequence_GetItem(arr, 0);
+        if (first) {
+            PyRef c = PyObject_CallMethod(arr, "count", "O", (PyObject *)first);
+            if (PyErr_Occurred()) PyErr_Clear();
+            PyRef idx = PyObject_CallMethod(arr, "index", "O", (PyObject *)first);
+            if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+            PyErr_Clear();
+        }
+    }
+
+    // insert(0, 42) + remove(42)
+    {
+        // Both calls pass the integer 42, including for the float typecodes.
+        PyRef val = PyLong_FromLong(42);
+        CHECK(val);
+        PyRef r = PyObject_CallMethod(arr, "insert", "iO", 0, (PyObject *)val);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r2 = PyObject_CallMethod(arr, "remove", "O", (PyObject *)val);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // buffer_info, __sizeof__
+    {
+        PyRef bi = PyObject_CallMethod(arr, "buffer_info", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef sz = PyObject_CallMethod(arr, "__sizeof__", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // __contains__ and len
+    {
+        PyRef first = PySequence_GetItem(arr, 0);
+        if (first) {
+            int r = PySequence_Contains(arr, first);
+            (void)r;
+            if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+            PyErr_Clear();
+        }
+        Py_ssize_t len = PyObject_Length(arr);
+        (void)len;
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+}
+
+// OP_ARRAY_SLICE: FDP selects typecode, creates two arrays, does slice read,
+// slice assignment, concatenation, repetition, comparison. Exercises the
+// array C module's sequence protocol paths.
+static void op_array_slice(FuzzedDataProvider &fdp) {
+    // 11 typecodes, indices 0..10 (the terminating NUL is never selected).
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    // Two separate arrays built from the same bytes, so they start equal.
+    PyRef a1(make_array(tc, data));
+    CHECK(a1);
+    PyRef a2(make_array(tc, data));
+    CHECK(a2);
+
+    // Slice read a1[0:N].
+    {
+        // N is min(len, 4).
+        Py_ssize_t len = PyObject_Length(a1);
+        Py_ssize_t n = len < 4 ? len : 4;
+        PyRef sl = PySequence_GetSlice(a1, 0, n);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Slice assignment a1[::2] = array of zeros.
+    {
+        Py_ssize_t len = PyObject_Length(a1);
+        if (len > 0) {
+            // Count elements in a1[::2].
+            Py_ssize_t slice_len = (len + 1) / 2;
+            // Build array of zeros with same typecode.
+            char tc_str[2] = {tc, '\0'};
+            PyRef zeros_arr = PyObject_CallFunction(array_array, "s", tc_str);
+            if (zeros_arr) {
+                // The replacement must hold exactly slice_len items, so its
+                // byte length relies on typecode_itemsize() being accurate.
+                std::string zero_data(slice_len * typecode_itemsize(tc), '\0');
+                PyRef pydata = PyBytes_FromStringAndSize(zero_data.data(),
+                                                         zero_data.size());
+                if (pydata) {
+                    PyRef fb = PyObject_CallMethod(zeros_arr, "frombytes", "O",
+                                                   (PyObject *)pydata);
+                    if (fb) {
+                        // slice(None, None, 2), i.e. [::2].
+                        PyRef step = PyLong_FromLong(2);
+                        PyRef sl = PySlice_New(NULL, NULL, step);
+                        if (sl) {
+                            int r = PyObject_SetItem(a1, sl, zeros_arr);
+                            (void)r;
+                        }
+                    }
+                }
+            }
+            // One cleanup for any failure in the nested steps above.
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+    }
+
+    // Concatenation a1 + a2.
+    {
+        PyRef r = PySequence_Concat(a1, a2);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Repetition a1 * min(len, 3).
+    {
+        // The repeat count is capped at 3 to bound the size of the result.
+        Py_ssize_t len = PyObject_Length(a1);
+        int rep = len < 3 ? (int)len : 3;
+        PyRef r = PySequence_Repeat(a1, rep);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Comparison a1 == a2.
+    {
+        PyRef r = PyObject_RichCompare(a1, a2, Py_EQ);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+}
+
+// OP_CTYPES: FDP selects sub-op for different ctypes from_buffer_copy calls.
+// Exercises the _ctypes C module's buffer copy and array creation paths.
+static void op_ctypes(FuzzedDataProvider &fdp) {
+  // Input layout: sub-operation selector, then up to 10000 payload bytes.
+  int variant = fdp.ConsumeIntegralInRange(0, 5);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+
+  // Results are discarded on purpose; whatever exception a case leaves
+  // behind is cleared once after the switch.
+  switch (variant) {
+    case 0: {
+      // c_char.from_buffer_copy(1 byte)
+      std::string buf = data.substr(0, 1);
+      if (buf.empty()) buf.push_back('\0');
+      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+      CHECK(pydata);
+      PyRef r = PyObject_CallMethod(ct_c_char, "from_buffer_copy", "O",
+                                    (PyObject *)pydata);
+      break;
+    }
+    case 1: {
+      // c_int.from_buffer_copy(4 bytes)
+      std::string buf = data.substr(0, 4);
+      buf.resize(4, '\0');
+      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+      CHECK(pydata);
+      PyRef r = PyObject_CallMethod(ct_c_int, "from_buffer_copy", "O",
+                                    (PyObject *)pydata);
+      break;
+    }
+    case 2: {
+      // c_double.from_buffer_copy(8 bytes)
+      std::string buf = data.substr(0, 8);
+      buf.resize(8, '\0');
+      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+      CHECK(pydata);
+      PyRef r = PyObject_CallMethod(ct_c_double, "from_buffer_copy", "O",
+                                    (PyObject *)pydata);
+      break;
+    }
+    case 3: {
+      // create_string_buffer(data[:256])
+      std::string buf = data.substr(0, 256);
+      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+      CHECK(pydata);
+      PyRef r = PyObject_CallFunction(ct_create_string_buffer, "O",
+                                      (PyObject *)pydata);
+      break;
+    }
+    case 4: {
+      // (c_char * N).from_buffer_copy(data)
+      if (data.empty()) break;
+      PyRef n = PyLong_FromLong(data.size());
+      CHECK(n);
+      PyRef arr_type = PyNumber_Multiply(ct_c_char, n);
+      CHECK(arr_type);
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      PyRef r = PyObject_CallMethod(arr_type, "from_buffer_copy", "O",
+                                    (PyObject *)pydata);
+      break;
+    }
+    case 5: {
+      // Structure.from_buffer_copy(padded data)
+      PyRef sz = PyObject_CallFunction(ct_sizeof, "O", ct_Structure_cls);
+      CHECK(sz);
+      long struct_sz = PyLong_AsLong(sz);
+      // PyLong_AsLong reports failure as -1. A negative value would become
+      // a huge size_t and make resize() throw std::length_error, killing
+      // the harness instead of the target.
+      if (struct_sz < 0) break;
+      std::string buf = data.substr(0, (size_t)struct_sz);
+      buf.resize((size_t)struct_sz, '\0');
+      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+      CHECK(pydata);
+      PyRef r = PyObject_CallMethod(ct_Structure_cls, "from_buffer_copy", "O",
+                                    (PyObject *)pydata);
+      break;
+    }
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_MMAP: Create anonymous mmap, write data, then FDP selects actions.
+// Exercises the mmap C module's core operations.
+//
+// Input layout: action selector, then up to 10000 payload bytes (a single
+// NUL byte is substituted when the payload is empty so the map is never
+// zero-sized). The map is closed explicitly on the normal path and on
+// write/seek failure; early CHECK returns rely on the PyRef going out of
+// scope.
+static void op_mmap(FuzzedDataProvider &fdp) {
+  int action = fdp.ConsumeIntegralInRange(0, 5);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  if (data.empty()) data.push_back('\0');
+
+  // mmap(-1, size)
+  Py_ssize_t map_size = data.size();
+  PyRef mm = PyObject_CallFunction(mmap_mmap, "in", -1, map_size);
+  CHECK(mm);
+
+  // Write data.
+  PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+  CHECK(pydata);
+  {
+    PyRef r = PyObject_CallMethod(mm, "write", "O", (PyObject *)pydata);
+    if (!r) { PyErr_Clear(); goto cleanup; }
+  }
+
+  // Seek to 0.
+  {
+    PyRef r = PyObject_CallMethod(mm, "seek", "i", 0);
+    if (!r) { PyErr_Clear(); goto cleanup; }
+  }
+
+  switch (action) {
+    case 0: {
+      // find + rfind, using the first (up to) 4 payload bytes as pattern.
+      size_t pat_len = data.size() < 4 ? data.size() : 4;
+      PyRef pat = PyBytes_FromStringAndSize(data.data(), pat_len);
+      CHECK(pat);
+      {
+        PyRef r = PyObject_CallMethod(mm, "find", "O", (PyObject *)pat);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(mm, "rfind", "O", (PyObject *)pat);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 1: {
+      // read + readline
+      {
+        long n = map_size < 4 ? map_size : 4;
+        PyRef r = PyObject_CallMethod(mm, "read", "l", n);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef sk = PyObject_CallMethod(mm, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r = PyObject_CallMethod(mm, "readline", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 2: {
+      // resize + move
+      long new_size = map_size * 2;
+      if (new_size < 1) new_size = 1;
+      {
+        PyRef r = PyObject_CallMethod(mm, "resize", "l", new_size);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // Move min(map_size, new_size) / 2 bytes from offset 1 to offset 0;
+        // maps shorter than 2 bytes get a zero-length move.
+        long src = map_size < 2 ? 0 : 1;
+        long count = map_size < 2 ? 0 : (map_size / 2 < new_size / 2 ?
+                                         map_size / 2 : new_size / 2);
+        PyRef r = PyObject_CallMethod(mm, "move", "lll",
+                                      (long)0, src, count);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 3: {
+      // getitem + setitem
+      {
+        PyRef idx = PyLong_FromLong(0);
+        CHECK(idx);
+        PyRef r = PyObject_GetItem(mm, idx);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        Py_ssize_t n = map_size < 4 ? map_size : 4;
+        // PySlice_New takes its own references to start/stop, so the
+        // temporaries must be owned here or they leak.
+        PyRef start = PyLong_FromLong(0);
+        CHECK(start);
+        PyRef stop = PyLong_FromLong(n);
+        CHECK(stop);
+        PyRef sl = PySlice_New(start, stop, NULL);
+        CHECK(sl);
+        PyRef r = PyObject_GetItem(mm, sl);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      if (data.size() > 0) {
+        PyRef idx = PyLong_FromLong(0);
+        CHECK(idx);
+        PyRef val = PyLong_FromLong((unsigned char)data[0]);
+        CHECK(val);
+        (void)PyObject_SetItem(mm, idx, val);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 4: {
+      // flush + size + tell
+      {
+        PyRef r = PyObject_CallMethod(mm, "flush", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(mm, "size", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(mm, "tell", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 5: {
+      // read all
+      {
+        PyRef r = PyObject_CallMethod(mm, "read", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+  }
+
+cleanup:
+  {
+    PyRef r = PyObject_CallMethod(mm, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+}
+
+// OP_LOCALE: FDP selects strxfrm or strcoll. Exercises the _locale C module.
+//
+// Input layout: string-encoding selector, strcoll/strxfrm flag, then up to
+// 10000 payload bytes converted to str by fuzz_bytes_to_str(). For strcoll
+// the string is split at its midpoint and the two halves are compared.
+static void op_locale(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  bool use_strcoll = fdp.ConsumeBool();
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  if (use_strcoll) {
+    Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+    Py_ssize_t mid = slen / 2;
+    PyRef half1 = PyUnicode_Substring(pystr, 0, mid);
+    CHECK(half1);
+    PyRef half2 = PyUnicode_Substring(pystr, mid, slen);
+    CHECK(half2);
+    PyRef r = PyObject_CallFunction(locale_strcoll, "OO",
+                                    (PyObject *)half1, (PyObject *)half2);
+  } else {
+    PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr);
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_DBM: Open a fresh file-backed dbm database under /tmp, write key-value
+// pairs, read them back through keys(), test membership, close, and remove
+// the database files again.
+// Exercises the _dbm C extension module's storage operations.
+static void op_dbm(FuzzedDataProvider &fdp) {
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+
+  // Use a unique filename based on the gc counter to avoid conflicts.
+  char dbpath[64];
+  snprintf(dbpath, sizeof(dbpath), "/tmp/_fuzz_dbm_%lu", gc_counter);
+
+  // Flag "n": always create a new, empty database.
+  PyRef db = PyObject_CallFunction(dbm_open, "ss", dbpath, "n");
+  CHECK(db);
+
+  // Write key-value pairs from fuzz data: each 4-byte group of the first 64
+  // bytes yields a 2-byte key followed by a 2-byte value.
+  size_t limit = data.size() < 64 ? data.size() : 64;
+  for (size_t i = 0; i + 3 < limit; i += 4) {
+    PyRef key = PyBytes_FromStringAndSize(data.data() + i, 2);
+    if (!key) { PyErr_Clear(); continue; }
+    PyRef val = PyBytes_FromStringAndSize(data.data() + i + 2, 2);
+    if (!val) { PyErr_Clear(); continue; }
+    (void)PyObject_SetItem(db, key, val);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  // Read keys.
+  {
+    PyRef keys = PyObject_CallMethod(db, "keys", NULL);
+    if (keys) {
+      PyRef it = PyObject_GetIter(keys);
+      if (it) {
+        PyObject *k;
+        while ((k = PyIter_Next(it)) != NULL) {
+          PyRef val = PyObject_GetItem(db, k);
+          Py_DECREF(k);
+          if (PyErr_Occurred()) PyErr_Clear();
+        }
+      }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  // Check membership.
+  {
+    PyRef test_key = PyBytes_FromStringAndSize("k", 1);
+    if (test_key) {
+      (void)PySequence_Contains(db, test_key);
+      if (PyErr_Occurred()) PyErr_Clear();
+    }
+  }
+
+  // Close.
+  {
+    PyRef r = PyObject_CallMethod(db, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  // Remove the on-disk files so that a long fuzzing run does not fill /tmp
+  // with one database per input. The actual file name depends on the dbm
+  // backend, so try the bare path and the common suffixes; most of these
+  // will not exist and remove() failing is expected and ignored.
+  static const char *const kDbSuffixes[] = {
+      "", ".db", ".dir", ".pag", ".dat", ".bak",
+  };
+  for (size_t i = 0; i < sizeof(kDbSuffixes) / sizeof(kDbSuffixes[0]); i++) {
+    char path[80];
+    snprintf(path, sizeof(path), "%s%s", dbpath, kDbSuffixes[i]);
+    (void)remove(path);
+  }
+}
+
+// Helper: Create a memory connection with PRAGMA max_page_count=100.
+// Returns a new reference, or NULL (with any error from the PRAGMA already
+// cleared) on failure.
+static PyObject *make_sqlite_conn() {
+  PyObject *conn = PyObject_CallFunction(sqlite3_connect, "s", ":memory:");
+  if (!conn) return NULL;
+  PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                "PRAGMA max_page_count=100");
+  if (!r) {
+    PyErr_Clear();
+    Py_DECREF(conn);
+    return NULL;
+  }
+  return conn;
+}
+
+// OP_SQLITE3_BASIC: connect(':memory:'), then FDP selects: execute fuzz SQL,
+// parameterized queries, executemany, executescript, complete_statement.
+// Exercises the _sqlite3 C module's basic execution paths.
+//
+// Input layout: string-encoding selector, variant selector, then up to
+// 10000 payload bytes used both raw and as a str (via fuzz_bytes_to_str).
+static void op_sqlite3_basic(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 4);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  PyRef conn(make_sqlite_conn());
+  CHECK(conn);
+
+  switch (variant) {
+    case 0: {
+      // Execute fuzz SQL.
+      PyRef r = PyObject_CallMethod(conn, "execute", "O", (PyObject *)pystr);
+      break;
+    }
+    case 1: {
+      // Parameterized INSERT followed by a parameterized SELECT ... LIKE.
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a TEXT, b BLOB)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef params = PyTuple_Pack(2, (PyObject *)pystr, (PyObject *)pydata);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?, ?)",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // SELECT, using the first 32 characters as the LIKE pattern.
+        PyRef sub = PyUnicode_Substring(pystr, 0, 32);
+        if (!sub) { PyErr_Clear(); break; }
+        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "SELECT * FROM t WHERE a LIKE ?",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 2: {
+      // executemany.
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(v INTEGER)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        // One single-column row per payload byte, at most 64 rows.
+        PyRef rows = PyList_New(0);
+        CHECK(rows);
+        size_t limit = data.size() < 64 ? data.size() : 64;
+        for (size_t i = 0; i < limit; i++) {
+          PyRef val = PyLong_FromLong((unsigned char)data[i]);
+          // PyTuple_Pack must not be handed a NULL item.
+          if (!val) { PyErr_Clear(); continue; }
+          PyRef tup = PyTuple_Pack(1, (PyObject *)val);
+          if (!tup || PyList_Append(rows, tup) < 0) PyErr_Clear();
+        }
+        PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)rows);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+            "SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t");
+        if (cur) {
+          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 3: {
+      // executescript; an empty fuzz string is replaced by "SELECT 1;".
+      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+      PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
+      if (!sql) {
+        PyRef def = PyUnicode_FromString("SELECT 1;");
+        PyRef r = PyObject_CallMethod(conn, "executescript", "O",
+                                      (PyObject *)def);
+      } else {
+        PyRef r = PyObject_CallMethod(conn, "executescript", "O", sql);
+      }
+      break;
+    }
+    case 4: {
+      // complete_statement; an empty fuzz string is replaced by "SELECT 1;".
+      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+      PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
+      if (!sql) {
+        PyRef def = PyUnicode_FromString("SELECT 1;");
+        PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O",
+                                        (PyObject *)def);
+      } else {
+        PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O", sql);
+      }
+      break;
+    }
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  PyRef cl = PyObject_CallMethod(conn, "close", NULL);
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_SQLITE3_ADVANCED: connect(':memory:'), then FDP selects: create_function,
+// create_aggregate, set_authorizer, create_collation, Row factory, blobopen,
+// register_adapter. Exercises the _sqlite3 C module's advanced features.
+//
+// Input layout: string-encoding selector, variant selector, then up to
+// 10000 payload bytes used both raw and as a str (via fuzz_bytes_to_str).
+static void op_sqlite3_advanced(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 6);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  PyRef conn(make_sqlite_conn());
+  CHECK(conn);
+
+  switch (variant) {
+    case 0: {
+      // create_function + SELECT.
+      PyRef globals = PyDict_New();
+      CHECK(globals);
+      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+      PyRef fn = PyRun_String("lambda x: x", Py_eval_input, globals, globals);
+      CHECK(fn);
+      {
+        PyRef r = PyObject_CallMethod(conn, "create_function", "siO",
+                                      "fuzzfn", 1, (PyObject *)fn);
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a TEXT)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef sub = PyUnicode_Substring(pystr, 0, 32);
+        if (!sub) { PyErr_Clear(); break; }
+        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                        "SELECT fuzzfn(a) FROM t");
+        if (cur) {
+          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 1: {
+      // create_aggregate + SELECT.
+      {
+        PyRef r = PyObject_CallMethod(conn, "create_aggregate", "siO",
+                                      "fuzzagg", 1,
+                                      sqlite3_Aggregate_cls);
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(v INTEGER)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        // One single-column row per payload byte, at most 32 rows.
+        PyRef rows = PyList_New(0);
+        CHECK(rows);
+        size_t limit = data.size() < 32 ? data.size() : 32;
+        for (size_t i = 0; i < limit; i++) {
+          PyRef val = PyLong_FromLong((unsigned char)data[i]);
+          // PyTuple_Pack must not be handed a NULL item.
+          if (!val) { PyErr_Clear(); continue; }
+          PyRef tup = PyTuple_Pack(1, (PyObject *)val);
+          if (!tup || PyList_Append(rows, tup) < 0) PyErr_Clear();
+        }
+        PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)rows);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                        "SELECT fuzzagg(v) FROM t");
+        if (cur) {
+          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 2: {
+      // set_authorizer + SELECT. The authorizer is a lambda that always
+      // returns the cached SQLITE_OK value.
+      PyRef globals = PyDict_New();
+      CHECK(globals);
+      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+      PyRef code_str = PyUnicode_FromFormat(
+          "lambda *a: %ld", sqlite3_SQLITE_OK_val);
+      CHECK(code_str);
+      // PyUnicode_AsUTF8 can fail; PyRun_String must not receive NULL.
+      const char *code = PyUnicode_AsUTF8(code_str);
+      if (!code) { PyErr_Clear(); break; }
+      PyRef auth_fn = PyRun_String(code, Py_eval_input, globals, globals);
+      CHECK(auth_fn);
+      {
+        PyRef r = PyObject_CallMethod(conn, "set_authorizer", "O",
+                                      (PyObject *)auth_fn);
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a TEXT)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef sub = PyUnicode_Substring(pystr, 0, 16);
+        if (!sub) { PyErr_Clear(); break; }
+        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                        "SELECT * FROM t");
+        if (cur) {
+          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 3: {
+      // create_collation + ORDER BY.
+      PyRef globals = PyDict_New();
+      CHECK(globals);
+      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+      PyRef coll_fn = PyRun_String(
+          "lambda a, b: (a > b) - (a < b)",
+          Py_eval_input, globals, globals);
+      CHECK(coll_fn);
+      {
+        PyRef r = PyObject_CallMethod(conn, "create_collation", "sO",
+                                      "fuzz", (PyObject *)coll_fn);
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a TEXT)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef params = PyTuple_Pack(1, (PyObject *)pystr);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+            "SELECT * FROM t ORDER BY a COLLATE fuzz");
+        if (cur) {
+          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 4: {
+      // Row factory + SELECT.
+      if (PyObject_SetAttrString(conn, "row_factory", sqlite3_Row) < 0) {
+        PyErr_Clear();
+        break;
+      }
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a TEXT, b INTEGER)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef sub = PyUnicode_Substring(pystr, 0, 8);
+        if (!sub) { PyErr_Clear(); break; }
+        // PyTuple_Pack does not steal references, so the integer has to be
+        // owned here instead of being created inline.
+        PyRef forty_two = PyLong_FromLong(42);
+        CHECK(forty_two);
+        PyRef params = PyTuple_Pack(2, (PyObject *)sub,
+                                    (PyObject *)forty_two);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?, ?)",
+                                      (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                        "SELECT * FROM t");
+        if (cur) {
+          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+          if (row && row.p != Py_None) {
+            // Column access by name; the key objects are owned locally
+            // because PyObject_GetItem only borrows its key argument.
+            PyRef key_a = PyUnicode_FromString("a");
+            if (key_a) {
+              PyRef a = PyObject_GetItem(row, key_a);
+            }
+            if (PyErr_Occurred()) PyErr_Clear();
+            PyRef key_b = PyUnicode_FromString("b");
+            if (key_b) {
+              PyRef b = PyObject_GetItem(row, key_b);
+            }
+            if (PyErr_Occurred()) PyErr_Clear();
+            PyRef keys = PyObject_CallMethod(row, "keys", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+          }
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 5: {
+      // blobopen + read/write. The stored blob and the data written back
+      // are both the first (up to) 64 payload bytes, so the write never
+      // exceeds the blob size.
+      {
+        PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                      "CREATE TABLE t(a BLOB)");
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        std::string blob_data = data.substr(0, 64);
+        PyRef pydata = PyBytes_FromStringAndSize(blob_data.data(),
+                                                 blob_data.size());
+        CHECK(pydata);
+        PyRef params = PyTuple_Pack(1, (PyObject *)pydata);
+        CHECK(params);
+        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                      "INSERT INTO t VALUES(?)",
+                                      (PyObject *)params);
+        if (!r) { PyErr_Clear(); break; }
+      }
+      {
+        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                        "SELECT rowid FROM t");
+        if (!cur) { PyErr_Clear(); break; }
+        PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+        if (!row || row.p == Py_None) { PyErr_Clear(); break; }
+        PyRef rid = PySequence_GetItem(row, 0);
+        CHECK(rid);
+        PyRef blob = PyObject_CallMethod(conn, "blobopen", "sssO",
+                                         "main", "t", "a", (PyObject *)rid);
+        if (!blob) { PyErr_Clear(); break; }
+        {
+          PyRef rd = PyObject_CallMethod(blob, "read", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+          PyRef sk = PyObject_CallMethod(blob, "seek", "i", 0);
+          if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+          size_t wr_len = data.size() < 64 ? data.size() : 64;
+          PyRef wr_data = PyBytes_FromStringAndSize(data.data(), wr_len);
+          if (wr_data) {
+            PyRef wr = PyObject_CallMethod(blob, "write", "O",
+                                           (PyObject *)wr_data);
+            if (PyErr_Occurred()) PyErr_Clear();
+          }
+        }
+        {
+          PyRef cl = PyObject_CallMethod(blob, "close", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 6: {
+      // register_adapter.
+      // The helper class is built and registered once, then reused: each
+      // register_adapter call on a brand-new class adds another entry to
+      // sqlite3's adapter registry, which would otherwise grow with every
+      // fuzz input.
+      static PyObject *adapt_cls_cached = NULL;
+      if (!adapt_cls_cached) {
+        PyRef globals = PyDict_New();
+        CHECK(globals);
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyRef r = PyRun_String(
+            "class _AdaptMe:\n"
+            " def __init__(self, v): self.v = v\n",
+            Py_file_input, globals, globals);
+        CHECK(r);
+        // PyDict_GetItemString returns a borrowed reference or NULL, so
+        // take ownership only when the lookup actually succeeded.
+        PyRef adapt_cls(PyDict_GetItemString(globals, "_AdaptMe"));
+        Py_XINCREF(adapt_cls.p);
+        CHECK(adapt_cls);
+
+        PyRef adapter_fn = PyRun_String(
+            "lambda a: str(a.v)", Py_eval_input, globals, globals);
+        CHECK(adapter_fn);
+
+        PyRef reg = PyObject_CallFunction(sqlite3_register_adapter, "OO",
+                                          (PyObject *)adapt_cls,
+                                          (PyObject *)adapter_fn);
+        if (!reg) { PyErr_Clear(); break; }
+
+        // Keep one extra reference for the lifetime of the process.
+        Py_INCREF(adapt_cls.p);
+        adapt_cls_cached = adapt_cls.p;
+      }
+      {
+        PyRef r2 = PyObject_CallMethod(conn, "execute", "s",
+                                       "CREATE TABLE t(a TEXT)");
+        if (!r2) { PyErr_Clear(); break; }
+      }
+      {
+        // Wrap the first 8 characters in the adapted class and bind the
+        // instance as a parameter, which routes through the adapter.
+        PyRef sub = PyUnicode_Substring(pystr, 0, 8);
+        if (!sub) { PyErr_Clear(); break; }
+        PyRef obj = PyObject_CallFunction(adapt_cls_cached, "O",
+                                          (PyObject *)sub);
+        if (!obj) { PyErr_Clear(); break; }
+        PyRef params = PyTuple_Pack(1, (PyObject *)obj);
+        CHECK(params);
+        PyRef r3 = PyObject_CallMethod(conn, "execute", "sO",
+                                       "INSERT INTO t VALUES(?)",
+                                       (PyObject *)params);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  PyRef cl = PyObject_CallMethod(conn, "close", NULL);
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// --------------------------------------------------------------------------- + +enum Op { + OP_ARRAY_FROMBYTES, + OP_ARRAY_METHODS, + OP_ARRAY_SLICE, + OP_CTYPES, + OP_MMAP, + OP_LOCALE, + OP_DBM, + OP_SQLITE3_BASIC, + OP_SQLITE3_ADVANCED, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_dataops(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_ARRAY_FROMBYTES: + op_array_frombytes(fdp); + break; + case OP_ARRAY_METHODS: + op_array_methods(fdp); + break; + case OP_ARRAY_SLICE: + op_array_slice(fdp); + break; + case OP_CTYPES: + op_ctypes(fdp); + break; + case OP_MMAP: + op_mmap(fdp); + break; + case OP_LOCALE: + op_locale(fdp); + break; + case OP_DBM: + op_dbm(fdp); + break; + case OP_SQLITE3_BASIC: + op_sqlite3_basic(fdp); + break; + case OP_SQLITE3_ADVANCED: + op_sqlite3_advanced(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_decode.cpp b/module-fuzzers/fuzz_decode.cpp new file mode 100644 index 0000000..234a265 --- /dev/null +++ b/module-fuzzers/fuzz_decode.cpp @@ -0,0 +1,1029 @@ +// fuzz_decode.cpp — Fuzzer for CPython's compression, encoding, serialization, +// and certificate-parsing C extension modules. 
+// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// zlib — compress/decompress (one-shot and streaming via +// compressobj/decompressobj with wbits, zdict, copy, +// flush), crc32, adler32 +// _bz2 — BZ2Decompressor.decompress(), bz2.compress() +// _lzma — LZMADecompressor.decompress() with FORMAT_AUTO/XZ/ALONE +// and 16 MB memlimit, lzma.compress() +// binascii — 6 decoders: a2b_base64 (with strict_mode), a2b_hex, +// a2b_uu, a2b_qp, a2b_ascii85, a2b_base85 +// 6 encoders: b2a_base64 (with newline), b2a_hex, +// b2a_uu (clamped to 45 bytes), b2a_qp, +// b2a_ascii85 (with foldspaces/wrapcol), b2a_base85 +// Checksums: crc32, crc_hqx +// Round-trip: hexlify -> unhexlify +// _pickle — pickle.dumps() with 8 container types (bytes, str, +// list, tuple, set, frozenset, bytearray, dict) across +// protocols 0-5 and fix_imports flag. +// pickle.loads() via RestrictedUnpickler (blocks +// find_class), PersistentUnpickler (handles PERSID/ +// BINPERSID), and RestrictedUnpickler with +// encoding='bytes'. +// Pickler chain: dump, clear_memo, dump, getvalue. +// Round-trip: dumps then loads. +// _ssl — ssl.DER_cert_to_PEM_cert(), then optionally +// SSLContext(PROTOCOL_TLS_CLIENT).load_verify_locations() +// _multibytecodec, +// _codecs_jp, _codecs_cn, _codecs_kr, +// _codecs_hk, _codecs_tw, _codecs_iso2022 +// — codecs.decode() with 17 codecs including shift_jis, +// euc-jp, gb2312, big5, gb18030, iso-2022-jp, etc. +// codecs.encode() with 19 codecs. +// Incremental decoders (shift_jis, gb18030, utf-16): +// split input at midpoint, decode halves, getstate, reset. +// Incremental encoders (shift_jis, utf-8): +// split string at midpoint, encode, reset, getstate. +// StreamReader: codecs.getreader('utf-8')(BytesIO).read() +// +// The first byte of fuzz input selects one of 20 operation types. 
Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (algorithm/codec selection, compression level, protocol number, +// container type, wbits value, boolean flags, data splits). +// +// All module functions, constructors, and format constants are imported once +// during init and cached as static PyObject* and long pointers. Two pickle +// Unpickler subclasses (RestrictedUnpickler, PersistentUnpickler) are defined +// via PyRun_String at init time and cached as class objects. +// +// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200 +// iterations. Max input size: 1 MB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. +// --------------------------------------------------------------------------- + +// zlib +static PyObject *zlib_compress, *zlib_decompress; +static PyObject *zlib_decompressobj, *zlib_compressobj; +static PyObject *zlib_crc32, *zlib_adler32; + +// bz2 +static PyObject *bz2_compress, *bz2_BZ2Decompressor; + +// lzma +static PyObject *lzma_LZMADecompressor, *lzma_compress; +static long lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val; + +// binascii +static PyObject *ba_a2b_base64, *ba_a2b_hex, *ba_a2b_uu, *ba_a2b_qp; +static PyObject *ba_a2b_ascii85, *ba_a2b_base85; +static PyObject *ba_b2a_base64, *ba_b2a_hex, *ba_b2a_uu, *ba_b2a_qp; +static PyObject *ba_b2a_ascii85, *ba_b2a_base85; +static PyObject *ba_crc32, *ba_crc_hqx, *ba_hexlify, *ba_unhexlify; + +// pickle +static PyObject *pickle_dumps, *pickle_loads; + +// codecs +static PyObject *codecs_decode, *codecs_encode; +static PyObject *codecs_getincrementaldecoder, *codecs_getincrementalencoder; +static PyObject *codecs_getreader; + +// ssl +static PyObject *ssl_DER_cert_to_PEM_cert, *ssl_SSLContext; +static long ssl_PROTOCOL_TLS_CLIENT_val; + +// io +static PyObject *bytesio_ctor; + +// pickle helper classes +static 
PyObject *RestrictedUnpickler_cls, *PersistentUnpickler_cls; + +static unsigned long gc_counter = 0; + +static int initialized = 0; + +static void init_decode(void) { + if (initialized) return; + + // zlib + zlib_compress = import_attr("zlib", "compress"); + zlib_decompress = import_attr("zlib", "decompress"); + zlib_decompressobj = import_attr("zlib", "decompressobj"); + zlib_compressobj = import_attr("zlib", "compressobj"); + zlib_crc32 = import_attr("zlib", "crc32"); + zlib_adler32 = import_attr("zlib", "adler32"); + + // bz2 + bz2_compress = import_attr("bz2", "compress"); + bz2_BZ2Decompressor = import_attr("bz2", "BZ2Decompressor"); + + // lzma + lzma_LZMADecompressor = import_attr("lzma", "LZMADecompressor"); + lzma_compress = import_attr("lzma", "compress"); + { + PyObject *v; + v = import_attr("lzma", "FORMAT_AUTO"); + lzma_FORMAT_AUTO_val = PyLong_AsLong(v); + Py_DECREF(v); + v = import_attr("lzma", "FORMAT_XZ"); + lzma_FORMAT_XZ_val = PyLong_AsLong(v); + Py_DECREF(v); + v = import_attr("lzma", "FORMAT_ALONE"); + lzma_FORMAT_ALONE_val = PyLong_AsLong(v); + Py_DECREF(v); + } + + // binascii + ba_a2b_base64 = import_attr("binascii", "a2b_base64"); + ba_a2b_hex = import_attr("binascii", "a2b_hex"); + ba_a2b_uu = import_attr("binascii", "a2b_uu"); + ba_a2b_qp = import_attr("binascii", "a2b_qp"); + ba_a2b_ascii85 = import_attr("binascii", "a2b_ascii85"); + ba_a2b_base85 = import_attr("binascii", "a2b_base85"); + ba_b2a_base64 = import_attr("binascii", "b2a_base64"); + ba_b2a_hex = import_attr("binascii", "b2a_hex"); + ba_b2a_uu = import_attr("binascii", "b2a_uu"); + ba_b2a_qp = import_attr("binascii", "b2a_qp"); + ba_b2a_ascii85 = import_attr("binascii", "b2a_ascii85"); + ba_b2a_base85 = import_attr("binascii", "b2a_base85"); + ba_crc32 = import_attr("binascii", "crc32"); + ba_crc_hqx = import_attr("binascii", "crc_hqx"); + ba_hexlify = import_attr("binascii", "hexlify"); + ba_unhexlify = import_attr("binascii", "unhexlify"); + + // pickle + pickle_dumps = 
import_attr("pickle", "dumps"); + pickle_loads = import_attr("pickle", "loads"); + + // codecs + codecs_decode = import_attr("codecs", "decode"); + codecs_encode = import_attr("codecs", "encode"); + codecs_getincrementaldecoder = import_attr("codecs", + "getincrementaldecoder"); + codecs_getincrementalencoder = import_attr("codecs", + "getincrementalencoder"); + codecs_getreader = import_attr("codecs", "getreader"); + + // ssl + ssl_DER_cert_to_PEM_cert = import_attr("ssl", "DER_cert_to_PEM_cert"); + ssl_SSLContext = import_attr("ssl", "SSLContext"); + { + PyObject *v = import_attr("ssl", "PROTOCOL_TLS_CLIENT"); + ssl_PROTOCOL_TLS_CLIENT_val = PyLong_AsLong(v); + Py_DECREF(v); + } + + // io + bytesio_ctor = import_attr("io", "BytesIO"); + + // Suppress warnings. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + + // Pickle helper classes via PyRun_String. + { + PyObject *globals = PyDict_New(); + PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); + PyObject *r = PyRun_String( + "import pickle, io\n" + "class RestrictedUnpickler(pickle.Unpickler):\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n" + "class PersistentUnpickler(pickle.Unpickler):\n" + " def persistent_load(self, pid): return pid\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n", + Py_file_input, globals, globals); + if (!r) { + PyErr_Print(); + abort(); + } + Py_DECREF(r); + RestrictedUnpickler_cls = + PyDict_GetItemString(globals, "RestrictedUnpickler"); + Py_INCREF(RestrictedUnpickler_cls); + PersistentUnpickler_cls = + PyDict_GetItemString(globals, "PersistentUnpickler"); + Py_INCREF(PersistentUnpickler_cls); + Py_DECREF(globals); + } + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// --------------------------------------------------------------------------- +// Operations — Compression (6 ops) +// 
--------------------------------------------------------------------------- + +// OP_ZLIB_DECOMPRESS: Create a zlib.decompressobj with fuzz-chosen wbits +// from {-15 (raw), 0 (auto), 15 (zlib), 31 (gzip), 47 (auto-detect)} and +// an optional zdict (first 32 bytes of data). Call .decompress(data, 1MB), +// optionally .flush(), and optionally .copy() + decompress on the copy. +// Exercises Decomp_Type, zlib_Decompress_decompress, copy, flush paths. +static void op_zlib_decompress(FuzzedDataProvider &fdp) { + static const int kWbitsChoices[] = {-15, 0, 15, 31, 47}; + int wbits = kWbitsChoices[fdp.ConsumeIntegralInRange(0, 4)]; + bool use_zdict = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef wbits_obj = PyLong_FromLong(wbits); + CHECK(wbits_obj); + PyRef args_dobj = PyTuple_Pack(1, (PyObject *)wbits_obj); + CHECK(args_dobj); + + if (use_zdict && data.size() > 32) { + PyRef zdict = PyBytes_FromStringAndSize(data.data(), 32); + CHECK(zdict); + PyDict_SetItemString(kwargs, "zdict", zdict); + data = data.substr(32); + } + + PyRef dobj = PyObject_Call(zlib_decompressobj, args_dobj, kwargs); + CHECK(dobj); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (!r) { + PyErr_Clear(); + return; + } + + if (fdp.remaining_bytes() > 0 || data.size() % 2 == 0) { + PyRef flush_r = PyObject_CallMethod(dobj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + + if (data.size() % 3 == 0) { + PyRef copy_obj = PyObject_CallMethod(dobj, "copy", NULL); + if (copy_obj) { + PyRef r2 = PyObject_CallMethod(copy_obj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } +} + +// OP_ZLIB_COMPRESS: Either one-shot zlib.compress(data, level) or streaming +// via compressobj(level).compress(data).flush(), with 
optional .copy().flush(). +// Level is fuzz-chosen 0-9. Exercises Compress_Type and zlib_compress_impl. +static void op_zlib_compress(FuzzedDataProvider &fdp) { + int level = fdp.ConsumeIntegralInRange(0, 9); + bool use_obj = fdp.ConsumeBool(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + if (use_obj) { + PyRef cobj = PyObject_CallFunction(zlib_compressobj, "i", level); + CHECK(cobj); + PyRef r1 = PyObject_CallMethod(cobj, "compress", "O", + (PyObject *)pydata); + CHECK(r1); + if (data.size() % 2 == 0) { + PyRef copy_obj = PyObject_CallMethod(cobj, "copy", NULL); + if (copy_obj) { + PyRef r2 = PyObject_CallMethod(copy_obj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + PyRef r3 = PyObject_CallMethod(cobj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyRef r = PyObject_CallFunction(zlib_compress, "Oi", + (PyObject *)pydata, level); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_ZLIB_CHECKSUM: Call either zlib.crc32(data) or zlib.adler32(data), +// fuzz-chosen. Exercises the checksum C implementations in zlibmodule.c. +static void op_zlib_checksum(FuzzedDataProvider &fdp) { + bool use_crc = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallFunction( + use_crc ? zlib_crc32 : zlib_adler32, "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_BZ2: Either bz2.compress(data) or BZ2Decompressor().decompress(data, 1MB), +// fuzz-chosen. Exercises the _bz2 C extension (BZ2Compressor/BZ2Decompressor). 
+static void op_bz2(FuzzedDataProvider &fdp) {
+    const bool compress_mode = fdp.ConsumeBool();
+    std::string payload = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    if (!compress_mode) {
+        // Feed the payload to a fresh BZ2Decompressor.
+        PyRef decompressor = PyObject_CallFunction(bz2_BZ2Decompressor, NULL);
+        CHECK(decompressor);
+        PyRef output = PyObject_CallMethod(decompressor, "decompress", "Oi",
+                                           (PyObject *)py_payload, 1048576);
+        if (PyErr_Occurred()) PyErr_Clear();
+        return;
+    }
+
+    // One-shot bz2.compress(payload).
+    PyRef compressed = PyObject_CallFunction(bz2_compress, "O",
+                                             (PyObject *)py_payload);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_LZMA_DECOMPRESS: Build LZMADecompressor(format=F, memlimit=16 MiB)
+// where F is fuzz-chosen from {FORMAT_AUTO, FORMAT_XZ, FORMAT_ALONE}, then
+// call .decompress(data, 1MB) on it. Targets the _lzma C extension
+// decompressor.
+static void op_lzma_decompress(FuzzedDataProvider &fdp) {
+    const long formats[] = {
+        lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val,
+    };
+    const long chosen_format = formats[fdp.ConsumeIntegralInRange(0, 2)];
+    std::string payload = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    // Constructor keyword arguments.
+    PyRef ctor_kwargs = PyDict_New();
+    CHECK(ctor_kwargs);
+    PyRef py_format = PyLong_FromLong(chosen_format);
+    CHECK(py_format);
+    PyDict_SetItemString(ctor_kwargs, "format", py_format);
+    PyRef py_memlimit = PyLong_FromLong(16 * 1024 * 1024);
+    CHECK(py_memlimit);
+    PyDict_SetItemString(ctor_kwargs, "memlimit", py_memlimit);
+
+    PyRef no_args = PyTuple_New(0);
+    CHECK(no_args);
+    PyRef decompressor =
+        PyObject_Call(lzma_LZMADecompressor, no_args, ctor_kwargs);
+    CHECK(decompressor);
+
+    PyRef output = PyObject_CallMethod(decompressor, "decompress", "Oi",
+                                       (PyObject *)py_payload, 1048576);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_LZMA_COMPRESS: One-shot lzma.compress(data). Exercises the _lzma
+// C extension compressor with default settings.
+static void op_lzma_compress(FuzzedDataProvider &fdp) {
+    const size_t take = std::min(fdp.remaining_bytes(), (size_t)10000);
+    std::string payload = fdp.ConsumeBytesAsString(take);
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+    PyRef compressed =
+        PyObject_CallFunction(lzma_compress, "O", (PyObject *)py_payload);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations — Binascii (4 ops)
+// ---------------------------------------------------------------------------
+
+// OP_BINASCII_DECODE: Run the payload through one fuzz-selected binascii
+// decoder: a2b_base64, a2b_hex, a2b_uu, a2b_qp, a2b_ascii85 or a2b_base85.
+// When a2b_base64 is selected and the strict flag is set, the call is made
+// with strict_mode=True.
+static void op_binascii_decode(FuzzedDataProvider &fdp) {
+    PyObject *const decoders[] = {
+        ba_a2b_base64, ba_a2b_hex, ba_a2b_uu,
+        ba_a2b_qp, ba_a2b_ascii85, ba_a2b_base85,
+    };
+    const int decoder_idx = fdp.ConsumeIntegralInRange(0, 5);
+    const bool strict_flag = fdp.ConsumeBool();
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    const bool strict_base64 = (decoder_idx == 0) && strict_flag;
+    if (!strict_base64) {
+        // Plain single-argument call.
+        PyRef decoded = PyObject_CallFunction(decoders[decoder_idx], "O",
+                                              (PyObject *)py_payload);
+        if (PyErr_Occurred()) PyErr_Clear();
+        return;
+    }
+
+    // a2b_base64(payload, strict_mode=True).
+    PyRef call_kwargs = PyDict_New();
+    CHECK(call_kwargs);
+    PyDict_SetItemString(call_kwargs, "strict_mode", Py_True);
+    PyRef call_args = PyTuple_Pack(1, (PyObject *)py_payload);
+    CHECK(call_args);
+    PyRef decoded = PyObject_Call(ba_a2b_base64, call_args, call_kwargs);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_BINASCII_ENCODE: Call one of 6 binary-to-text encoders from the
+// binascii C module: b2a_base64 (with optional newline kwarg), b2a_hex,
+// b2a_uu (input clamped to 45 bytes), b2a_qp, b2a_ascii85 (with optional
+// foldspaces and wrapcol=72), b2a_base85. Fuzz selects which encoder.
+static void op_binascii_encode(FuzzedDataProvider &fdp) {
+    int which = fdp.ConsumeIntegralInRange(0, 5);
+    // Draw the per-encoder flag BEFORE the payload. For inputs of up to
+    // 10000 bytes the payload read below takes every remaining byte, so a
+    // flag drawn afterwards would find the provider empty and always read
+    // false. The flag means newline= for b2a_base64 and foldspaces= for
+    // b2a_ascii85; the other encoders ignore it.
+    bool flag = fdp.ConsumeBool();
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    // b2a_uu requires <= 45 bytes.
+    if (which == 2 && data.size() > 45) data.resize(45);
+
+    PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+    CHECK(pydata);
+
+    PyObject *funcs[] = {
+        ba_b2a_base64, ba_b2a_hex, ba_b2a_uu,
+        ba_b2a_qp, ba_b2a_ascii85, ba_b2a_base85,
+    };
+
+    if (which == 0) {
+        // b2a_base64(data, newline=flag).
+        PyRef kwargs = PyDict_New();
+        CHECK(kwargs);
+        // Bail out if the insert fails rather than calling with an
+        // exception already pending.
+        CHECK(PyDict_SetItemString(kwargs, "newline",
+                                   flag ? Py_True : Py_False) == 0);
+        PyRef args = PyTuple_Pack(1, (PyObject *)pydata);
+        CHECK(args);
+        PyRef r = PyObject_Call(ba_b2a_base64, args, kwargs);
+    } else if (which == 4) {
+        // b2a_ascii85(data, wrapcol=72[, foldspaces=True]).
+        PyRef kwargs = PyDict_New();
+        CHECK(kwargs);
+        if (flag)
+            CHECK(PyDict_SetItemString(kwargs, "foldspaces", Py_True) == 0);
+        PyRef wrapcol = PyLong_FromLong(72);
+        CHECK(wrapcol);
+        CHECK(PyDict_SetItemString(kwargs, "wrapcol", wrapcol) == 0);
+        PyRef args = PyTuple_Pack(1, (PyObject *)pydata);
+        CHECK(args);
+        PyRef r = PyObject_Call(ba_b2a_ascii85, args, kwargs);
+    } else {
+        PyRef r = PyObject_CallFunction(funcs[which], "O",
+                                        (PyObject *)pydata);
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_BINASCII_CHECKSUM: Call either binascii.crc32(data) or
+// binascii.crc_hqx(data, 0), fuzz-chosen.
+static void op_binascii_checksum(FuzzedDataProvider &fdp) {
+    const bool want_crc32 = fdp.ConsumeBool();
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    // crc32 takes the data only; crc_hqx also takes an initial value of 0.
+    PyRef checksum =
+        want_crc32
+            ? PyObject_CallFunction(ba_crc32, "O", (PyObject *)py_payload)
+            : PyObject_CallFunction(ba_crc_hqx, "Oi",
+                                    (PyObject *)py_payload, 0);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_BINASCII_ROUNDTRIP: Feed the payload to binascii.hexlify(), then pass
+// the hex output to binascii.unhexlify(). Covers both directions of the hex
+// codec; the decoded result is not compared with the input.
+static void op_binascii_roundtrip(FuzzedDataProvider &fdp) {
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    PyRef hex_text = PyObject_CallFunction(ba_hexlify, "O",
+                                           (PyObject *)py_payload);
+    CHECK(hex_text);
+
+    PyRef decoded = PyObject_CallFunction(ba_unhexlify, "O",
+                                          (PyObject *)hex_text);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations — Pickle (4 ops)
+// ---------------------------------------------------------------------------
+
+// Build a Python container from fuzz bytes for pickle.dumps operations.
+// type selects: 0=bytes, 1=str, 2=list of ints, 3=tuple of ints,
+// 4=set of ints, 5=frozenset of ints, 6=bytearray, 7=dict(int->None).
+// Capped at 256 elements to keep serialization fast.
+// str_enc selects the byte-to-str decoding (see fuzz_bytes_to_str).
+static PyObject *build_pickle_container(int type, const uint8_t *buf,
+                                        size_t len, int str_enc) {
+    if (len > 256) len = 256;
+    const char *raw = (const char *)buf;
+    const Py_ssize_t count = (Py_ssize_t)len;
+
+    // New list with one int per input byte, or NULL if the list cannot be
+    // allocated. Shared by the list, set and frozenset variants.
+    auto ints_as_list = [&]() -> PyObject * {
+        PyObject *items = PyList_New(count);
+        if (!items) return NULL;
+        for (Py_ssize_t i = 0; i < count; i++)
+            PyList_SET_ITEM(items, i, PyLong_FromLong(buf[i]));
+        return items;
+    };
+
+    switch (type) {
+    case 1: {  // str, decoded per str_enc
+        std::string text(raw, len);
+        return fuzz_bytes_to_str(text, str_enc);
+    }
+    case 2:  // list of ints
+        return ints_as_list();
+    case 3: {  // tuple of ints
+        PyObject *result = PyTuple_New(count);
+        if (!result) return NULL;
+        for (Py_ssize_t i = 0; i < count; i++)
+            PyTuple_SET_ITEM(result, i, PyLong_FromLong(buf[i]));
+        return result;
+    }
+    case 4:    // set of ints
+    case 5: {  // frozenset of ints
+        PyObject *items = ints_as_list();
+        if (!items) return NULL;
+        PyObject *result =
+            (type == 4) ? PySet_New(items) : PyFrozenSet_New(items);
+        Py_DECREF(items);
+        return result;
+    }
+    case 6:  // bytearray
+        return PyByteArray_FromStringAndSize(raw, count);
+    case 7: {  // dict mapping each byte value to None
+        PyObject *mapping = PyDict_New();
+        if (!mapping) return NULL;
+        for (Py_ssize_t i = 0; i < count; i++) {
+            PyRef key = PyLong_FromLong(buf[i]);
+            if (key) PyDict_SetItem(mapping, key, Py_None);
+        }
+        return mapping;
+    }
+    case 0:  // raw bytes
+    default:
+        return PyBytes_FromStringAndSize(raw, count);
+    }
+}
+
+// OP_PICKLE_DUMPS: Build a fuzz-chosen container (see
+// build_pickle_container; the str variant is decoded through
+// fuzz_bytes_to_str) and serialize it with
+// pickle.dumps(obj, protocol=N, fix_imports=bool). N is fuzz-chosen in 0-5.
+// Per the original author, this is meant to reach opcodes such as MARK,
+// SHORT_BINBYTES, BINUNICODE, EMPTY_SET, ADDITEMS, FROZENSET, BYTEARRAY8
+// and SETITEMS.
+static void op_pickle_dumps(FuzzedDataProvider &fdp) {
+    const int kind = fdp.ConsumeIntegralInRange(0, 7);
+    const int proto_num = fdp.ConsumeIntegralInRange(0, 5);
+    const bool want_fix_imports = fdp.ConsumeBool();
+    const int decode_mode = fdp.ConsumeIntegralInRange(0, 3);
+    std::string raw = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef subject(build_pickle_container(
+        kind, (const uint8_t *)raw.data(), raw.size(), decode_mode));
+    CHECK(subject);
+
+    // Keyword arguments for dumps().
+    PyRef call_kwargs = PyDict_New();
+    CHECK(call_kwargs);
+    PyRef py_proto = PyLong_FromLong(proto_num);
+    CHECK(py_proto);
+    PyDict_SetItemString(call_kwargs, "protocol", py_proto);
+    PyObject *py_fix = want_fix_imports ? Py_True : Py_False;
+    PyDict_SetItemString(call_kwargs, "fix_imports", py_fix);
+
+    PyRef call_args = PyTuple_Pack(1, (PyObject *)subject);
+    CHECK(call_args);
+    PyRef pickled = PyObject_Call(pickle_dumps, call_args, call_kwargs);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_LOADS: Wrap fuzz data in BytesIO, then unpickle via one of 3
+// Unpickler subclass variants (fuzz-chosen):
+//   0 — RestrictedUnpickler: blocks find_class (safe against arbitrary code)
+//   1 — PersistentUnpickler: handles PERSID/BINPERSID opcodes, blocks find_class
+//   2 — RestrictedUnpickler with fix_imports=True, encoding='bytes' (Py2 compat)
+// Exercises the _pickle C extension's Unpickler_Type code paths.
+static void op_pickle_loads(FuzzedDataProvider &fdp) {
+    const int variant = fdp.ConsumeIntegralInRange(0, 2);
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+    PyRef stream = PyObject_CallFunction(bytesio_ctor, "O",
+                                         (PyObject *)py_payload);
+    CHECK(stream);
+
+    // Variant 1 uses the persistent-id aware class; 0 and 2 use the
+    // restricted one.
+    PyObject *unpickler_cls =
+        (variant == 1) ? PersistentUnpickler_cls : RestrictedUnpickler_cls;
+
+    // Only variant 2 passes keyword arguments.
+    PyRef ctor_kwargs;
+    if (variant == 2) {
+        ctor_kwargs = PyRef(PyDict_New());
+        CHECK(ctor_kwargs);
+        PyDict_SetItemString(ctor_kwargs, "fix_imports", Py_True);
+        PyRef encoding = PyUnicode_FromString("bytes");
+        CHECK(encoding);
+        PyDict_SetItemString(ctor_kwargs, "encoding", encoding);
+    }
+
+    PyRef ctor_args = PyTuple_Pack(1, (PyObject *)stream);
+    CHECK(ctor_args);
+    PyObject *kwargs_or_null =
+        (ctor_kwargs.p != nullptr) ? (PyObject *)ctor_kwargs : NULL;
+    PyRef unpickler = PyObject_Call(unpickler_cls, ctor_args, kwargs_or_null);
+    CHECK(unpickler);
+
+    PyRef loaded = PyObject_CallMethod(unpickler, "load", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_PICKLER: Construct pickle.Pickler(BytesIO(), N) with N
+// fuzz-chosen in 0-5, then run this sequence: pickler.dump(list_of_ints),
+// pickler.clear_memo(), pickler.dump(str), and finally getvalue() on the
+// BytesIO. The str is produced by fuzz_bytes_to_str with a fuzz-chosen
+// decoding. The sequence stops early if the first dump fails.
+static void op_pickle_pickler(FuzzedDataProvider &fdp) {
+    const int proto_num = fdp.ConsumeIntegralInRange(0, 5);
+    const int decode_mode = fdp.ConsumeIntegralInRange(0, 3);
+    std::string raw = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef sink = PyObject_CallFunction(bytesio_ctor, NULL);
+    CHECK(sink);
+
+    // pickle.Pickler is looked up the first time control reaches this point
+    // and reused afterwards.
+    static PyObject *const pickler_cls = import_attr("pickle", "Pickler");
+
+    PyRef pickler = PyObject_CallFunction(pickler_cls, "Oi",
+                                          (PyObject *)sink, proto_num);
+    CHECK(pickler);
+
+    // First dump: list of ints (container type 2).
+    PyRef int_list(build_pickle_container(
+        2, (const uint8_t *)raw.data(), raw.size(), decode_mode));
+    CHECK(int_list);
+    PyRef first_dump = PyObject_CallMethod(pickler, "dump", "O",
+                                           (PyObject *)int_list);
+    CHECK(first_dump);
+
+    PyRef memo_cleared = PyObject_CallMethod(pickler, "clear_memo", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    // Second dump: the same bytes decoded to str.
+    PyRef text(fuzz_bytes_to_str(raw, decode_mode));
+    CHECK(text);
+    PyRef second_dump = PyObject_CallMethod(pickler, "dump", "O",
+                                            (PyObject *)text);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef pickled_bytes = PyObject_CallMethod(sink, "getvalue", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_ROUNDTRIP: Build a fuzz-chosen container (str containers use
+// fuzz_bytes_to_str for fuzz-chosen byte-to-str decoding), pickle.dumps() it,
+// then pickle.loads() the result. Exercises both Pickler and Unpickler in
+// a single iteration; the loaded object is not compared with the original.
+static void op_pickle_roundtrip(FuzzedDataProvider &fdp) {
+    const int kind = fdp.ConsumeIntegralInRange(0, 7);
+    const int decode_mode = fdp.ConsumeIntegralInRange(0, 3);
+    std::string raw = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef subject(build_pickle_container(
+        kind, (const uint8_t *)raw.data(), raw.size(), decode_mode));
+    CHECK(subject);
+
+    // Serialize with default arguments; stop here if that fails.
+    PyRef pickled = PyObject_CallFunction(pickle_dumps, "O",
+                                          (PyObject *)subject);
+    CHECK(pickled);
+
+    PyRef restored = PyObject_CallFunction(pickle_loads, "O",
+                                           (PyObject *)pickled);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations — Codecs (5 ops)
+//
+// These exercise the _multibytecodec C engine and per-language codec
+// C modules (_codecs_jp, _codecs_cn, _codecs_kr, _codecs_hk, _codecs_tw,
+// _codecs_iso2022) as well as built-in codecs (utf-7/8/16/32, ascii,
+// latin-1, charmap, unicode_escape, raw_unicode_escape, cp1252).
+// ---------------------------------------------------------------------------
+
+// Decoder table for OP_CODECS_DECODE (17 entries): multibyte CJK codecs
+// followed by single-byte, escape and UTF codecs. The fuzz input picks an
+// index into this table.
+static const char *kCodecDecoders[] = {
+    "utf-7",
+    "shift_jis",
+    "euc-jp",
+    "gb2312",
+    "big5",
+    "iso-2022-jp",
+    "euc-kr",
+    "gb18030",
+    "big5hkscs",
+    "charmap",
+    "ascii",
+    "latin-1",
+    "cp1252",
+    "unicode_escape",
+    "raw_unicode_escape",
+    "utf-16",
+    "utf-32",
+};
+static constexpr int kNumCodecDecoders =
+    sizeof(kCodecDecoders) / sizeof(*kCodecDecoders);
+
+// Codec names for OP_CODECS_ENCODE: 19 encoders covering multibyte CJK
+// codecs plus Unicode, UTF, and single-byte encoders.
+static const char *kCodecEncoders[] = {
+    "shift_jis",
+    "euc-jp",
+    "gb2312",
+    "big5",
+    "iso-2022-jp",
+    "euc-kr",
+    "gb18030",
+    "big5hkscs",
+    "unicode_escape",
+    "raw_unicode_escape",
+    "utf-7",
+    "utf-8",
+    "utf-16",
+    "utf-16-le",
+    "utf-16-be",
+    "utf-32",
+    "latin-1",
+    "ascii",
+    "charmap",
+};
+static constexpr int kNumCodecEncoders =
+    sizeof(kCodecEncoders) / sizeof(*kCodecEncoders);
+
+// OP_CODECS_DECODE: codecs.decode(payload, codec, 'replace') where codec is
+// a fuzz-chosen entry of kCodecDecoders. Any exception the call raises is
+// cleared. Targets the multibyte and built-in codec decode paths.
+static void op_codecs_decode(FuzzedDataProvider &fdp) {
+    const int codec_idx =
+        fdp.ConsumeIntegralInRange(0, kNumCodecDecoders - 1);
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    const char *codec = kCodecDecoders[codec_idx];
+    PyRef decoded = PyObject_CallFunction(codecs_decode, "Oss",
+                                          (PyObject *)py_payload,
+                                          codec, "replace");
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_CODECS_ENCODE: Convert fuzz bytes to a Python str using a fuzz-chosen
+// decoding (Latin-1, UTF-8, UTF-16-LE, or UTF-32-LE — see fuzz_bytes_to_str),
+// then call codecs.encode(str, codec, 'replace') with a fuzz-chosen codec
+// from 19 encoders. Exercises the multibytecodec_encode and built-in codec
+// encode paths.
+static void op_codecs_encode(FuzzedDataProvider &fdp) {
+    const int codec_idx =
+        fdp.ConsumeIntegralInRange(0, kNumCodecEncoders - 1);
+    const int decode_mode = fdp.ConsumeIntegralInRange(0, 3);
+    std::string raw = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef text(fuzz_bytes_to_str(raw, decode_mode));
+    CHECK(text);
+
+    const char *codec = kCodecEncoders[codec_idx];
+    PyRef encoded = PyObject_CallFunction(codecs_encode, "Oss",
+                                          (PyObject *)text,
+                                          codec, "replace");
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_CODECS_INCREMENTAL_DECODE: Obtain an incremental decoder (error mode
+// 'replace') for a fuzz-chosen codec out of {shift_jis, gb18030, utf-16}.
+// The payload is cut at its midpoint and fed as decode(first_half) followed
+// by decode(second_half, 1), i.e. the second call is marked final. Then
+// getstate() and reset() are called. Stops early if the first decode fails.
+static void op_codecs_incremental_decode(FuzzedDataProvider &fdp) {
+    static const char *kCodecNames[] = {"shift_jis", "gb18030", "utf-16"};
+    const int codec_idx = fdp.ConsumeIntegralInRange(0, 2);
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    const size_t split_at = payload.size() / 2;
+
+    PyRef py_codec = PyUnicode_FromString(kCodecNames[codec_idx]);
+    CHECK(py_codec);
+    PyRef factory = PyObject_CallFunction(
+        codecs_getincrementaldecoder, "O", (PyObject *)py_codec);
+    CHECK(factory);
+
+    PyRef decoder = PyObject_CallFunction(factory, "s", "replace");
+    CHECK(decoder);
+
+    // First half, not final.
+    PyRef head = PyBytes_FromStringAndSize(payload.data(), split_at);
+    CHECK(head);
+    PyRef head_out = PyObject_CallMethod(decoder, "decode", "O",
+                                         (PyObject *)head);
+    CHECK(head_out);
+
+    // Second half, final.
+    PyRef tail = PyBytes_FromStringAndSize(payload.data() + split_at,
+                                           payload.size() - split_at);
+    CHECK(tail);
+    PyRef tail_out = PyObject_CallMethod(decoder, "decode", "Oi",
+                                         (PyObject *)tail, 1);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef decoder_state = PyObject_CallMethod(decoder, "getstate", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+    PyRef reset_result = PyObject_CallMethod(decoder, "reset", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_CODECS_INCREMENTAL_ENCODE: Obtain an incremental encoder (error mode
+// 'replace') for a fuzz-chosen codec out of {shift_jis, utf-8}. The fuzz
+// bytes are turned into a str by fuzz_bytes_to_str, the str is cut at its
+// midpoint, and the encoder sees: encode(first_half), reset(),
+// encode(second_half), getstate(). Stops early if the first encode fails.
+static void op_codecs_incremental_encode(FuzzedDataProvider &fdp) {
+    static const char *kCodecNames[] = {"shift_jis", "utf-8"};
+    const int codec_idx = fdp.ConsumeIntegralInRange(0, 1);
+    const int decode_mode = fdp.ConsumeIntegralInRange(0, 3);
+    std::string raw = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    PyRef text(fuzz_bytes_to_str(raw, decode_mode));
+    CHECK(text);
+    const Py_ssize_t text_len = PyUnicode_GET_LENGTH(text);
+    const Py_ssize_t split_at = text_len / 2;
+
+    PyRef py_codec = PyUnicode_FromString(kCodecNames[codec_idx]);
+    CHECK(py_codec);
+    PyRef factory = PyObject_CallFunction(
+        codecs_getincrementalencoder, "O", (PyObject *)py_codec);
+    CHECK(factory);
+
+    PyRef encoder = PyObject_CallFunction(factory, "s", "replace");
+    CHECK(encoder);
+
+    PyRef head = PyUnicode_Substring(text, 0, split_at);
+    CHECK(head);
+    PyRef head_out = PyObject_CallMethod(encoder, "encode", "O",
+                                         (PyObject *)head);
+    CHECK(head_out);
+
+    PyRef reset_result = PyObject_CallMethod(encoder, "reset", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef tail = PyUnicode_Substring(text, split_at, text_len);
+    CHECK(tail);
+    PyRef tail_out = PyObject_CallMethod(encoder, "encode", "O",
+                                         (PyObject *)tail);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef encoder_state = PyObject_CallMethod(encoder, "getstate", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_CODECS_STREAM: Put the payload in a BytesIO, build a reader with
+// codecs.getreader('utf-8')(bio, 'replace'), and call .read() on it.
+// Only the UTF-8 reader is used here.
+static void op_codecs_stream(FuzzedDataProvider &fdp) {
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+    PyRef stream = PyObject_CallFunction(bytesio_ctor, "O",
+                                         (PyObject *)py_payload);
+    CHECK(stream);
+
+    PyRef factory = PyObject_CallFunction(codecs_getreader, "s", "utf-8");
+    CHECK(factory);
+
+    PyRef reader = PyObject_CallFunction(factory, "Os",
+                                         (PyObject *)stream, "replace");
+    CHECK(reader);
+
+    PyRef text = PyObject_CallMethod(reader, "read", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations — SSL (1 op)
+// ---------------------------------------------------------------------------
+
+// OP_SSL_CERT: Pass the payload to ssl.DER_cert_to_PEM_cert(). If that
+// succeeds, create SSLContext(PROTOCOL_TLS_CLIENT) and call
+// .load_verify_locations(cadata=pem) on it, which hands the PEM text to the
+// certificate loading code behind the _ssl C module.
+static void op_ssl_cert(FuzzedDataProvider &fdp) {
+    std::string payload = fdp.ConsumeRemainingBytesAsString();
+    PyRef py_payload = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(py_payload);
+
+    PyRef pem_text = PyObject_CallFunction(ssl_DER_cert_to_PEM_cert, "O",
+                                           (PyObject *)py_payload);
+    CHECK(pem_text);
+
+    // Conversion worked: try loading the PEM into a client context.
+    PyRef context = PyObject_CallFunction(ssl_SSLContext, "l",
+                                          ssl_PROTOCOL_TLS_CLIENT_val);
+    CHECK(context);
+
+    PyRef call_kwargs = PyDict_New();
+    CHECK(call_kwargs);
+    PyDict_SetItemString(call_kwargs, "cadata", pem_text);
+    PyRef no_args = PyTuple_New(0);
+    CHECK(no_args);
+
+    PyRef loader = PyObject_GetAttrString(context, "load_verify_locations");
+    CHECK(loader);
+    PyRef load_result = PyObject_Call(loader, no_args, call_kwargs);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// ---------------------------------------------------------------------------
+
+// Operation selectors. The numeric value is what the fuzz input chooses, so
+// the order below is part of the input format.
+enum Op {
+    // Compression
+    OP_ZLIB_DECOMPRESS,
+    OP_ZLIB_COMPRESS,
+    OP_ZLIB_CHECKSUM,
+    OP_BZ2,
+    OP_LZMA_DECOMPRESS,
+    OP_LZMA_COMPRESS,
+    // Binascii
+    OP_BINASCII_DECODE,
+    OP_BINASCII_ENCODE,
+    OP_BINASCII_CHECKSUM,
+    OP_BINASCII_ROUNDTRIP,
+    // Pickle
+    OP_PICKLE_DUMPS,
+    OP_PICKLE_LOADS,
+    OP_PICKLE_PICKLER,
+    OP_PICKLE_ROUNDTRIP,
+    // Codecs
+    OP_CODECS_DECODE,
+    OP_CODECS_ENCODE,
+    OP_CODECS_INCREMENTAL_DECODE,
+    OP_CODECS_INCREMENTAL_ENCODE,
+    OP_CODECS_STREAM,
+    // SSL
+    OP_SSL_CERT,
+    NUM_OPS
+};
+
+// libFuzzer entry point: picks one operation from the input and runs it,
+// then triggers a GC pass every kGcInterval calls. Empty or oversized
+// inputs are ignored. Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    // One handler per Op value, listed in enum order.
+    static void (*const kHandlers[])(FuzzedDataProvider &) = {
+        op_zlib_decompress,
+        op_zlib_compress,
+        op_zlib_checksum,
+        op_bz2,
+        op_lzma_decompress,
+        op_lzma_compress,
+        op_binascii_decode,
+        op_binascii_encode,
+        op_binascii_checksum,
+        op_binascii_roundtrip,
+        op_pickle_dumps,
+        op_pickle_loads,
+        op_pickle_pickler,
+        op_pickle_roundtrip,
+        op_codecs_decode,
+        op_codecs_encode,
+        op_codecs_incremental_decode,
+        op_codecs_incremental_encode,
+        op_codecs_stream,
+        op_ssl_cert,
+    };
+    static_assert(sizeof(kHandlers) / sizeof(kHandlers[0]) == NUM_OPS,
+                  "one handler per Op value");
+
+    assert(Py_IsInitialized());
+    init_decode();
+    if (size < 1 || size > kMaxInputSize) return 0;
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    const int op = fdp.ConsumeIntegralInRange(0, NUM_OPS - 1);
+    kHandlers[op](fdp);
+
+    if (++gc_counter % kGcInterval == 0) PyGC_Collect();
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_helpers.h b/module-fuzzers/fuzz_helpers.h
new file mode 100644
index 0000000..c9c270a
--- /dev/null
+++ b/module-fuzzers/fuzz_helpers.h
@@ -0,0 +1,139 @@
+// fuzz_helpers.h — Shared infrastructure for CPython fuzz targets.
+//
+// Each CPython fuzzer binary (.cpp) includes this header. Since each binary
+// compiles exactly one .cpp file, all definitions here are safe (no ODR
+// issues across translation units).
+
+#ifndef FUZZ_HELPERS_H_
+#define FUZZ_HELPERS_H_
+
+#include
+#include
+#include
+#include
+#include
+
+// ---------------------------------------------------------------------------
+// LibFuzzer hooks
+// ---------------------------------------------------------------------------
+
+// Disable LeakSanitizer. CPython's pymalloc allocator uses custom freelists
+// and arenas that LSAN cannot track, causing thousands of false-positive leak
+// reports on every fuzzer iteration.
+extern "C" int __lsan_is_turned_off(void) { return 1; }
+
+// Initialize the CPython interpreter. Called once by libFuzzer before the
+// main fuzzing loop begins.
+extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+    PyConfig config;
+    PyConfig_InitPythonConfig(&config);
+    // Leave signal handling to the fuzzer driver rather than Python.
+    config.install_signal_handlers = 0;
+    config.int_max_str_digits = 8086;
+    PyStatus status;
+    // Use the first command-line argument as the interpreter's program name.
+    // `argc` is not used.
+    status =
+        PyConfig_SetBytesString(&config, &config.program_name, *argv[0]);
+    if (PyStatus_Exception(status)) goto fail;
+    status = Py_InitializeFromConfig(&config);
+    if (PyStatus_Exception(status)) goto fail;
+    PyConfig_Clear(&config);
+    return 0;
+fail:
+    // Release the config, then report the failing status.
+    // NOTE(review): no value is returned after this call, so it is presumably
+    // expected not to return — confirm against the C API docs.
+    PyConfig_Clear(&config);
+    Py_ExitStatusException(status);
+}
+
+// ---------------------------------------------------------------------------
+// RAII wrapper and macros
+// ---------------------------------------------------------------------------
+
+// RAII wrapper for PyObject*. Prevents reference leaks by calling Py_XDECREF
+// in the destructor. Non-copyable, move-enabled.
+//
+// The constructor stores the pointer without incrementing its reference
+// count, so a PyRef takes over the reference it is given. A null pointer is
+// allowed and is what a default-constructed PyRef holds.
+struct PyRef {
+    PyObject *p;  // Held pointer; may be null.
+    // Implicit on purpose: allows `PyRef x = SomeCallReturningPyObject();`.
+    PyRef(PyObject *o = nullptr) : p(o) {}
+    ~PyRef() { Py_XDECREF(p); }
+    // Implicit conversion so a PyRef can be passed where PyObject* is
+    // expected; the reference count is not changed by the conversion.
+    operator PyObject *() const { return p; }
+    // True when a non-null pointer is held.
+    explicit operator bool() const { return p != nullptr; }
+
+    PyRef(const PyRef &) = delete;
+    PyRef &operator=(const PyRef &) = delete;
+    // Moving transfers the pointer and leaves the source holding null.
+    PyRef(PyRef &&o) : p(o.p) { o.p = nullptr; }
+    PyRef &operator=(PyRef &&o) {
+        // Drop the reference currently held before adopting the new one.
+        Py_XDECREF(p);
+        p = o.p;
+        o.p = nullptr;
+        return *this;
+    }
+};
+
+// Bail out of the current operation if a Python call returns NULL/false.
+// Clears the pending Python exception so the next iteration starts clean.
+// Expands to a bare `return;`, so it can only be used inside functions that
+// return void.
+#define CHECK(x) \
+    do { \
+        if (!(x)) { \
+            PyErr_Clear(); \
+            return; \
+        } \
+    } while (0)
+
+// Expand a std::string into (const char*, Py_ssize_t) for "y#" format codes.
+// The expansion is two comma-separated arguments, so it also fits calls such
+// as PyBytes_FromStringAndSize(Y(s)). The argument is evaluated twice.
+#define Y(s) (s).data(), (Py_ssize_t)(s).size()
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Import mod.attr and return a new reference. Aborts on failure — called only
+// during one-time init, so missing modules indicate a broken build.
+static PyObject *import_attr(const char *mod, const char *attr) {
+    PyObject *module = PyImport_ImportModule(mod);
+    // Look the attribute up only if the import worked; either failure ends
+    // in the same print-and-abort path below.
+    PyObject *value = module ? PyObject_GetAttrString(module, attr) : NULL;
+    Py_XDECREF(module);
+    if (!value) {
+        PyErr_Print();
+        abort();
+    }
+    return value;
+}
+
+// Turn raw fuzz bytes into a Python str. The low two bits of `method` pick
+// the decoder:
+//   0 — Latin-1 (each byte becomes the codepoint of the same value)
+//   1 — UTF-8 with the 'replace' error handler
+//   2 — UTF-16, little-endian, with the 'replace' error handler
+//   3 — UTF-32, little-endian, with the 'replace' error handler
+// Returns whatever the chosen PyUnicode_Decode* call returns.
+static PyObject *fuzz_bytes_to_str(const std::string &data, int method) {
+    const int selector = method & 3;
+
+    if (selector == 1) return PyUnicode_DecodeUTF8(Y(data), "replace");
+
+    if (selector == 2 || selector == 3) {
+        int byte_order = -1;  // little-endian
+        if (selector == 2) {
+            return PyUnicode_DecodeUTF16(
+                data.data(), data.size(), "replace", &byte_order);
+        }
+        return PyUnicode_DecodeUTF32(
+            data.data(), data.size(), "replace", &byte_order);
+    }
+
+    // selector == 0
+    return PyUnicode_DecodeLatin1(Y(data), NULL);
+}
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+// PyGC_Collect() is run once every kGcInterval iterations.
+static constexpr int kGcInterval = 200;
+
+// Largest fuzz input accepted, in bytes (1 MB).
+static constexpr size_t kMaxInputSize = 1024 * 1024;
+
+#endif  // FUZZ_HELPERS_H_
diff --git a/module-fuzzers/fuzz_ioops.cpp b/module-fuzzers/fuzz_ioops.cpp
new file mode 100644
index 0000000..a8dbd49
--- /dev/null
+++ b/module-fuzzers/fuzz_ioops.cpp
@@ -0,0 +1,1015 @@
+// fuzz_ioops.cpp — Fuzzer for CPython's I/O C extension modules.
+// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// _io/bytesio.c — BytesIO: write, seek, read, readline, readlines, +// readinto, read1, readinto1, getvalue, getbuffer, +// truncate, tell, iteration, peek (via BufferedReader) +// _io/textio.c — TextIOWrapper: write, read, readline, readlines, +// flush, seek, reconfigure, detach, properties +// (readable/writable/seekable/encoding/buffer), +// IncrementalNewlineDecoder +// _io/bufferedio.c — BufferedReader, BufferedWriter, BufferedRandom, +// BufferedRWPair: read, write, peek, read1, readline, +// seek, tell, truncate, flush, detach, raw +// _io/fileio.c — FileIO: read, readall, readinto, write, flush, +// tell, seek, truncate, fileno, isatty, name, mode, +// closefd, readable, writable, seekable +// _io/_iomodule.c — io.open() with various modes (r, rb, w, wb) +// _io/stringio.c — StringIO: write, seek, readline, readlines, +// truncate, tell, close +// +// The first byte of fuzz input selects one of 7 operation types. Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (encoding, error handler, newline mode, I/O variant). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. Temporary directory and test file +// are created once at init. PyRef (RAII) prevents reference leaks. +// PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. 
+// ---------------------------------------------------------------------------
+
+// io classes
+static PyObject *io_BytesIO, *io_TextIOWrapper;
+static PyObject *io_BufferedReader, *io_BufferedWriter;
+static PyObject *io_BufferedRandom, *io_BufferedRWPair;
+static PyObject *io_FileIO, *io_open, *io_StringIO;
+static PyObject *io_IncrementalNewlineDecoder;
+
+// os
+static PyObject *os_path_join, *os_open_fn, *os_unlink;
+static PyObject *os_O_RDONLY;
+
+// Temp paths (as C strings).
+static char tmpdir[256];
+static char tmpfile_path[256];
+
+static unsigned long gc_counter = 0;
+
+static int initialized = 0;
+
+// One-time setup: caches the io/os callables above, creates a temporary
+// directory holding a 4096-byte file named 'test', records both paths in
+// tmpdir / tmpfile_path, and silences Python warnings. Any failure during
+// setup aborts the process. Later calls return immediately.
+static void init_ioops(void) {
+    if (initialized) return;
+
+    // io
+    io_BytesIO = import_attr("io", "BytesIO");
+    io_TextIOWrapper = import_attr("io", "TextIOWrapper");
+    io_BufferedReader = import_attr("io", "BufferedReader");
+    io_BufferedWriter = import_attr("io", "BufferedWriter");
+    io_BufferedRandom = import_attr("io", "BufferedRandom");
+    io_BufferedRWPair = import_attr("io", "BufferedRWPair");
+    io_FileIO = import_attr("io", "FileIO");
+    io_open = import_attr("io", "open");
+    io_StringIO = import_attr("io", "StringIO");
+    io_IncrementalNewlineDecoder = import_attr("io",
+                                               "IncrementalNewlineDecoder");
+
+    // os
+    os_path_join = import_attr("os.path", "join");
+    os_open_fn = import_attr("os", "open");
+    os_unlink = import_attr("os", "unlink");
+    os_O_RDONLY = import_attr("os", "O_RDONLY");
+
+    // Create temp directory and test file.
+    {
+        PyObject *globals = PyDict_New();
+        if (!globals ||
+            PyDict_SetItemString(globals, "__builtins__",
+                                 PyEval_GetBuiltins()) < 0) {
+            if (PyErr_Occurred()) PyErr_Print();
+            abort();
+        }
+        PyObject *r = PyRun_String(
+            "import tempfile, os\n"
+            "_tmpdir = tempfile.mkdtemp(prefix='fuzz_io_')\n"
+            "_tmpfile = os.path.join(_tmpdir, 'test')\n"
+            "with open(_tmpfile, 'wb') as f:\n"
+            " f.write(b'A' * 4096)\n",
+            Py_file_input, globals, globals);
+        if (!r) { PyErr_Print(); abort(); }
+        Py_DECREF(r);
+
+        // td/tf are not released here; `globals` keeps them alive until the
+        // strings have been copied out.
+        PyObject *td = PyDict_GetItemString(globals, "_tmpdir");
+        PyObject *tf = PyDict_GetItemString(globals, "_tmpfile");
+        const char *td_str = td ? PyUnicode_AsUTF8(td) : NULL;
+        const char *tf_str = tf ? PyUnicode_AsUTF8(tf) : NULL;
+        if (!td_str || !tf_str) {
+            // Never hand a NULL pointer to "%s" below.
+            if (PyErr_Occurred()) PyErr_Print();
+            abort();
+        }
+
+        // A silently truncated path would make every file operation target
+        // the wrong location, so treat truncation as a setup failure.
+        int td_len = snprintf(tmpdir, sizeof(tmpdir), "%s", td_str);
+        int tf_len = snprintf(tmpfile_path, sizeof(tmpfile_path), "%s",
+                              tf_str);
+        if (td_len < 0 || (size_t)td_len >= sizeof(tmpdir) ||
+            tf_len < 0 || (size_t)tf_len >= sizeof(tmpfile_path)) {
+            fprintf(stderr, "init_ioops: temporary path too long\n");
+            abort();
+        }
+        Py_DECREF(globals);
+    }
+
+    // Suppress warnings.
+    PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
+
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// Helper: Build a temp file path.
+static PyObject *make_tmppath(const char *name) {
+    return PyObject_CallFunction(os_path_join, "ss", tmpdir, name);
+}
+
+// Helper: Unlink a file (ignore errors).
+static void unlink_path(PyObject *path) {
+    PyRef r = PyObject_CallFunction(os_unlink, "O", path);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations (7 ops).
+// ---------------------------------------------------------------------------
+
+// OP_BYTESIO: BytesIO with fuzz data, then FDP selects actions.
+// Exercises _io/bytesio.c paths.
+static void op_bytesio(FuzzedDataProvider &fdp) {
+    int variant = fdp.ConsumeIntegralInRange(0, 6);
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+    CHECK(pydata);
+
+    switch (variant) {
+    case 0: {
+        // Basic: write/seek/read/getvalue/tell.
+ PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(bio); + PyRef wr = PyObject_CallMethod(bio, "write", "O", (PyObject *)pydata); + if (!wr) { PyErr_Clear(); break; } + PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef rd = PyObject_CallMethod(bio, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef tl = PyObject_CallMethod(bio, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 1: { + // readline, readlines, readinto. + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + { + PyRef r = PyObject_CallMethod(bio, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(bio, "readlines", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef buf = PyByteArray_FromStringAndSize(NULL, 32); + CHECK(buf); + PyRef r = PyObject_CallMethod(bio, "readinto", "O", (PyObject *)buf); + if (PyErr_Occurred()) PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 2: { + // truncate + write + getvalue. + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + long trunc_at = data.size() < 64 ? 
data.size() : 64; + PyRef tr = PyObject_CallMethod(bio, "truncate", "l", trunc_at); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef wr = PyObject_CallMethod(bio, "write", "y#", "XX", 2); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 3: { + // getbuffer (memoryview). + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef mv = PyObject_CallMethod(bio, "getbuffer", NULL); + if (mv) { + PyRef bytes_val = PyObject_CallFunction( + (PyObject *)&PyBytes_Type, "O", (PyObject *)mv); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef rel = PyObject_CallMethod(mv, "release", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 4: { + // read1, readinto1. + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + { + PyRef r = PyObject_CallMethod(bio, "read1", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef buf = PyByteArray_FromStringAndSize(NULL, 32); + CHECK(buf); + PyRef r = PyObject_CallMethod(bio, "readinto1", "O", (PyObject *)buf); + if (PyErr_Occurred()) PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 5: { + // Iteration. 
+ PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef it = PyObject_GetIter(bio); + if (it) { + PyObject *line; + while ((line = PyIter_Next(it)) != NULL) + Py_DECREF(line); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 6: { + // Peek via BufferedReader wrapping. + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef br = PyObject_CallFunction(io_BufferedReader, "O", + (PyObject *)bio); + CHECK(br); + { + PyRef r = PyObject_CallMethod(br, "peek", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "read", "i", 8); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "read1", "i", 8); + if (PyErr_Occurred()) PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(br, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + } +} + +// OP_TEXTIOWRAPPER: FDP selects encoding, errors, newline. Create BytesIO + +// TextIOWrapper. Exercises _io/textio.c paths. +static void op_textiowrapper(FuzzedDataProvider &fdp) { + static const char *kEncodings[] = {"utf-8", "latin-1", "ascii", "utf-16"}; + static const char *kErrors[] = { + "strict", "replace", "xmlcharrefreplace", "backslashreplace", + }; + // NULL = universal newline mode. 
+ static const char *kNewlines[] = {NULL, "\n", "\r\n", ""}; + + int enc_idx = fdp.ConsumeIntegralInRange(0, 3); + int err_idx = fdp.ConsumeIntegralInRange(0, 3); + int nl_idx = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 4); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + const char *encoding = kEncodings[enc_idx]; + const char *errors = kErrors[err_idx]; + const char *newline = kNewlines[nl_idx]; + + switch (variant) { + case 0: { + // Write mode: write string, flush, seek, read. + PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString(encoding); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef err_str = PyUnicode_FromString(errors); + CHECK(err_str); + PyDict_SetItemString(kwargs, "errors", err_str); + if (newline) { + PyRef nl_str = PyUnicode_FromString(newline); + CHECK(nl_str); + PyDict_SetItemString(kwargs, "newline", nl_str); + } + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + { + PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 1: { + // Read mode: BytesIO(data) + TextIOWrapper, read/readline/readlines. 
+ PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString(encoding); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef err_str = PyUnicode_FromString("replace"); + CHECK(err_str); + PyDict_SetItemString(kwargs, "errors", err_str); + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + + // readline x3. + for (int i = 0; i < 3; i++) { + PyRef r = PyObject_CallMethod(tw, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(tw, "readlines", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(tw, "read", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r2 = PyObject_CallMethod(tw, "read", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r3 = PyObject_CallMethod(tw, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 2: { + // Reconfigure: write, reconfigure, write more, read. 
+ PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString("utf-8"); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + { + PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef kw = PyDict_New(); + CHECK(kw); + PyRef nl = PyUnicode_FromString("\n"); + CHECK(nl); + PyDict_SetItemString(kw, "newline", nl); + PyDict_SetItemString(kw, "line_buffering", Py_True); + PyRef empty = PyTuple_New(0); + CHECK(empty); + PyRef r = PyObject_Call( + PyObject_GetAttrString(tw, "reconfigure"), empty, kw); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sub = PyUnicode_Substring(pystr, 0, 32); + if (sub) { + PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)sub); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + { + PyRef r = PyObject_CallMethod(tw, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef rd = PyObject_CallMethod(tw, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 3: { + // Detach. 
+ PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString("utf-8"); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef err_str = PyUnicode_FromString("replace"); + CHECK(err_str); + PyDict_SetItemString(kwargs, "errors", err_str); + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + { + PyRef r = PyObject_CallMethod(tw, "read", "i", 4); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef raw = PyObject_CallMethod(tw, "detach", NULL); + if (raw) { + PyRef rd = PyObject_CallMethod(raw, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(raw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + break; + } + case 4: { + // Properties: writable/readable/seekable/encoding/buffer. 
+ PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString("utf-8"); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef err_str = PyUnicode_FromString("replace"); + CHECK(err_str); + PyDict_SetItemString(kwargs, "errors", err_str); + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + { + PyRef r = PyObject_CallMethod(tw, "writable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "readable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "seekable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(tw, "encoding"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(tw, "buffer"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + } +} + +// OP_BUFFERED_IO: FDP selects variant — BufferedReader, BufferedWriter, +// BufferedRandom, BufferedRWPair. Exercises _io/bufferedio.c paths. +static void op_buffered_io(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + switch (variant) { + case 0: { + // BufferedReader wrapping BytesIO. 
+ PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef br = PyObject_CallFunction(io_BufferedReader, "O", + (PyObject *)bio); + CHECK(br); + { + PyRef r = PyObject_CallMethod(br, "read", "i", 64); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "peek", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "read1", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(br, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef raw = PyObject_GetAttrString(br, "raw"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef det = PyObject_CallMethod(br, "detach", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 1: { + // BufferedWriter wrapping BytesIO. + PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(bio); + PyRef bw = PyObject_CallFunction(io_BufferedWriter, "O", + (PyObject *)bio); + CHECK(bw); + { + PyRef r = PyObject_CallMethod(bw, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(bw, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(bw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 2: { + // BufferedRandom wrapping BytesIO. 
+ PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef brnd = PyObject_CallFunction(io_BufferedRandom, "O", + (PyObject *)bio); + CHECK(brnd); + { + PyRef r = PyObject_CallMethod(brnd, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(brnd, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(brnd, "read", "i", 64); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(brnd, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + long trunc_at = data.size() < 64 ? data.size() : 64; + PyRef r = PyObject_CallMethod(brnd, "truncate", "l", trunc_at); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(brnd, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 3: { + // BufferedRWPair. + PyRef r_bio = PyObject_CallFunction(io_BytesIO, "O", + (PyObject *)pydata); + CHECK(r_bio); + PyRef w_bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(w_bio); + PyRef rw = PyObject_CallFunction(io_BufferedRWPair, "OO", + (PyObject *)r_bio, (PyObject *)w_bio); + CHECK(rw); + { + PyRef r = PyObject_CallMethod(rw, "read", "i", 32); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(rw, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(rw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + } +} + +// OP_FILEIO: FDP selects mode — read, write, read+write. +// Exercises _io/fileio.c paths. +static void op_fileio(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 2); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + switch (variant) { + case 0: { + // Read from tmpfile. 
+ PyRef fio = PyObject_CallFunction(io_FileIO, "ss", + tmpfile_path, "r"); + CHECK(fio); + { + PyRef r = PyObject_CallMethod(fio, "read", "i", 64); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "readall", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(fio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef buf = PyByteArray_FromStringAndSize(NULL, 64); + CHECK(buf); + PyRef r = PyObject_CallMethod(fio, "readinto", "O", (PyObject *)buf); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "fileno", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "isatty", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(fio, "name"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(fio, "mode"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "readable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "seekable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 1: { + // Write to temp file. 
+ PyRef path(make_tmppath("fio_w")); + CHECK(path); + PyRef fio = PyObject_CallFunction(io_FileIO, "Os", + (PyObject *)path, "w"); + CHECK(fio); + { + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + unlink_path(path); + break; + } + case 2: { + // Read+write mode. + PyRef path(make_tmppath("fio_rw")); + CHECK(path); + PyRef fio = PyObject_CallFunction(io_FileIO, "Os", + (PyObject *)path, "w+b"); + CHECK(fio); + { + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "read", "i", 32); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "truncate", "i", 128); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + unlink_path(path); + break; + } + } +} + +// OP_IO_OPEN: FDP selects mode — read text, read binary, write text, write binary. +// Exercises _io/_iomodule.c open() paths. +static void op_io_open(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 3); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + switch (variant) { + case 0: { + // Read text from tmpfile. 
+ PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "r"); + CHECK(f); + PyRef r = PyObject_CallMethod(f, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(f, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 1: { + // Read binary from tmpfile. + PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "rb"); + CHECK(f); + PyRef r = PyObject_CallMethod(f, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(f, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 2: { + // Write text. + PyRef path(make_tmppath("ioopen_w")); + CHECK(path); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + PyRef f = PyObject_CallFunction(io_open, "Os", (PyObject *)path, "w"); + CHECK(f); + PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(f, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + unlink_path(path); + break; + } + case 3: { + // Write binary then read back. + PyRef path(make_tmppath("ioopen_wb")); + CHECK(path); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + { + PyRef f = PyObject_CallFunction(io_open, "Os", + (PyObject *)path, "wb"); + CHECK(f); + PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(f, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + // Read back. 
+ { + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef err = PyUnicode_FromString("replace"); + CHECK(err); + PyDict_SetItemString(kwargs, "errors", err); + PyRef args = PyTuple_Pack(1, (PyObject *)path); + CHECK(args); + PyRef f = PyObject_Call(io_open, args, kwargs); + if (f) { + PyRef r = PyObject_CallMethod(f, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(f, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + unlink_path(path); + break; + } + } +} + +// OP_NEWLINE_DECODER: FDP selects translate mode. Create +// IncrementalNewlineDecoder, split str at midpoint, decode halves. +// Exercises _io/textio.c's newline decoder paths. +static void op_newline_decoder(FuzzedDataProvider &fdp) { + bool translate = fdp.ConsumeBool(); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef dec = PyObject_CallFunction(io_IncrementalNewlineDecoder, "OO", + Py_None, + translate ? 
Py_True : Py_False); + CHECK(dec); + + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + Py_ssize_t mid = slen / 2; + + PyRef half1 = PyUnicode_Substring(pystr, 0, mid); + CHECK(half1); + PyRef half2 = PyUnicode_Substring(pystr, mid, slen); + CHECK(half2); + + { + PyRef r = PyObject_CallMethod(dec, "decode", "O", (PyObject *)half1); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(dec, "decode", "Oi", + (PyObject *)half2, 1); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef state = PyObject_CallMethod(dec, "getstate", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + + PyRef reset = PyObject_CallMethod(dec, "reset", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + + if (state && state.p != Py_None) { + PyRef ss = PyObject_CallMethod(dec, "setstate", "O", + (PyObject *)state); + if (PyErr_Occurred()) PyErr_Clear(); + } + } +} + +// OP_STRINGIO: StringIO write/readline/readlines/truncate/close. +// Exercises _io/stringio.c paths. +static void op_stringio(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 1); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef sio = PyObject_CallFunction(io_StringIO, NULL); + CHECK(sio); + + { + PyRef r = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr); + if (!r) { PyErr_Clear(); return; } + } + { + PyRef r = PyObject_CallMethod(sio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + + if (variant == 0) { + // readline x3 + readlines. 
+ for (int i = 0; i < 3; i++) { + PyRef r = PyObject_CallMethod(sio, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(sio, "readlines", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + } else { + // readlines on initial content. + PyRef r = PyObject_CallMethod(sio, "readlines", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + + { + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + long trunc_at = slen < 64 ? slen : 64; + PyRef r = PyObject_CallMethod(sio, "truncate", "l", trunc_at); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(sio, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(sio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// --------------------------------------------------------------------------- +// Dispatch. +// --------------------------------------------------------------------------- + +enum Op { + OP_BYTESIO, + OP_TEXTIOWRAPPER, + OP_BUFFERED_IO, + OP_FILEIO, + OP_IO_OPEN, + OP_NEWLINE_DECODER, + OP_STRINGIO, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_ioops(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_BYTESIO: + op_bytesio(fdp); + break; + case OP_TEXTIOWRAPPER: + op_textiowrapper(fdp); + break; + case OP_BUFFERED_IO: + op_buffered_io(fdp); + break; + case OP_FILEIO: + op_fileio(fdp); + break; + case OP_IO_OPEN: + op_io_open(fdp); + break; + case OP_NEWLINE_DECODER: + op_newline_decoder(fdp); + break; + case OP_STRINGIO: + op_stringio(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_parsers.cpp 
b/module-fuzzers/fuzz_parsers.cpp new file mode 100644 index 0000000..2e83878 --- /dev/null +++ b/module-fuzzers/fuzz_parsers.cpp @@ -0,0 +1,744 @@ +// fuzz_parsers.cpp — Fuzzer for CPython's parser and text processing C +// extension modules. +// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// _json — json.dumps(), JSONEncoder with various options +// _csv — csv.Sniffer.sniff/has_header, csv.writer, +// csv.DictWriter with quoting modes +// pyexpat — ParserCreate with encodings/namespace_separator, +// Parse, ParseFile, handlers, GetInputContext +// time — strftime with fuzz format, strptime with fuzz input +// _operator — lt, gt, eq, ne, contains, countOf, indexOf, +// length_hint, concat, getitem, methodcaller +// _locale — strxfrm, strcoll, getlocale +// _opcode (via dis) — dis.dis() on compiled code +// +// The first byte of fuzz input selects one of 7 operation types. Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (encoder options, parser encoding, operator selection). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. PyRef (RAII) prevents reference +// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. 
+// ---------------------------------------------------------------------------
+
+// json
+static PyObject *json_dumps, *json_JSONEncoder;
+
+// csv
+static PyObject *csv_Sniffer, *csv_writer, *csv_DictWriter;
+static PyObject *csv_QUOTE_ALL, *csv_QUOTE_NONNUMERIC;
+
+// expat
+static PyObject *expat_ParserCreate;
+
+// io
+static PyObject *bytesio_ctor, *stringio_ctor;
+
+// time
+static PyObject *time_strftime, *time_strptime, *time_localtime;
+
+// operator
+static PyObject *op_lt, *op_gt, *op_eq, *op_ne;
+static PyObject *op_contains, *op_countOf, *op_indexOf, *op_length_hint;
+static PyObject *op_concat, *op_getitem, *op_methodcaller;
+
+// dis
+static PyObject *dis_dis;
+
+// locale
+static PyObject *locale_strxfrm, *locale_strcoll, *locale_getlocale;
+
+// Handler lambdas (for expat).
+static PyObject *noop_handler, *noop_handler_noargs;
+
+static unsigned long gc_counter = 0;
+
+static int initialized = 0;
+
+// One-time setup. Fills the cached module-attribute globals through
+// import_attr and defines two no-op Python lambdas (one accepting any
+// positional arguments, one accepting none) that are kept as
+// noop_handler / noop_handler_noargs. Aborts when the lambdas cannot be
+// created. Calls after the first successful one return immediately.
+static void init_parsers(void) {
+  if (initialized) return;
+
+  // json
+  json_dumps = import_attr("json", "dumps");
+  json_JSONEncoder = import_attr("json", "JSONEncoder");
+
+  // csv
+  csv_Sniffer = import_attr("csv", "Sniffer");
+  csv_writer = import_attr("csv", "writer");
+  csv_DictWriter = import_attr("csv", "DictWriter");
+  csv_QUOTE_ALL = import_attr("csv", "QUOTE_ALL");
+  csv_QUOTE_NONNUMERIC = import_attr("csv", "QUOTE_NONNUMERIC");
+
+  // expat
+  expat_ParserCreate = import_attr("xml.parsers.expat", "ParserCreate");
+
+  // io
+  bytesio_ctor = import_attr("io", "BytesIO");
+  stringio_ctor = import_attr("io", "StringIO");
+
+  // time
+  time_strftime = import_attr("time", "strftime");
+  time_strptime = import_attr("time", "strptime");
+  time_localtime = import_attr("time", "localtime");
+
+  // operator
+  op_lt = import_attr("operator", "lt");
+  op_gt = import_attr("operator", "gt");
+  op_eq = import_attr("operator", "eq");
+  op_ne = import_attr("operator", "ne");
+  op_contains = import_attr("operator", "contains");
+  op_countOf = import_attr("operator", "countOf");
+  op_indexOf = import_attr("operator", "indexOf");
+  op_length_hint = import_attr("operator", "length_hint");
+  op_concat = import_attr("operator", "concat");
+  op_getitem = import_attr("operator", "getitem");
+  op_methodcaller = import_attr("operator", "methodcaller");
+
+  // dis
+  dis_dis = import_attr("dis", "dis");
+
+  // locale
+  locale_strxfrm = import_attr("locale", "strxfrm");
+  locale_strcoll = import_attr("locale", "strcoll");
+  locale_getlocale = import_attr("locale", "getlocale");
+
+  // No-op handler lambdas for expat.
+  {
+    PyObject *globals = PyDict_New();
+    if (!globals) {
+      if (PyErr_Occurred()) PyErr_Print();
+      abort();
+    }
+    PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+    PyObject *r = PyRun_String(
+        "_noop = lambda *a: None\n"
+        "_noop_noargs = lambda: None\n",
+        Py_file_input, globals, globals);
+    if (!r) { PyErr_Print(); abort(); }
+    Py_DECREF(r);
+    // PyDict_GetItemString hands back borrowed references (NULL when the key
+    // is missing). Verify both before taking our own references, which keep
+    // the lambdas alive after `globals` is released.
+    noop_handler = PyDict_GetItemString(globals, "_noop");
+    noop_handler_noargs = PyDict_GetItemString(globals, "_noop_noargs");
+    if (!noop_handler || !noop_handler_noargs) {
+      fprintf(stderr, "init_parsers: no-op handler lambdas are missing\n");
+      abort();
+    }
+    Py_INCREF(noop_handler);
+    Py_INCREF(noop_handler_noargs);
+    Py_DECREF(globals);
+  }
+
+  // Suppress warnings.
+  PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
+
+  assert(!PyErr_Occurred());
+  initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Operations (7 ops).
+// ---------------------------------------------------------------------------
+
+// OP_JSON_ENCODE: FDP selects variant — json.dumps(str), json.dumps({str:str}),
+// json.dumps([str,str]), or JSONEncoder with options. Exercises the _json
+// C acceleration module's encoding paths.
+static void op_json_encode(FuzzedDataProvider &fdp) {
+  // Consumption order (encoding, variant, payload) defines the input layout.
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 5);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+  PyObject *text = (PyObject *)pystr;
+
+  if (variant == 0) {
+    // json.dumps(str)
+    PyRef out = PyObject_CallFunction(json_dumps, "O", text);
+  } else if (variant == 1) {
+    // json.dumps({str: str})
+    PyRef mapping = PyDict_New();
+    CHECK(mapping);
+    PyDict_SetItem(mapping, text, text);
+    PyRef out = PyObject_CallFunction(json_dumps, "O", (PyObject *)mapping);
+  } else if (variant == 2) {
+    // json.dumps([str, str])
+    PyRef pair = PyList_New(2);
+    CHECK(pair);
+    for (Py_ssize_t slot = 0; slot < 2; ++slot) {
+      Py_INCREF(text);  // one reference per list slot filled below
+      PyList_SET_ITEM((PyObject *)pair, slot, text);
+    }
+    PyRef out = PyObject_CallFunction(json_dumps, "O", (PyObject *)pair);
+  } else if (variant == 3 || variant == 4) {
+    // JSONEncoder(ensure_ascii=False).encode(str)   (variant 3)
+    // JSONEncoder(ensure_ascii=True).encode(str)    (variant 4)
+    PyRef opts = PyDict_New();
+    CHECK(opts);
+    PyDict_SetItemString(opts, "ensure_ascii",
+                         variant == 4 ? Py_True : Py_False);
+    PyRef no_args = PyTuple_New(0);
+    CHECK(no_args);
+    PyRef encoder = PyObject_Call(json_JSONEncoder, no_args, opts);
+    CHECK(encoder);
+    PyRef out = PyObject_CallMethod(encoder, "encode", "O", text);
+  } else {
+    // JSONEncoder(sort_keys=True, indent=2, ensure_ascii=False).encode({s:s})
+    PyRef opts = PyDict_New();
+    CHECK(opts);
+    PyDict_SetItemString(opts, "sort_keys", Py_True);
+    PyRef two = PyLong_FromLong(2);
+    CHECK(two);
+    PyDict_SetItemString(opts, "indent", two);
+    PyDict_SetItemString(opts, "ensure_ascii", Py_False);
+    PyRef no_args = PyTuple_New(0);
+    CHECK(no_args);
+    PyRef encoder = PyObject_Call(json_JSONEncoder, no_args, opts);
+    CHECK(encoder);
+    PyRef mapping = PyDict_New();
+    CHECK(mapping);
+    PyDict_SetItem(mapping, text, text);
+    PyRef out = PyObject_CallMethod(encoder, "encode", "O",
+                                    (PyObject *)mapping);
+  }
+
+  // Encoding failures are expected for arbitrary input; discard them.
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_CSV_SNIFFER: Call csv.Sniffer().sniff() and .has_header() on fuzz str.
+// Exercises the _csv C module's dialect detection paths.
+static void op_csv_sniffer(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)1024));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  PyRef sniffer = PyObject_CallFunction(csv_Sniffer, NULL);
+  CHECK(sniffer);
+
+  // Both probes run independently; an exception from one does not skip the
+  // other.
+  static const char *const kProbes[] = {"sniff", "has_header"};
+  for (const char *probe : kProbes) {
+    PyRef out = PyObject_CallMethod(sniffer, probe, "O", (PyObject *)pystr);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+}
+
+// OP_CSV_WRITER: FDP selects variant — basic writerow, writerows, tab-delimited,
+// DictWriter, QUOTE_ALL, QUOTE_NONNUMERIC. All write to StringIO.
+// Exercises the _csv C module's writer paths.
+static void op_csv_writer(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 5);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  // Output sink shared by every variant.
+  PyRef sink = PyObject_CallFunction(stringio_ctor, NULL);
+  CHECK(sink);
+
+  // Whitespace-split the text to obtain the cells of a row.
+  PyRef cells = PyObject_CallMethod(pystr, "split", NULL);
+  if (!cells) {
+    PyErr_Clear();
+    return;
+  }
+
+  // A row needs at least one cell.
+  if (PyList_Size(cells) == 0) {
+    PyRef blank = PyUnicode_FromString("");
+    PyList_Append(cells, blank);
+  }
+
+  if (variant == 0) {
+    // Basic writerow.
+    PyRef writer = PyObject_CallFunction(csv_writer, "O", (PyObject *)sink);
+    CHECK(writer);
+    PyRef out = PyObject_CallMethod(writer, "writerow", "O",
+                                    (PyObject *)cells);
+  } else if (variant == 1) {
+    // writerows: one row per input line, at most 20 lines.
+    PyRef lines = PyObject_CallMethod(pystr, "splitlines", NULL);
+    if (lines) {
+      PyRef rows = PyList_New(0);
+      CHECK(rows);
+      Py_ssize_t nlines = PyList_Size(lines);
+      Py_ssize_t limit = nlines < 20 ? nlines : 20;
+      for (Py_ssize_t idx = 0; idx < limit; ++idx) {
+        PyObject *line = PyList_GetItem(lines, idx);
+        PyRef row = PyObject_CallMethod(line, "split", NULL);
+        if (!row) {
+          PyErr_Clear();
+          continue;
+        }
+        if (PyList_Size(row) == 0) {
+          PyRef blank = PyUnicode_FromString("");
+          PyList_Append(row, blank);
+        }
+        PyList_Append(rows, row);
+      }
+      PyRef writer = PyObject_CallFunction(csv_writer, "O", (PyObject *)sink);
+      CHECK(writer);
+      PyRef out = PyObject_CallMethod(writer, "writerows", "O",
+                                      (PyObject *)rows);
+    } else {
+      PyErr_Clear();
+    }
+  } else if (variant == 2) {
+    // Tab-delimited.
+    PyRef opts = PyDict_New();
+    CHECK(opts);
+    PyRef tab = PyUnicode_FromString("\t");
+    CHECK(tab);
+    PyDict_SetItemString(opts, "delimiter", tab);
+    PyRef positional = PyTuple_Pack(1, (PyObject *)sink);
+    CHECK(positional);
+    PyRef writer = PyObject_Call(csv_writer, positional, opts);
+    CHECK(writer);
+    PyRef out = PyObject_CallMethod(writer, "writerow", "O",
+                                    (PyObject *)cells);
+  } else if (variant == 3) {
+    // DictWriter: the first (up to 8) cells become the field names.
+    Py_ssize_t ncells = PyList_Size(cells);
+    Py_ssize_t nfields = ncells < 8 ? ncells : 8;
+    if (nfields == 0) nfields = 1;
+    PyRef fieldnames = PyList_GetSlice(cells, 0, nfields);
+    CHECK(fieldnames);
+    PyRef opts = PyDict_New();
+    CHECK(opts);
+    PyDict_SetItemString(opts, "fieldnames", fieldnames);
+    PyRef positional = PyTuple_Pack(1, (PyObject *)sink);
+    CHECK(positional);
+    PyRef dict_writer = PyObject_Call(csv_DictWriter, positional, opts);
+    CHECK(dict_writer);
+    PyRef header = PyObject_CallMethod(dict_writer, "writeheader", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+    // Every field maps to the full fuzz string.
+    PyRef record = PyDict_New();
+    CHECK(record);
+    for (Py_ssize_t idx = 0; idx < nfields; ++idx) {
+      PyObject *field = PyList_GetItem(fieldnames, idx);
+      PyDict_SetItem(record, field, pystr);
+    }
+    PyRef out = PyObject_CallMethod(dict_writer, "writerow", "O",
+                                    (PyObject *)record);
+  } else {
+    // QUOTE_ALL (variant 4) or QUOTE_NONNUMERIC (variant 5).
+    PyObject *quoting = variant == 4 ? csv_QUOTE_ALL : csv_QUOTE_NONNUMERIC;
+    PyRef opts = PyDict_New();
+    CHECK(opts);
+    PyDict_SetItemString(opts, "quoting", quoting);
+    PyRef positional = PyTuple_Pack(1, (PyObject *)sink);
+    CHECK(positional);
+    PyRef writer = PyObject_Call(csv_writer, positional, opts);
+    CHECK(writer);
+    PyRef out = PyObject_CallMethod(writer, "writerow", "O",
+                                    (PyObject *)cells);
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  // Read back whatever was written.
+  PyRef written = PyObject_CallMethod(sink, "getvalue", NULL);
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_EXPAT: FDP selects encoding and handler setup, then Parse or ParseFile.
+// Exercises the pyexpat C module's XML parsing paths.
+static void op_expat(FuzzedDataProvider &fdp) {
+  // A NULL entry means "call ParserCreate() without an encoding argument".
+  static const char *kEncodings[] = {"utf-8", "iso-8859-1", NULL};
+  int enc_idx = fdp.ConsumeIntegralInRange(0, 2);
+  bool use_ns = fdp.ConsumeBool();
+  bool set_handlers = fdp.ConsumeBool();
+  bool use_parsefile = fdp.ConsumeBool();
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)4096));
+
+  // Create parser. Namespace mode takes precedence over the encoding choice.
+  PyRef parser;
+  if (use_ns) {
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef ns_sep = PyUnicode_FromString(" ");
+    CHECK(ns_sep);
+    PyDict_SetItemString(kwargs, "namespace_separator", ns_sep);
+    PyRef empty = PyTuple_New(0);
+    CHECK(empty);
+    parser = PyRef(PyObject_Call(expat_ParserCreate, empty, kwargs));
+  } else if (kEncodings[enc_idx]) {
+    parser = PyRef(PyObject_CallFunction(expat_ParserCreate, "s",
+                                         kEncodings[enc_idx]));
+  } else {
+    parser = PyRef(PyObject_CallFunction(expat_ParserCreate, NULL));
+  }
+  CHECK(parser);
+
+  // Set handlers. A failed assignment is cleared immediately so that no
+  // exception is pending when the parser is invoked below.
+  if (set_handlers) {
+    const struct {
+      const char *attr;
+      PyObject *callback;
+    } kHandlers[] = {
+        {"StartElementHandler", noop_handler},
+        {"EndElementHandler", noop_handler},
+        {"CharacterDataHandler", noop_handler},
+        {"ProcessingInstructionHandler", noop_handler},
+        {"CommentHandler", noop_handler},
+        {"StartCdataSectionHandler", noop_handler_noargs},
+        {"EndCdataSectionHandler", noop_handler_noargs},
+    };
+    for (const auto &h : kHandlers) {
+      if (PyObject_SetAttrString(parser, h.attr, h.callback) < 0)
+        PyErr_Clear();
+    }
+  }
+
+  PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+  CHECK(pydata);
+
+  if (use_parsefile) {
+    // ParseFile(BytesIO(data)).
+    PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", (PyObject *)pydata);
+    CHECK(bio);
+    PyRef r = PyObject_CallMethod(parser, "ParseFile", "O", (PyObject *)bio);
+  } else {
+    // Parse(data, True).
+    PyRef r = PyObject_CallMethod(parser, "Parse", "Oi",
+                                  (PyObject *)pydata, 1);
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  // GetInputContext is exercised only for even-length payloads.
+  if (data.size() % 2 == 0) {
+    PyRef ctx = PyObject_CallMethod(parser, "GetInputContext", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+}
+
+// OP_TIME: FDP selects variant — strftime with fuzz format, strptime with
+// fuzz input, or strptime with fuzz format. Exercises the time C module.
+static void op_time(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 2);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  switch (variant) {
+    case 0: {
+      // time.strftime(str, time.localtime())
+      PyRef lt = PyObject_CallFunction(time_localtime, NULL);
+      CHECK(lt);
+      // An empty fuzz string falls back to the fixed format "%Y".
+      if (PyUnicode_GET_LENGTH(pystr) > 0) {
+        PyRef r = PyObject_CallFunction(time_strftime, "OO",
+                                        (PyObject *)pystr, (PyObject *)lt);
+      } else {
+        PyRef r = PyObject_CallFunction(time_strftime, "sO",
+                                        "%Y", (PyObject *)lt);
+      }
+      break;
+    }
+    case 1: {
+      // time.strptime(str, '%Y-%m-%d %H:%M:%S')
+      PyRef r = PyObject_CallFunction(time_strptime, "Os",
+                                      (PyObject *)pystr,
+                                      "%Y-%m-%d %H:%M:%S");
+      break;
+    }
+    case 2: {
+      // time.strptime('2024-01-15 12:30:00', str)
+      // An empty fuzz string falls back to a fixed, matching format.
+      if (PyUnicode_GET_LENGTH(pystr) > 0) {
+        PyRef r = PyObject_CallFunction(time_strptime, "sO",
+                                        "2024-01-15 12:30:00",
+                                        (PyObject *)pystr);
+      } else {
+        PyRef r = PyObject_CallFunction(time_strptime, "ss",
+                                        "2024-01-15 12:30:00",
+                                        "%Y-%m-%d %H:%M:%S");
+      }
+      break;
+    }
+  }
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_OPERATOR: FDP selects operator variant — comparisons, sequence ops,
+// concat, getitem, methodcaller. Exercises the _operator C module.
+static void op_operator(FuzzedDataProvider &fdp) {
+  int variant = fdp.ConsumeIntegralInRange(0, 5);
+  std::string data = fdp.ConsumeRemainingBytesAsString();
+
+  PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+  CHECK(pydata);
+
+  // Call fn(a, b), drop the result, and swallow any exception it raised.
+  auto call2 = [](PyObject *fn, PyObject *a, PyObject *b) {
+    PyRef ignored = PyObject_CallFunction(fn, "OO", a, b);
+    if (PyErr_Occurred()) PyErr_Clear();
+  };
+
+  switch (variant) {
+    case 0: {
+      // Comparisons: lt/gt against the reversed bytes, eq against itself,
+      // ne against empty bytes.
+      std::string rdata(data.rbegin(), data.rend());
+      PyRef pyrev = PyBytes_FromStringAndSize(Y(rdata));
+      CHECK(pyrev);
+      call2(op_lt, pydata, pyrev);
+      call2(op_gt, pydata, pyrev);
+      call2(op_eq, pydata, pydata);
+      PyRef empty = PyBytes_FromStringAndSize("", 0);
+      CHECK(empty);
+      call2(op_ne, pydata, empty);
+      break;
+    }
+    case 1: {
+      // Sequence ops: contains, countOf, indexOf, length_hint
+      if (data.empty()) break;
+      PyRef byte_val = PyLong_FromLong((unsigned char)data[0]);
+      CHECK(byte_val);
+      call2(op_contains, pydata, byte_val);
+      call2(op_countOf, pydata, byte_val);
+      call2(op_indexOf, pydata, byte_val);
+      {
+        PyRef r = PyObject_CallFunction(op_length_hint, "O",
+                                        (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 2: {
+      // concat(data, data)
+      call2(op_concat, pydata, pydata);
+      break;
+    }
+    case 3: {
+      // getitem(data, 0) + getitem(data, slice(0, len/2))
+      if (data.empty()) break;
+      PyRef zero = PyLong_FromLong(0);
+      CHECK(zero);
+      call2(op_getitem, pydata, zero);
+      PyRef half = PyLong_FromLong(data.size() / 2);
+      CHECK(half);
+      PyRef sl = PySlice_New(zero, half, NULL);
+      CHECK(sl);
+      call2(op_getitem, pydata, sl);
+      break;
+    }
+    case 4: {
+      // methodcaller('upper')(str) + methodcaller('encode', 'utf-8')(str)
+      // The low two bits of the first byte pick the str decoding mode.
+      int str_enc = data.size() > 0 ? data[0] & 3 : 0;
+      PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+      CHECK(pystr);
+      {
+        PyRef mc = PyObject_CallFunction(op_methodcaller, "s", "upper");
+        CHECK(mc);
+        PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef mc = PyObject_CallFunction(op_methodcaller, "ss",
+                                         "encode", "utf-8");
+        CHECK(mc);
+        PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 5: {
+      // contains(data, data[:1]) — bytes-in-bytes membership.
+      if (data.empty()) break;
+      PyRef first = PyBytes_FromStringAndSize(data.data(), 1);
+      CHECK(first);
+      call2(op_contains, pydata, first);
+      break;
+    }
+  }
+}
+
+// OP_DIS_LOCALE: FDP selects — dis.dis(compile(str)), locale.strxfrm(str),
+// locale.strcoll(str), or locale.getlocale(). Exercises _opcode via dis
+// and _locale C module.
+static void op_dis_locale(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 3);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  switch (variant) {
+    case 0: {
+      // dis.dis(compile(str, '', 'exec')); an empty string compiles "pass".
+      // PyUnicode_AsUTF8() returns NULL (with an exception set) when the
+      // string cannot be encoded; that must not reach Py_CompileString().
+      // The returned buffer is owned by pystr, which outlives this use.
+      const char *source = PyUnicode_GET_LENGTH(pystr) > 0
+                               ? PyUnicode_AsUTF8(pystr)
+                               : "pass";
+      if (!source) {
+        PyErr_Clear();
+        break;
+      }
+      PyRef code = PyRef(Py_CompileString(source, "", Py_file_input));
+      if (!code) {
+        PyErr_Clear();
+        break;
+      }
+      // Capture dis output to StringIO.
+      PyRef sio = PyObject_CallFunction(stringio_ctor, NULL);
+      CHECK(sio);
+      PyRef kwargs = PyDict_New();
+      CHECK(kwargs);
+      PyDict_SetItemString(kwargs, "file", sio);
+      PyRef args = PyTuple_Pack(1, (PyObject *)code);
+      CHECK(args);
+      PyRef r = PyObject_Call(dis_dis, args, kwargs);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 1: {
+      // locale.strxfrm(str)
+      PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 2: {
+      // locale.strcoll(str[:mid], str[mid:])
+      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+      Py_ssize_t mid = slen / 2;
+      PyRef half1 = PyUnicode_Substring(pystr, 0, mid);
+      CHECK(half1);
+      PyRef half2 = PyUnicode_Substring(pystr, mid, slen);
+      CHECK(half2);
+      PyRef r = PyObject_CallFunction(locale_strcoll, "OO",
+                                      (PyObject *)half1, (PyObject *)half2);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 3: {
+      // locale.getlocale()
+      PyRef r = PyObject_CallFunction(locale_getlocale, NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// --------------------------------------------------------------------------- + +enum Op { + OP_JSON_ENCODE, + OP_CSV_SNIFFER, + OP_CSV_WRITER, + OP_EXPAT, + OP_TIME, + OP_OPERATOR, + OP_DIS_LOCALE, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_parsers(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_JSON_ENCODE: + op_json_encode(fdp); + break; + case OP_CSV_SNIFFER: + op_csv_sniffer(fdp); + break; + case OP_CSV_WRITER: + op_csv_writer(fdp); + break; + case OP_EXPAT: + op_expat(fdp); + break; + case OP_TIME: + op_time(fdp); + break; + case OP_OPERATOR: + op_operator(fdp); + break; + case OP_DIS_LOCALE: + op_dis_locale(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_textops.cpp b/module-fuzzers/fuzz_textops.cpp new file mode 100644 index 0000000..03551e2 --- /dev/null +++ b/module-fuzzers/fuzz_textops.cpp @@ -0,0 +1,467 @@ +// fuzz_textops.cpp — Fuzzer for CPython's text-processing C extension modules. +// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// datetime — date/time/datetime.fromisoformat(), strptime(), +// strftime(), format() +// collections — _count_elements (Counter internals) +// unicodedata — category, bidirectional, numeric, decimal, +// combining, east_asian_width, mirrored, name, +// decomposition, normalize, is_normalized, lookup, +// ucd_3_2_0.normalize +// _io (StringIO) — write, seek, read, getvalue, readline, readlines, +// truncate, iteration +// +// The first byte of fuzz input selects one of 6 operation types. 
Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (format selection, character range, normalization form). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. PyRef (RAII) prevents reference +// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. +// --------------------------------------------------------------------------- + +// datetime +static PyObject *dt_date, *dt_time, *dt_datetime; + +// collections +static PyObject *collections_count_elements; + +// unicodedata +static PyObject *ud_category, *ud_bidirectional, *ud_normalize, *ud_numeric; +static PyObject *ud_lookup, *ud_name, *ud_decomposition, *ud_is_normalized; +static PyObject *ud_east_asian_width, *ud_mirrored, *ud_decimal, *ud_combining; +static PyObject *ud_ucd_3_2_0; + +// io +static PyObject *stringio_ctor; + +// struct +static PyObject *struct_unpack; + +static unsigned long gc_counter = 0; + +static int initialized = 0; + +static void init_textops(void) { + if (initialized) return; + + // datetime + dt_date = import_attr("datetime", "date"); + dt_time = import_attr("datetime", "time"); + dt_datetime = import_attr("datetime", "datetime"); + + // collections + collections_count_elements = import_attr("collections", "_count_elements"); + + // unicodedata + ud_category = import_attr("unicodedata", "category"); + ud_bidirectional = import_attr("unicodedata", "bidirectional"); + ud_normalize = import_attr("unicodedata", "normalize"); + ud_numeric = import_attr("unicodedata", "numeric"); + ud_lookup = import_attr("unicodedata", "lookup"); + ud_name = import_attr("unicodedata", "name"); + ud_decomposition = import_attr("unicodedata", "decomposition"); + ud_is_normalized = import_attr("unicodedata", 
"is_normalized"); + ud_east_asian_width = import_attr("unicodedata", "east_asian_width"); + ud_mirrored = import_attr("unicodedata", "mirrored"); + ud_decimal = import_attr("unicodedata", "decimal"); + ud_combining = import_attr("unicodedata", "combining"); + ud_ucd_3_2_0 = import_attr("unicodedata", "ucd_3_2_0"); + + // io + stringio_ctor = import_attr("io", "StringIO"); + + // struct + struct_unpack = import_attr("struct", "unpack"); + + // Suppress warnings. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// --------------------------------------------------------------------------- +// Operations (6 ops). +// --------------------------------------------------------------------------- + +// OP_DATETIME_PARSE: FDP selects variant — date/time/datetime.fromisoformat() +// or datetime.strptime() with a fuzz-chosen format string. Exercises the +// datetime C module's parsing paths. +static void op_datetime_parse(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 4); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + switch (variant) { + case 0: { + PyRef r = PyObject_CallMethod(dt_date, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 1: { + PyRef r = PyObject_CallMethod(dt_time, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 2: { + PyRef r = PyObject_CallMethod(dt_datetime, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 3: { + PyRef fmt = PyUnicode_FromString("%Y-%m-%d %H:%M:%S"); + CHECK(fmt); + PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", + (PyObject *)pystr, (PyObject *)fmt); + break; + } + case 4: { + PyRef fmt = PyUnicode_FromString("%Y/%m/%dT%H:%M"); + CHECK(fmt); + PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", + 
(PyObject *)pystr, (PyObject *)fmt); + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_DATETIME_FORMAT: Unpack 6 shorts from first 12 bytes to build a valid +// datetime, then call strftime() with the remaining fuzz data as the format +// string. Exercises datetime formatting code paths. +static void op_datetime_format(FuzzedDataProvider &fdp) { + // Need at least 12 bytes for the datetime fields. + std::string header = fdp.ConsumeBytesAsString(12); + if (header.size() < 12) return; + + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string fmt_data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef fmt_str(fuzz_bytes_to_str(fmt_data, str_enc)); + CHECK(fmt_str); + + // Unpack 6 unsigned shorts via struct.unpack. + PyRef hdr_bytes = PyBytes_FromStringAndSize(header.data(), 12); + CHECK(hdr_bytes); + PyRef vals = PyObject_CallFunction(struct_unpack, "sO", "6H", + (PyObject *)hdr_bytes); + CHECK(vals); + + // Extract fields and clamp to valid ranges. + long v[6]; + for (int i = 0; i < 6; i++) { + PyObject *item = PyTuple_GetItem(vals, i); + v[i] = PyLong_AsLong(item); + } + long year = (v[0] % 9999) + 1; + long month = (v[1] % 12) + 1; + long day = (v[2] % 28) + 1; + long hour = v[3] % 24; + long minute = v[4] % 60; + long second = v[5] % 60; + + PyRef dt = PyObject_CallFunction(dt_datetime, "llllll", + year, month, day, hour, minute, second); + CHECK(dt); + + // strftime on datetime. + { + PyRef r = PyObject_CallMethod(dt, "strftime", "O", (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } + + // strftime on date. + { + PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); + if (date_obj) { + PyRef r = PyObject_CallMethod(date_obj, "strftime", "O", + (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + + // strftime on time. 
+ { + PyRef time_obj = PyObject_CallMethod(dt, "time", NULL); + if (time_obj) { + PyRef r = PyObject_CallMethod(time_obj, "strftime", "O", + (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + + // format(date, str[:16]). + { + PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); + if (date_obj) { + // Cap format spec to 16 chars. + Py_ssize_t flen = PyUnicode_GET_LENGTH(fmt_str); + PyRef short_fmt = PyUnicode_Substring(fmt_str, 0, + flen < 16 ? flen : 16); + if (short_fmt) { + PyRef r = PyObject_Format(date_obj, short_fmt); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } else { + PyErr_Clear(); + } + } +} + +// OP_COLLECTIONS_COUNT: Build a dict and call collections._count_elements() +// with a fuzz-generated string. Exercises the Counter internals C path. +static void op_collections_count(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef d = PyDict_New(); + CHECK(d); + PyRef r = PyObject_CallFunction(collections_count_elements, "OO", + (PyObject *)d, (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_UNICODEDATA_CHARINFO: Convert data to str (cap 200 chars), then call +// per-character unicodedata functions. FDP selects which functions to call. +// Exercises the unicodedata C module character-info paths. 
+static void op_unicodedata_charinfo(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  // ConsumeIntegral has no argument to deduce its type from, so the result
+  // type must be spelled out explicitly.
+  uint8_t func_mask = fdp.ConsumeIntegral<uint8_t>();
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)800));  // ~200 chars max
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  Py_ssize_t len = PyUnicode_GET_LENGTH(pystr);
+  if (len > 200) len = 200;
+
+  // Default arguments passed to the lookups that accept one.
+  PyRef neg_one = PyLong_FromLong(-1);
+  CHECK(neg_one);
+  PyRef empty_str = PyUnicode_FromString("");
+  CHECK(empty_str);
+
+  // One row per unicodedata call: the func_mask bit that enables it, the
+  // callable, and an optional default argument (NULL when none is passed).
+  // Bit 0x80 enables both name() and decomposition().
+  const struct {
+    uint8_t bit;
+    PyObject *fn;
+    PyObject *fallback;
+  } kQueries[] = {
+      {0x01, ud_category, NULL},
+      {0x02, ud_bidirectional, NULL},
+      {0x04, ud_numeric, (PyObject *)neg_one},
+      {0x08, ud_decimal, (PyObject *)neg_one},
+      {0x10, ud_combining, NULL},
+      {0x20, ud_east_asian_width, NULL},
+      {0x40, ud_mirrored, NULL},
+      {0x80, ud_name, (PyObject *)empty_str},
+      {0x80, ud_decomposition, NULL},
+  };
+
+  for (Py_ssize_t i = 0; i < len; i++) {
+    PyRef ch = PyUnicode_Substring(pystr, i, i + 1);
+    if (!ch) {
+      PyErr_Clear();
+      continue;
+    }
+
+    for (const auto &q : kQueries) {
+      if (!(func_mask & q.bit)) continue;
+      PyRef r = q.fallback
+                    ? PyObject_CallFunction(q.fn, "OO", (PyObject *)ch,
+                                            q.fallback)
+                    : PyObject_CallFunction(q.fn, "O", (PyObject *)ch);
+      if (PyErr_Occurred()) PyErr_Clear();
+    }
+  }
+}
+
+// OP_UNICODEDATA_NORMALIZE: FDP selects normalization form from
+// {NFC, NFD, NFKC, NFKD}, calls normalize() and is_normalized().
+// Optionally calls ucd_3_2_0.normalize() and lookup().
+static void op_unicodedata_normalize(FuzzedDataProvider &fdp) {
+  static const char *kForms[] = {"NFC", "NFD", "NFKC", "NFKD"};
+  int form_idx = fdp.ConsumeIntegralInRange(0, 3);
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  bool try_ucd = fdp.ConsumeBool();
+  bool try_lookup = fdp.ConsumeBool();
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  const char *form = kForms[form_idx];
+
+  // normalize(form, str), then is_normalized(form, str).
+  PyObject *const kFormCalls[] = {ud_normalize, ud_is_normalized};
+  for (PyObject *fn : kFormCalls) {
+    PyRef r = PyObject_CallFunction(fn, "sO", form, (PyObject *)pystr);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  // ucd_3_2_0.normalize('NFC', str) — always NFC, independent of form_idx.
+  if (try_ucd) {
+    PyRef r = PyObject_CallMethod(ud_ucd_3_2_0, "normalize", "sO",
+                                  "NFC", (PyObject *)pystr);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  // lookup(str)
+  if (try_lookup) {
+    PyRef r = PyObject_CallFunction(ud_lookup, "O", (PyObject *)pystr);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+}
+
+// OP_STRINGIO: Create io.StringIO(), write fuzz str, then exercise
+// read/readline/readlines/truncate/iteration. Exercises _io/stringio.c.
+static void op_stringio(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 2);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  PyRef sio = PyObject_CallFunction(stringio_ctor, NULL);
+  CHECK(sio);
+
+  // Write the fuzz string, then rewind; give up if either step fails.
+  PyRef wr = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr);
+  if (!wr) {
+    PyErr_Clear();
+    return;
+  }
+  PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0);
+  if (!sk) {
+    PyErr_Clear();
+    return;
+  }
+
+  switch (variant) {
+    case 0: {
+      // read + getvalue
+      PyRef r1 = PyObject_CallMethod(sio, "read", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef r2 = PyObject_CallMethod(sio, "getvalue", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 1: {
+      // readline x3, rewind, readlines
+      for (int i = 0; i < 3; i++) {
+        PyRef r = PyObject_CallMethod(sio, "readline", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef r = PyObject_CallMethod(sio, "readlines", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 2: {
+      // truncate (at most 64 chars) + tell + rewind + iteration
+      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+      long trunc_at = slen < 64 ? slen : 64;
+      PyRef tr = PyObject_CallMethod(sio, "truncate", "l", trunc_at);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef tl = PyObject_CallMethod(sio, "tell", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0);
+      if (PyErr_Occurred()) PyErr_Clear();
+      // Drain the line iterator, releasing each line as it is produced.
+      PyRef it = PyObject_GetIter(sio);
+      if (it) {
+        PyObject *line;
+        while ((line = PyIter_Next(it)) != NULL)
+          Py_DECREF(line);
+        if (PyErr_Occurred()) PyErr_Clear();
+      } else {
+        PyErr_Clear();
+      }
+      break;
+    }
+  }
+
+  PyRef cl = PyObject_CallMethod(sio, "close", NULL);
+  if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// ---------------------------------------------------------------------------
+
+enum Op {
+  OP_DATETIME_PARSE,
+  OP_DATETIME_FORMAT,
+  OP_COLLECTIONS_COUNT,
+  OP_UNICODEDATA_CHARINFO,
+  OP_UNICODEDATA_NORMALIZE,
+  OP_STRINGIO,
+  NUM_OPS
+};
+
+// libFuzzer entry point. Inputs that are empty or larger than 0x10000 bytes
+// are ignored; otherwise the first FDP draw selects one operation, which
+// receives the remaining bytes. PyGC_Collect() runs every kGcInterval inputs.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  assert(Py_IsInitialized());
+  init_textops();
+  if (size == 0 || size > 0x10000) return 0;
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  // Handlers indexed by Op; the order must match the enum above.
+  typedef void (*OpHandler)(FuzzedDataProvider &);
+  static const OpHandler kHandlers[NUM_OPS] = {
+      op_datetime_parse,         // OP_DATETIME_PARSE
+      op_datetime_format,        // OP_DATETIME_FORMAT
+      op_collections_count,      // OP_COLLECTIONS_COUNT
+      op_unicodedata_charinfo,   // OP_UNICODEDATA_CHARINFO
+      op_unicodedata_normalize,  // OP_UNICODEDATA_NORMALIZE
+      op_stringio,               // OP_STRINGIO
+  };
+
+  FuzzedDataProvider fdp(data, size);
+  int selected = fdp.ConsumeIntegralInRange(0, NUM_OPS - 1);
+  kHandlers[selected](fdp);
+
+  gc_counter++;
+  if (gc_counter % kGcInterval == 0) PyGC_Collect();
+  return 0;
+}