diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index d3d0c964..8eb1cc65 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -4,6 +4,11 @@ XXX version-specific blurb XXX +- Add initial read-only mmap support for store containers: + `DictStore`, `TreeStore`, and `EmbedStore` now accept `mmap_mode="r"` + when opened with `mode="r"` (including via `blosc2.open` for `.b2d`, + `.b2z`, and `.b2e`). + ## Changes from 4.0.0-b1 to 4.0.0 - On Windows, miniexpr is temporarily disabled for integral outputs and mixed-dtype expressions. diff --git a/bench/README_mmap_store_read.md b/bench/README_mmap_store_read.md new file mode 100644 index 00000000..07354b59 --- /dev/null +++ b/bench/README_mmap_store_read.md @@ -0,0 +1,119 @@ +# mmap Store Read Benchmark + +This benchmark compares read performance between: + +- `mode="r", mmap_mode=None` (regular read path) +- `mode="r", mmap_mode="r"` (memory-mapped read path) + +for: + +- `EmbedStore` (`.b2e`) +- `DictStore` (`.b2d`, `.b2z`) +- `TreeStore` (`.b2d`, `.b2z`) + +Script: `bench/mmap_store_read.py` + +## What It Measures + +For each selected combination of container/storage/layout/scenario, it reports: + +- open time median +- read time median +- total time median +- total p10 / p90 +- effective throughput (MiB/s) +- speedup ratio (`regular / mmap`) printed to stdout + +## Scenarios + +### 1. `warm_full_scan` +Reads every node completely (`node[:]`) in-process, repeatedly. + +Use this for steady-state throughput after the OS cache is warm. + +### 2. `warm_random_slices` +Reads random slices from each node (`node[start:start+slice_len]`) in-process, repeatedly. + +Use this for latency-sensitive random access when files are likely warm. + +### 3. `cold_full_scan_drop_caches` +Before each run: calls Linux `drop_caches`, then reads every node completely. + +Use this for first-touch behavior with minimal page-cache carryover. + +### 4. `cold_random_slices_drop_caches` +Before each run: calls Linux `drop_caches`, then performs random-slice reads. + +Use this for first-touch random-access behavior. + +## Dataset Layouts + +- `embedded`: all values stored inside the container payload. +- `external`: for `DictStore` / `TreeStore`, all values are written as external `.b2nd` nodes and referenced. +- `mixed`: alternating embedded/external nodes (for `DictStore` / `TreeStore`). + +`EmbedStore` is benchmarked with `embedded` layout only (by design of this benchmark). + +## Usage Examples + +### Quick warm benchmark across all containers + +```bash +python bench/mmap_store_read.py \ + --scenario warm_full_scan warm_random_slices \ + --runs 7 --n-nodes 128 --node-len 100000 +``` + +### Cold benchmark (Linux + root required) + +```bash +sudo python bench/mmap_store_read.py \ + --scenario cold_full_scan_drop_caches cold_random_slices_drop_caches \ + --runs 5 --drop-caches-value 3 +``` + +### Focused TreeStore benchmark with JSON output + +```bash +sudo python bench/mmap_store_read.py \ + --container tree --storage b2d b2z --layout embedded external mixed \ + --scenario warm_full_scan cold_full_scan_drop_caches \ + --runs 9 --json-out bench_mmap_tree_results.json +``` + +## Important Caveats + +1. Linux root-only for cold scenarios +- Cold scenarios use `/proc/sys/vm/drop_caches` and require root. +- They are intentionally disabled on non-Linux platforms. + +2. `drop_caches` is system-wide and intrusive +- It affects the whole machine, not only this benchmark process. +- Avoid running it on shared or production systems. + +3. mmap is still page-cache based +- `mmap` does not bypass OS caching; it changes IO path and access behavior. +- Warm-cache and cold-cache results can differ substantially. + +4. Compression/decompression may dominate +- If decompression CPU cost is dominant, mmap gains can be small even when IO path improves. +- Compare both full-scan and random-slice scenarios before concluding. + +5. Storage format impacts behavior +- `.b2d` and `.b2z` can behave differently due to layout and access locality. +- Keep format fixed when making A/B claims. + +6. Benchmark realism +- Use representative node sizes, counts, and slice patterns from your target workload. +- Defaults are useful for quick checks, not a universal production proxy. + +7. Variance is expected +- Background IO, filesystem state, and thermal/power conditions affect runs. +- Use medians and p10/p90 (already reported) rather than single-run minima. + +## Recommended Evaluation Workflow + +1. Run warm scenarios first to estimate steady-state behavior. +2. Run cold scenarios (as root) to estimate first-touch behavior. +3. Compare by container + storage format + layout separately. +4. Validate improvements hold for your real slice patterns and dataset scales. diff --git a/bench/mmap_store_read.py b/bench/mmap_store_read.py new file mode 100644 index 00000000..50a30b15 --- /dev/null +++ b/bench/mmap_store_read.py @@ -0,0 +1,579 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +""" +Benchmark mmap read-mode vs regular read-mode for EmbedStore, DictStore and TreeStore. + +This script creates deterministic datasets, then compares: + - mode="r", mmap_mode=None + - mode="r", mmap_mode="r" + +It supports multiple read scenarios: + * warm_full_scan: full reads with warm OS cache. + * warm_random_slices: random small slices with warm OS cache. + * cold_full_scan_drop_caches: full reads after dropping Linux page cache. + * cold_random_slices_drop_caches: random small slices after dropping Linux page cache. + +For cold scenarios, the cache drop mechanism relies on Linux +(/proc/sys/vm/drop_caches) and root privileges. +""" + +from __future__ import annotations + +import argparse +import json +import math +import os +import platform +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +import numpy as np + +import blosc2 + +SCENARIOS = { + "warm_full_scan": { + "pattern": "full", + "drop_caches": False, + "description": "Read all nodes completely in-process with warm cache.", + }, + "warm_random_slices": { + "pattern": "random", + "drop_caches": False, + "description": "Read random slices from each node in-process with warm cache.", + }, + "cold_full_scan_drop_caches": { + "pattern": "full", + "drop_caches": True, + "description": "Read all nodes after dropping Linux page cache before each run.", + }, + "cold_random_slices_drop_caches": { + "pattern": "random", + "drop_caches": True, + "description": "Read random slices after dropping Linux page cache before each run.", + }, +} + + +@dataclass +class RunMetrics: + open_s: float + read_s: float + total_s: float + bytes_read: int + sink: float + + +@dataclass +class SummaryMetrics: + open_median_s: float + read_median_s: float + total_median_s: float + total_p10_s: float + total_p90_s: float + throughput_mib_s: float + + +@dataclass +class ResultRow: + container: str + storage: str + layout: str + scenario: str + mode: str + open_median_s: float + read_median_s: float + total_median_s: float + total_p10_s: float + total_p90_s: float + throughput_mib_s: float + + +def q(values: list[float], quantile: float) -> float: + return float(np.quantile(np.asarray(values, dtype=np.float64), quantile)) + + +def summarize(metrics: list[RunMetrics]) -> SummaryMetrics: + open_vals = [m.open_s for m in metrics] + read_vals = [m.read_s for m in metrics] + total_vals = [m.total_s for m in metrics] + bytes_read = metrics[0].bytes_read if metrics else 0 + total_median = float(np.median(total_vals)) + mib = bytes_read / 2**20 + throughput = mib / total_median if total_median > 0 else math.inf + return SummaryMetrics( + open_median_s=float(np.median(open_vals)), + read_median_s=float(np.median(read_vals)), + total_median_s=total_median, + total_p10_s=q(total_vals, 0.1), + total_p90_s=q(total_vals, 0.9), + throughput_mib_s=throughput, + ) + + +def drop_linux_page_cache(drop_caches_value: int = 3) -> None: + if platform.system() != "Linux": + raise RuntimeError("drop_caches is supported only on Linux") + if os.geteuid() != 0: + raise PermissionError("drop_caches requires root privileges") + if drop_caches_value not in (1, 2, 3): + raise ValueError("drop_caches_value must be 1, 2, or 3") + + os.sync() + with open("/proc/sys/vm/drop_caches", "w", encoding="ascii") as f: + f.write(str(drop_caches_value)) + + +def container_cls(container: str): + if container == "embed": + return blosc2.EmbedStore + if container == "dict": + return blosc2.DictStore + if container == "tree": + return blosc2.TreeStore + raise ValueError(f"Unknown container: {container}") + + +def valid_storage_values(container: str) -> tuple[str, ...]: + if container == "embed": + return ("b2e",) + return ("b2d", "b2z") + + +def store_path(dataset_root: Path, container: str, storage: str, layout: str) -> Path: + base = dataset_root / f"{container}_{layout}" + if storage == "b2e": + return base.with_suffix(".b2e") + if storage == "b2d": + return base.with_suffix(".b2d") + if storage == "b2z": + return base.with_suffix(".b2z") + raise ValueError(f"Unknown storage format: {storage}") + + +def external_data_dir(dataset_root: Path, container: str, storage: str, layout: str) -> Path: + return dataset_root / f"external_{container}_{storage}_{layout}" + + +def node_key(i: int) -> str: + return f"/group_{i % 8:02d}/node_{i:05d}" + + +def node_array(i: int, node_len: int, dtype: np.dtype) -> np.ndarray: + start = i * node_len + stop = start + node_len + return np.arange(start, stop, dtype=dtype) + + +def is_external_node(i: int, layout: str) -> bool: + if layout == "embedded": + return False + if layout == "external": + return True + if layout == "mixed": + return (i % 2) == 1 + raise ValueError(f"Unknown layout: {layout}") + + +def cleanup_path(path: Path) -> None: + blosc2.remove_urlpath(str(path)) + + +def create_dataset( + *, + dataset_root: Path, + container: str, + storage: str, + layout: str, + n_nodes: int, + node_len: int, + dtype: np.dtype, + clevel: int, + codec: blosc2.Codec, +) -> Path: + if container == "embed" and layout != "embedded": + raise ValueError("EmbedStore supports only layout=embedded in this benchmark") + + s_path = store_path(dataset_root, container, storage, layout) + ext_dir = external_data_dir(dataset_root, container, storage, layout) + + cleanup_path(s_path) + cleanup_path(ext_dir) + dataset_root.mkdir(parents=True, exist_ok=True) + ext_dir.mkdir(parents=True, exist_ok=True) + + cparams = blosc2.CParams(clevel=clevel, codec=codec) + dparams = blosc2.DParams(nthreads=blosc2.nthreads) + + if container == "embed": + with blosc2.EmbedStore(urlpath=str(s_path), mode="w", cparams=cparams, dparams=dparams) as store: + for i in range(n_nodes): + store[node_key(i)] = node_array(i, node_len, dtype) + return s_path + + cls = container_cls(container) + with cls(str(s_path), mode="w", threshold=None, cparams=cparams, dparams=dparams) as store: + for i in range(n_nodes): + key = node_key(i) + if is_external_node(i, layout): + epath = ext_dir / f"node_{i:05d}.b2nd" + arr = blosc2.asarray( + node_array(i, node_len, dtype), + urlpath=str(epath), + mode="w", + cparams=cparams, + dparams=dparams, + ) + store[key] = arr + else: + store[key] = node_array(i, node_len, dtype) + + return s_path + + +def open_store(container: str, path: Path, mmap_enabled: bool): + mmap_mode = "r" if mmap_enabled else None + dparams = blosc2.DParams(nthreads=blosc2.nthreads) + + if container == "embed": + return blosc2.EmbedStore(urlpath=str(path), mode="r", mmap_mode=mmap_mode, dparams=dparams) + if container == "dict": + return blosc2.DictStore(str(path), mode="r", mmap_mode=mmap_mode, dparams=dparams) + if container == "tree": + return blosc2.TreeStore(str(path), mode="r", mmap_mode=mmap_mode, dparams=dparams) + raise ValueError(f"Unknown container: {container}") + + +def workload_full_scan(store: Any, keys: list[str]) -> tuple[int, float]: + bytes_read = 0 + sink = 0.0 + for key in keys: + arr = store[key] + data = arr[:] + bytes_read += int(np.asarray(data).nbytes) + if len(data) > 0: + sink += float(np.asarray(data).reshape(-1)[0]) + return bytes_read, sink + + +def workload_random_slices( + store: Any, + keys: list[str], + *, + slice_len: int, + reads_per_node: int, + rng: np.random.Generator, +) -> tuple[int, float]: + bytes_read = 0 + sink = 0.0 + for key in keys: + arr = store[key] + n = len(arr) + if n <= 0: + continue + width = min(slice_len, n) + hi = n - width + for _ in range(reads_per_node): + start = int(rng.integers(0, hi + 1)) if hi > 0 else 0 + data = arr[start : start + width] + data_np = np.asarray(data) + bytes_read += int(data_np.nbytes) + sink += float(data_np.reshape(-1)[0]) + return bytes_read, sink + + +def run_once( + *, + container: str, + path: Path, + data_keys: list[str], + mmap_enabled: bool, + pattern: str, + slice_len: int, + reads_per_node: int, + seed: int, +) -> RunMetrics: + t0 = time.perf_counter() + store = open_store(container, path, mmap_enabled=mmap_enabled) + t1 = time.perf_counter() + + try: + # Use the known data-node keys generated during dataset creation. + # This avoids structural subtree keys in TreeStore (e.g. "/group_xx"), + # which are valid keys but do not represent leaf array payloads. + keys = data_keys + if pattern == "full": + bytes_read, sink = workload_full_scan(store, keys) + elif pattern == "random": + rng = np.random.default_rng(seed) + bytes_read, sink = workload_random_slices( + store, + keys, + slice_len=slice_len, + reads_per_node=reads_per_node, + rng=rng, + ) + else: + raise ValueError(f"Unknown pattern: {pattern}") + finally: + # DictStore/TreeStore expose close(); EmbedStore currently does not. + # Use close() when available and otherwise just release the reference. + close = getattr(store, "close", None) + if callable(close): + close() + del store + + t2 = time.perf_counter() + return RunMetrics(open_s=t1 - t0, read_s=t2 - t1, total_s=t2 - t0, bytes_read=bytes_read, sink=sink) + + +def run_mode( + *, + container: str, + path: Path, + data_keys: list[str], + mmap_enabled: bool, + pattern: str, + drop_caches: bool, + runs: int, + slice_len: int, + reads_per_node: int, + seed: int, + drop_caches_value: int, +) -> list[RunMetrics]: + metrics: list[RunMetrics] = [] + for i in range(runs): + if drop_caches: + drop_linux_page_cache(drop_caches_value=drop_caches_value) + metrics.append( + run_once( + container=container, + path=path, + data_keys=data_keys, + mmap_enabled=mmap_enabled, + pattern=pattern, + slice_len=slice_len, + reads_per_node=reads_per_node, + seed=seed + i, + ) + ) + return metrics + + +def print_scenario_header(name: str) -> None: + meta = SCENARIOS[name] + print(f" Scenario: {name}") + print(f" Description: {meta['description']}") + + +def print_compare(a: SummaryMetrics, b: SummaryMetrics) -> None: + speedup = a.total_median_s / b.total_median_s if b.total_median_s > 0 else math.inf + print( + " regular median={:.4f}s ({:.1f} MiB/s) | mmap median={:.4f}s ({:.1f} MiB/s) | speedup={:.3f}x".format( + a.total_median_s, + a.throughput_mib_s, + b.total_median_s, + b.throughput_mib_s, + speedup, + ) + ) + + +def parse_combinations( + containers: list[str], + storages: list[str], + layouts: list[str], +) -> list[tuple[str, str, str]]: + combos: list[tuple[str, str, str]] = [] + for container in containers: + valid_storages = valid_storage_values(container) + for storage in storages: + if storage not in valid_storages: + continue + for layout in layouts: + if container == "embed" and layout != "embedded": + continue + combos.append((container, storage, layout)) + return combos + + +def validate_args(args: argparse.Namespace) -> None: + if args.drop_caches_value not in (1, 2, 3): + raise ValueError("--drop-caches-value must be 1, 2, or 3") + + cold_selected = any(SCENARIOS[s]["drop_caches"] for s in args.scenarios) + if cold_selected: + if platform.system() != "Linux": + raise RuntimeError("cold/drop-cache scenarios are supported only on Linux") + if os.geteuid() != 0: + raise PermissionError("cold/drop-cache scenarios require root") + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Benchmark mmap read mode for EmbedStore/DictStore/TreeStore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--dataset-root", type=Path, default=Path("bench_mmap_store_data")) + parser.add_argument("--container", nargs="+", default=["embed", "dict", "tree"], choices=["embed", "dict", "tree"]) + parser.add_argument("--storage", nargs="+", default=["b2e", "b2d", "b2z"], choices=["b2e", "b2d", "b2z"]) + parser.add_argument("--layout", nargs="+", default=["embedded", "external", "mixed"], choices=["embedded", "external", "mixed"]) + parser.add_argument("--scenario", nargs="+", dest="scenarios", default=list(SCENARIOS), choices=list(SCENARIOS)) + + parser.add_argument("--n-nodes", type=int, default=128) + parser.add_argument("--node-len", type=int, default=100_000) + parser.add_argument("--dtype", type=str, default="float64") + parser.add_argument("--clevel", type=int, default=5) + parser.add_argument("--codec", type=str, default="ZSTD", choices=[c.name for c in blosc2.Codec]) + + parser.add_argument("--runs", type=int, default=7) + parser.add_argument("--slice-len", type=int, default=4096) + parser.add_argument("--reads-per-node", type=int, default=8) + parser.add_argument("--seed", type=int, default=12345) + parser.add_argument("--drop-caches-value", type=int, default=3) + + parser.add_argument("--json-out", type=Path, default=None, help="Optional JSON output path") + parser.add_argument("--keep-dataset", action="store_true", help="Keep generated benchmark files") + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + validate_args(args) + + dtype = np.dtype(args.dtype) + codec = blosc2.Codec[args.codec] + + combos = parse_combinations(args.container, args.storage, args.layout) + if not combos: + raise ValueError("No valid (container, storage, layout) combinations selected") + + rows: list[ResultRow] = [] + + print("mmap read benchmark") + print(f" dataset_root={args.dataset_root}") + print(f" runs={args.runs}, n_nodes={args.n_nodes}, node_len={args.node_len}, dtype={dtype}") + + for container, storage, layout in combos: + print(f"\nDataset: container={container}, storage={storage}, layout={layout}") + data_keys = [node_key(i) for i in range(args.n_nodes)] + s_path = create_dataset( + dataset_root=args.dataset_root, + container=container, + storage=storage, + layout=layout, + n_nodes=args.n_nodes, + node_len=args.node_len, + dtype=dtype, + clevel=args.clevel, + codec=codec, + ) + + for scenario in args.scenarios: + print_scenario_header(scenario) + meta = SCENARIOS[scenario] + regular_runs = run_mode( + container=container, + path=s_path, + data_keys=data_keys, + mmap_enabled=False, + pattern=meta["pattern"], + drop_caches=meta["drop_caches"], + runs=args.runs, + slice_len=args.slice_len, + reads_per_node=args.reads_per_node, + seed=args.seed, + drop_caches_value=args.drop_caches_value, + ) + mmap_runs = run_mode( + container=container, + path=s_path, + data_keys=data_keys, + mmap_enabled=True, + pattern=meta["pattern"], + drop_caches=meta["drop_caches"], + runs=args.runs, + slice_len=args.slice_len, + reads_per_node=args.reads_per_node, + seed=args.seed, + drop_caches_value=args.drop_caches_value, + ) + + regular_summary = summarize(regular_runs) + mmap_summary = summarize(mmap_runs) + print_compare(regular_summary, mmap_summary) + + rows.append( + ResultRow( + container=container, + storage=storage, + layout=layout, + scenario=scenario, + mode="regular", + open_median_s=regular_summary.open_median_s, + read_median_s=regular_summary.read_median_s, + total_median_s=regular_summary.total_median_s, + total_p10_s=regular_summary.total_p10_s, + total_p90_s=regular_summary.total_p90_s, + throughput_mib_s=regular_summary.throughput_mib_s, + ) + ) + rows.append( + ResultRow( + container=container, + storage=storage, + layout=layout, + scenario=scenario, + mode="mmap", + open_median_s=mmap_summary.open_median_s, + read_median_s=mmap_summary.read_median_s, + total_median_s=mmap_summary.total_median_s, + total_p10_s=mmap_summary.total_p10_s, + total_p90_s=mmap_summary.total_p90_s, + throughput_mib_s=mmap_summary.throughput_mib_s, + ) + ) + + if not args.keep_dataset: + cleanup_path(s_path) + cleanup_path(external_data_dir(args.dataset_root, container, storage, layout)) + + if args.json_out is not None: + args.json_out.parent.mkdir(parents=True, exist_ok=True) + payload = { + "config": { + "dataset_root": str(args.dataset_root), + "container": args.container, + "storage": args.storage, + "layout": args.layout, + "scenarios": args.scenarios, + "n_nodes": args.n_nodes, + "node_len": args.node_len, + "dtype": str(dtype), + "clevel": args.clevel, + "codec": args.codec, + "runs": args.runs, + "slice_len": args.slice_len, + "reads_per_node": args.reads_per_node, + "seed": args.seed, + "drop_caches_value": args.drop_caches_value, + }, + "results": [asdict(r) for r in rows], + } + with open(args.json_out, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2) + print(f"\nSaved JSON results to: {args.json_out}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/doc/reference/dict_store.rst b/doc/reference/dict_store.rst index dd0ba6cc..612d53e5 100644 --- a/doc/reference/dict_store.rst +++ b/doc/reference/dict_store.rst @@ -14,6 +14,10 @@ DictStore lets you store and retrieve arrays by string keys (paths like ``"/dir/ Supported values include ``blosc2.NDArray``, ``blosc2.SChunk`` and ``blosc2.C2Array`` (as well as ``numpy.ndarray``, which is converted to NDArray). Small arrays (below a configurable compression‑size threshold) and in‑memory objects are kept inside the embedded store; larger or explicitly external arrays live as regular ``.b2nd`` (NDArray) or ``.b2f`` (SChunk) files. ``C2Array`` objects are always stored in the embedded store. You can mix all types seamlessly and use the usual mapping methods (``__getitem__``, ``__setitem__``, ``keys()``, ``items()``...). +For read-only workloads, DictStore supports memory-mapped opens via +``mmap_mode="r"`` (constructor or :func:`blosc2.open`). This works for both +``.b2d`` and ``.b2z`` containers. + Quick example ------------- @@ -34,6 +38,14 @@ Quick example print(sorted(dstore.keys())) # ['/dir1/node3', '/node1', '/node2'] print(dstore["/node1"][:]) # [1 2 3] + # Reopen in read-only mmap mode + with blosc2.open("my_dstore.b2z", mode="r", mmap_mode="r") as dstore_mmap: + print(dstore_mmap["/dir1/node3"][1:3]) + +.. note:: + For store containers, only ``mmap_mode="r"`` is currently supported, and it + requires ``mode="r"``. + .. currentmodule:: blosc2 .. autoclass:: DictStore diff --git a/doc/reference/embed_store.rst b/doc/reference/embed_store.rst index 261541a5..46f8709b 100644 --- a/doc/reference/embed_store.rst +++ b/doc/reference/embed_store.rst @@ -16,6 +16,9 @@ Important: Only remote ``C2Array`` objects are stored as lightweight references Typical use cases include bundling several small/medium arrays together, shipping datasets as one file, or creating a simple keyed store for heterogeneous array sources. +For read-only access, EmbedStore supports memory-mapped opens via +``mmap_mode="r"`` (constructor or :func:`blosc2.open`) for ``.b2e`` files. + Quickstart ---------- @@ -45,10 +48,15 @@ Quickstart estore = blosc2.open("example_estore.b2e", mode="r") print(list(estore.keys())) + # Reopen in read-only mmap mode + estore_mmap = blosc2.open("example_estore.b2e", mode="r", mmap_mode="r") + print(estore_mmap["/node2"][:]) + .. note:: - Embedded arrays (NumPy, NDArray, and SChunk) increase the size of the ``.b2e`` container. - Remote ``C2Array`` nodes only store lightweight references; reading them requires access to the remote source. NDArrays coming from external ``.b2nd`` files are embedded into the store. - When retrieving, ``estore[key]`` may return either an ``NDArray`` or an ``SChunk`` depending on what was originally stored; deserialization uses :func:`blosc2.from_cframe`. + - For store containers, only ``mmap_mode="r"`` is currently supported, and it requires ``mode="r"``. .. currentmodule:: blosc2 diff --git a/doc/reference/tree_store.rst b/doc/reference/tree_store.rst index 29ac41ae..82e31058 100644 --- a/doc/reference/tree_store.rst +++ b/doc/reference/tree_store.rst @@ -23,6 +23,9 @@ store, while larger arrays or explicitly external arrays are stored as separate ``.b2nd`` files. You can traverse your dataset hierarchically with ``walk()``, query children/descendants, or focus on a subtree view with ``get_subtree()``. +TreeStore also supports read-only memory-mapped opens via ``mmap_mode="r"`` +(constructor or :func:`blosc2.open`) for both ``.b2d`` and ``.b2z`` formats. + Quick example ------------- @@ -51,6 +54,14 @@ Quick example with blosc2.open("my_tree.b2z", mode="r") as tstore: print(sorted(tstore.keys())) + # Reopen in read-only mmap mode + with blosc2.open("my_tree.b2z", mode="r", mmap_mode="r") as tstore_mmap: + print(tstore_mmap["/child0/leaf1"][0:2]) + +.. note:: + For store containers, only ``mmap_mode="r"`` is currently supported, and it + requires ``mode="r"``. + .. currentmodule:: blosc2 .. autoclass:: TreeStore diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index b4281615..b1c7ad9e 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -44,6 +44,9 @@ class DictStore: will be treated as a Blosc2 zip format (B2ZIP). mode : str, optional File mode ('r', 'w', 'a'). Default is 'a'. + mmap_mode : str or None, optional + Memory mapping mode for read access. For now, only ``"r"`` is supported, + and only when ``mode="r"``. Default is None. tmpdir : str or None, optional Temporary directory to use when working with ".b2z" files. If None, a system temporary directory will be managed. Default is None. @@ -97,6 +100,8 @@ def __init__( dparams: blosc2.DParams | None = None, storage: blosc2.Storage | None = None, threshold: int | None = 2**13, + *, + mmap_mode: str | None = None, ): """ See :class:`DictStore` for full documentation of parameters. @@ -106,8 +111,13 @@ def __init__( raise ValueError(f"localpath must have a .b2z or .b2d extension; you passed: {self.localpath}") if mode not in ("r", "w", "a"): raise ValueError("For DictStore containers, mode must be 'r', 'w', or 'a'") + if mmap_mode not in (None, "r"): + raise ValueError("For DictStore containers, mmap_mode must be None or 'r'") + if mmap_mode == "r" and mode != "r": + raise ValueError("For DictStore containers, mmap_mode='r' requires mode='r'") self.mode = mode + self.mmap_mode = mmap_mode self.threshold = threshold self.cparams = cparams or blosc2.CParams() self.dparams = dparams or blosc2.DParams() @@ -153,7 +163,13 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): if "embed.b2e" not in self.offsets: raise FileNotFoundError("Embed file embed.b2e not found in store.") estore_offset = self.offsets["embed.b2e"]["offset"] - schunk = blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=estore_offset, dparams=dparams) + schunk = blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=estore_offset, + mmap_mode=self.mmap_mode, + dparams=dparams, + ) for filepath in self.offsets: if filepath.endswith((".b2nd", ".b2f")): key = "/" + filepath[: -5 if filepath.endswith(".b2nd") else -4] @@ -161,7 +177,13 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): else: # .b2d if not os.path.isdir(self.localpath): raise FileNotFoundError(f"Directory {self.localpath} does not exist for reading.") - schunk = blosc2.blosc2_ext.open(self.estore_path, mode="r", offset=0, dparams=dparams) + schunk = blosc2.blosc2_ext.open( + self.estore_path, + mode="r", + offset=0, + mmap_mode=self.mmap_mode, + dparams=dparams, + ) self._update_map_tree() self._estore = EmbedStore(_from_schunk=schunk) @@ -267,11 +289,22 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array: filepath = self.map_tree[key] if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - return blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=offset, dparams=self.dparams) + return blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ) else: urlpath = os.path.join(self.working_dir, filepath) if os.path.exists(urlpath): - return blosc2.open(urlpath, mode="r" if self.mode == "r" else "a", dparams=self.dparams) + return blosc2.open( + urlpath, + mode="r" if self.mode == "r" else "a", + mmap_mode=self.mmap_mode if self.mode == "r" else None, + dparams=self.dparams, + ) else: raise KeyError(f"File for key '{key}' not found in offsets or temporary directory.") @@ -332,11 +365,20 @@ def values(self) -> Iterator[blosc2.NDArray | SChunk | C2Array]: if filepath in self.offsets: offset = self.offsets[filepath]["offset"] yield blosc2.blosc2_ext.open( - self.b2z_path, mode="r", offset=offset, dparams=self.dparams + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, ) else: urlpath = os.path.join(self.working_dir, filepath) - yield blosc2.open(urlpath, mode="r" if self.mode == "r" else "a", dparams=self.dparams) + yield blosc2.open( + urlpath, + mode="r" if self.mode == "r" else "a", + mmap_mode=self.mmap_mode if self.mode == "r" else None, + dparams=self.dparams, + ) elif key in self._estore: yield self._estore[key] @@ -352,10 +394,27 @@ def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | C2Array]]: if self.is_zip_store: if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - yield key, blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=offset) + yield ( + key, + blosc2.blosc2_ext.open( + self.b2z_path, + mode="r", + offset=offset, + mmap_mode=self.mmap_mode, + dparams=self.dparams, + ), + ) else: urlpath = os.path.join(self.working_dir, filepath) - yield key, blosc2.open(urlpath, mode="r" if self.mode == "r" else "a") + yield ( + key, + blosc2.open( + urlpath, + mode="r" if self.mode == "r" else "a", + mmap_mode=self.mmap_mode if self.mode == "r" else None, + dparams=self.dparams, + ), + ) elif key in self._estore: yield key, self._estore[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index 7d6316fe..757c0245 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -36,6 +36,9 @@ class EmbedStore: deserialized later using the :func:`blosc2.from_cframe` function. mode : str, optional File mode ('r', 'w', 'a'). Default is 'w'. + mmap_mode : str or None, optional + Memory mapping mode for read access. For now, only ``"r"`` is supported, + and only when ``mode="r"``. Default is None. cparams : dict or None, optional Compression parameters for nodes and the embed store itself. Default is None, which uses the default Blosc2 parameters. @@ -76,6 +79,8 @@ def __init__( storage: blosc2.Storage | None = None, chunksize: int | None = 2**13, _from_schunk: SChunk | None = None, + *, + mmap_mode: str | None = None, ): """Initialize EmbedStore.""" @@ -84,6 +89,11 @@ def __init__( # Let's use the SChunk store by default and continue experimenting. self._schunk_store = True # put this to False to use an NDArray instead of a SChunk self.urlpath = urlpath + if mmap_mode not in (None, "r"): + raise ValueError("For EmbedStore containers, mmap_mode must be None or 'r'") + if mmap_mode == "r" and mode != "r": + raise ValueError("For EmbedStore containers, mmap_mode='r' requires mode='r'") + self.mmap_mode = mmap_mode if _from_schunk is not None: self.cparams = _from_schunk.cparams @@ -108,7 +118,7 @@ def __init__( self.storage = storage if mode in ("r", "a") and urlpath: - self._store = blosc2.blosc2_ext.open(urlpath, mode=mode, offset=0) + self._store = blosc2.blosc2_ext.open(urlpath, mode=mode, offset=0, mmap_mode=mmap_mode) self._load_metadata() return diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index 8ba844b7..2510194d 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -445,7 +445,32 @@ def test_open_context_manager(populated_dict_store): dstore_fixture.close() # Test opening via blosc2.open as a context manager - with blosc2.open(path, mode="r") as dstore: + with blosc2.open(path, mode="r", mmap_mode="r") as dstore: assert isinstance(dstore, DictStore) assert "/node1" in dstore assert np.array_equal(dstore["/node1"][:], np.array([1, 2, 3])) + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_mmap_mode_read_access(storage_type, tmp_path): + path = tmp_path / f"test_mmap_dstore.{storage_type}" + external_path = tmp_path / "external_node.b2nd" + + with DictStore(str(path), mode="w", threshold=None) as dstore: + dstore["/embedded"] = np.arange(16) + external = blosc2.arange(32, urlpath=str(external_path), mode="w") + dstore["/external"] = external + + with DictStore(str(path), mode="r", mmap_mode="r") as dstore: + assert np.array_equal(dstore["/embedded"][3:7], np.arange(3, 7)) + assert np.array_equal(dstore["/external"][11:15], np.arange(11, 15)) + + +def test_mmap_mode_validation(tmp_path): + path = tmp_path / "test_mmap_validation.b2z" + + with pytest.raises(ValueError, match="mmap_mode must be None or 'r'"): + DictStore(str(path), mode="r", mmap_mode="r+") + + with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"): + DictStore(str(path), mode="a", mmap_mode="r") diff --git a/tests/test_embed_store.py b/tests/test_embed_store.py index a29d5b20..719505a5 100644 --- a/tests/test_embed_store.py +++ b/tests/test_embed_store.py @@ -213,7 +213,26 @@ def test_open_context_manager(cleanup_files): estore["/node1"] = np.arange(10) # Test opening via blosc2.open as a context manager - with blosc2.open(path, mode="r") as estore_read: + with blosc2.open(path, mode="r", mmap_mode="r") as estore_read: assert isinstance(estore_read, blosc2.EmbedStore) assert "/node1" in estore_read assert np.array_equal(estore_read["/node1"][:], np.arange(10)) + + +def test_mmap_mode_read_access(cleanup_files): + path = "test_embed_mmap_read.b2e" + cleanup_files.append(path) + + estore = blosc2.EmbedStore(path, mode="w") + estore["/node1"] = np.arange(20) + + estore_read = blosc2.EmbedStore(path, mode="r", mmap_mode="r") + assert np.array_equal(estore_read["/node1"][5:11], np.arange(5, 11)) + + +def test_mmap_mode_validation(): + with pytest.raises(ValueError, match="mmap_mode must be None or 'r'"): + blosc2.EmbedStore(urlpath="test_invalid.b2e", mode="r", mmap_mode="r+") + + with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"): + blosc2.EmbedStore(urlpath="test_invalid.b2e", mode="a", mmap_mode="r") diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 09f86b17..9f101594 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -930,7 +930,28 @@ def test_open_context_manager(populated_tree_store): tstore_fixture.close() # Test opening via blosc2.open as a context manager - with blosc2.open(path, mode="r") as tstore: + with blosc2.open(path, mode="r", mmap_mode="r") as tstore: assert isinstance(tstore, TreeStore) assert "/child0/data" in tstore assert np.array_equal(tstore["/child0/data"][:], np.array([1, 2, 3])) + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_mmap_mode_read_access(storage_type, tmp_path): + path = tmp_path / f"test_tstore_mmap.{storage_type}" + + with TreeStore(str(path), mode="w") as tstore: + tstore["/group/node"] = np.arange(12) + + with TreeStore(str(path), mode="r", mmap_mode="r") as tstore: + assert np.array_equal(tstore["/group/node"][4:9], np.arange(4, 9)) + + +def test_mmap_mode_validation(tmp_path): + path = tmp_path / "test_tstore_mmap_validation.b2z" + + with pytest.raises(ValueError, match="mmap_mode must be None or 'r'"): + TreeStore(str(path), mode="r", mmap_mode="c") + + with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"): + TreeStore(str(path), mode="a", mmap_mode="r")