From 8952c73497eac89ca6edc1f81076d752a2cfcd89 Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 15:44:38 -0500 Subject: [PATCH 1/7] Add typer cli with option to download and register new dataset. This is meant to start addressing https://github.com/Parcels-code/parcels-benchmarks/issues/26 --- parcels_benchmarks/benchmark_setup.py | 221 +++++++++++++++++++------- parcels_benchmarks/benchmarks.json | 16 ++ pyproject.toml | 10 ++ 3 files changed, 187 insertions(+), 60 deletions(-) create mode 100644 parcels_benchmarks/benchmarks.json diff --git a/parcels_benchmarks/benchmark_setup.py b/parcels_benchmarks/benchmark_setup.py index 2a7715e..2c682ad 100644 --- a/parcels_benchmarks/benchmark_setup.py +++ b/parcels_benchmarks/benchmark_setup.py @@ -6,57 +6,87 @@ import pooch import sys import xarray as xr +import typer + + +app = typer.Typer(add_completion=False) PARCELS_DATADIR = os.getenv("PARCELS_DATADIR", default=None) if PARCELS_DATADIR is not None: PARCELS_DATADIR = Path(PARCELS_DATADIR) -# When modifying existing datasets in a backwards incompatible way, -# make a new release in the repo and update the DATA_REPO_TAG to the new tag -BENCHMARK_DATA = [ - { - "name": "MOi-curvilinear", - "file": "Parcels_Benchmarks_MOi_data.zip", - "known_hash": "f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885" - }, - { - "name": "FESOM-baroclinic-gyre", - "file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip", - "known_hash": "8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3" - } -] - -DATA_URL = "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=" - -DATA_FILES = {} -for data in BENCHMARK_DATA: - DATA_FILES[data["name"]] = data["file"] - -def _create_pooch_registry() -> dict[str, None]: + +DEFAULT_MANIFEST = Path(__file__).with_name("benchmarks.json") + +def _load_manifest(path: Path) -> dict: + if not path.is_file(): + raise FileNotFoundError(f"Manifest not found: {path}") + with 
path.open("r", encoding="utf-8") as f: + manifest = json.load(f) + + if "datasets" not in manifest or not isinstance(manifest["datasets"], list): + raise ValueError("Manifest must contain a top-level 'datasets' list") + + # Provide default URL if omitted + manifest.setdefault( + "data_url", + "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=", + ) + return manifest + +def _save_manifest(path: Path, manifest: dict[str, Any]) -> None: + # keep stable ordering by dataset name + manifest["datasets"] = sorted(manifest["datasets"], key=lambda d: d.get("name", "")) + with path.open("w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2) + f.write("\n") + +def _cache_dir(data_home: Path | None) -> Path: + if data_home is None: + return Path(pooch.os_cache("parcels-benchmarks")) + return Path(data_home) + +def _datasets_by_name(manifest: dict) -> dict[str, dict]: + out: dict[str, dict] = {} + for d in manifest["datasets"]: + name = d.get("name") + file = d.get("file") + known_hash = d.get("known_hash") + if not name or not file: + raise ValueError(f"Each dataset needs at least 'name' and 'file': {d}") + if name in out: + raise ValueError(f"Duplicate dataset name in manifest: {name}") + out[name] = { + "name": name, + "file": file, + "known_hash": known_hash, + } + return out + +def _create_pooch_registry(manifest: dict) -> dict[str, str | None]: """Collapses the mapping of dataset names to filenames into a pooch registry. Hashes are set to None for all files. 
""" - registry: dict[str, None] = {} - for data in BENCHMARK_DATA: - registry[data["file"]] = data["known_hash"] + registry: dict[str, str | None] = {} + for data in manifests["datasets"]: + registry[data["file"]] = data.get("known_hash") return registry POOCH_REGISTRY = _create_pooch_registry() -def _get_pooch(data_home=None): - if data_home is None: - data_home = pooch.os_cache("parcels-benchmarks") - - data_home.parent.mkdir(exist_ok=True) +def _get_pooch(manifest: dict, data_home: Path | None=None)->pooch.Pooch: + cache_dir = _cache_dire(data_home) + registry = _create_pooch_registry(manifest) + cache_dir.parent.mkdir(parents=True,exist_ok=True) return pooch.create( - path=data_home, - base_url=DATA_URL, - registry=POOCH_REGISTRY, + path=cache_dir, + base_url=manifest["data_url"], + registry=registry, ) -def download_example_dataset(dataset: str, data_home=None): - """Load an example dataset from the parcels website. +def download_example_dataset(dataset: str, manifest_path: Path = DEFAULT_MANIFEST, data_home: Path | None = None) -> Path: + """Load an example dataset listed in the provided manifest This function provides quick access to a small number of example datasets that are useful in documentation and testing in parcels. @@ -65,6 +95,8 @@ def download_example_dataset(dataset: str, data_home=None): ---------- dataset : str Name of the dataset to load. + manifest_path: Path + Fully qualified path to a parcels-benchmarks manifest file data_home : pathlike, optional The directory in which to cache data. If not specified, defaults to wherever pooch.os_cache("parcels-benchmarks") goes on your system. @@ -74,13 +106,17 @@ def download_example_dataset(dataset: str, data_home=None): dataset_folder : Path Path to the folder containing the downloaded dataset files. 
""" + manifest = _load_manifest(manifest_path) + datasets = _datasets_by_name(manifest) + # Dev note: `dataset` is assumed to be a folder name with netcdf files - if dataset not in DATA_FILES: + if dataset not in datasets: raise ValueError( - f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(DATA_FILES.keys()) + f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(datasets.keys()) ) - odie = _get_pooch(data_home=data_home) - listing = odie.fetch(DATA_FILES[dataset],processor=pooch.Unzip()) + odie = _get_pooch(manifest,data_home=data_home) + zip_name = datasets[dataset]["file"] + listing = odie.fetch(zip_name,processor=pooch.Unzip()) # as pooch currently returns a file listing while we want a dir, # we find the common parent dir to all files @@ -89,33 +125,98 @@ def download_example_dataset(dataset: str, data_home=None): return common_parent_dir -def download_datasets(data_home=None): - """Download all datasets listed in the config file to the specified location. +@app.command("download-all") +def download_all( + manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."), + data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."), +) -> None: + """Download all datasets listed in benchmarks manifest file.""" - Parameters - ---------- - data_home : pathlike, optional - The directory in which to cache data. If not specified, defaults to wherever - pooch.os_cache("parcels-benchmarks") goes on your system. + manifest = _load_manifest(manifest_path) + datasets = _datasets_by_name(manifest) - Returns - ------- - dataset_folders : dict - Mapping of dataset names to paths to the folder containing the downloaded dataset files. 
- """ - dataset_folders = {} - for dataset in DATA_FILES: - folder = download_example_dataset(dataset, data_home=data_home) + dataset_folders: dict[str, Path] = {} + for dataset_name in datasets.keys(): + folder = download_example_dataset(dataset_name, manifest_path=manifest_path, data_home=data_home) dataset_folders[dataset] = folder return dataset_folders +@app.command("add-remote") +def add_remote_dataset( + name: str = typer.Option(..., help="New dataset name to add to the manifest."), + file: str = typer.Option(..., help="Zip filename available at data_url (e.g. Foo.zip)."), + manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."), + data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."), +) -> None: + """ + Download a NEW dataset whose zip exists at data_url but is not yet in the manifest. + + We assume the sha256 is unknown ahead of time: + - download with known_hash=None + - compute sha256 of the downloaded zip + - append {name,file,known_hash} to the manifest + """ + m = _load_manifest(manifest) + by_name = _dataset_by_name(m) + + if name in by_name: + raise typer.BadParameter(f"Dataset {name!r} already exists in manifest.") + + # Also prevent duplicates by file + existing_files = {d.get("file") for d in m["datasets"]} + if file in existing_files: + raise typer.BadParameter(f"File {file!r} is already referenced in the manifest.") + + base_url = m["data_url"] + cache_dir = _cache_dir(data_home) + url = f"{base_url}{file}" + cache_dir.mkdir(parents=True, exist_ok=True) + + typer.echo(f"Downloading (no hash verification): {url}") + # Download the zip WITHOUT verifying hash. 
+ result = pooch.retrieve(
+        url=url,
+        known_hash=None,
+        path=cache_dir,
+        processor=None,
+    )
+    typer.echo(f"  Downloaded zip -> {zip_path}")
+
+    digest = pooch.file_hash(Path(result))
+    known_hash = f"sha256:{digest}"
+    typer.echo(f"  Computed {known_hash}")
+
+    typer.echo("Unzipping...")
+    result = pooch.retrieve(
+        url=url,
+        known_hash=known_hash,
+        path=cache_dir,
+        processor=pooch.Unzip(),
+    )
+    typer.echo(f"  Unzipped -> {folder}")
+
+    # Append to manifest
+    m["datasets"].append({"name": name, "file": file, "known_hash": known_hash})
+    _save_manifest(manifest, m)
+    typer.echo(f"Added {name!r} to {manifest}")
+
+@app.command("list")
+def list_datasets(
+    manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
+) -> None:
+    """
+    List datasets in the manifest.
+    """
+    m = _load_manifest(manifest)
+    by_name = _datasets_by_name(m)
+    for name, entry in sorted(by_name.items(), key=lambda kv: kv[0]):
+        typer.echo(f"{name}: {entry['file']} ({entry.get('known_hash', 'no-hash')})")
+
+
+def main() -> None:
+    app()
 
-def main(argv=None) -> int:
-    folders = download_datasets(data_home=PARCELS_DATADIR)
-    print("Downloaded datasets:")
-    for name, folder in folders.items():
-        print(f"  {name}: {folder}")
 
 if __name__ == "__main__":
-    raise main()
+    main()
diff --git a/parcels_benchmarks/benchmarks.json b/parcels_benchmarks/benchmarks.json
new file mode 100644
index 0000000..57f0113
--- /dev/null
+++ b/parcels_benchmarks/benchmarks.json
@@ -0,0 +1,16 @@
+{
+  "data_url": "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=",
+  "datasets": [
+    {
+      "name": "MOi-curvilinear",
+      "file": "Parcels_Benchmarks_MOi_data.zip",
+      "known_hash": "sha256:f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885"
+    },
+    {
+      "name": "FESOM-baroclinic-gyre",
+      "file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
+      "known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
+    }
+  ]
+}
+ diff --git a/pyproject.toml b/pyproject.toml index 07ee4ee..16180a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,12 +11,22 @@ authors = [{ name = "Parcels team" }] dependencies = [ "xarray", "pooch", + "typer>=0.9", ] +# This creates an installable CLI command: `parcels-benchmarks ...` +[project.scripts] +parcels-benchmarks = "parcels_benchmarks.benchmark_setup:main" + + [tool.setuptools.packages.find] where = ["."] include = ["parcels_benchmarks"] +# Include the benchmarks manifest in the wheel/sdist +[tool.setuptools.package-data] +parcels_benchmarks = ["*.json"] + [tool.ruff.lint] select = [ From ad19e8d8bc3ac2ad421f86e02e967f9918e8d3a1 Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 20:07:10 -0500 Subject: [PATCH 2/7] Debug benchmark-setup tool and rename cli to benchmark-setup --- parcels_benchmarks/benchmark_setup.py | 3 +-- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parcels_benchmarks/benchmark_setup.py b/parcels_benchmarks/benchmark_setup.py index 2c682ad..3143098 100644 --- a/parcels_benchmarks/benchmark_setup.py +++ b/parcels_benchmarks/benchmark_setup.py @@ -3,6 +3,7 @@ import json import os from pathlib import Path +from typing import Any import pooch import sys import xarray as xr @@ -73,8 +74,6 @@ def _create_pooch_registry(manifest: dict) -> dict[str, str | None]: return registry -POOCH_REGISTRY = _create_pooch_registry() - def _get_pooch(manifest: dict, data_home: Path | None=None)->pooch.Pooch: cache_dir = _cache_dire(data_home) registry = _create_pooch_registry(manifest) diff --git a/pyproject.toml b/pyproject.toml index 16180a3..8605d6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,9 @@ dependencies = [ "typer>=0.9", ] -# This creates an installable CLI command: `parcels-benchmarks ...` +# This creates an installable CLI command: `benchmark-setup ...` [project.scripts] -parcels-benchmarks = "parcels_benchmarks.benchmark_setup:main" +benchmark-setup = 
"parcels_benchmarks.benchmark_setup:main" [tool.setuptools.packages.find] From 0187688fc6860668b4827d92540f88c6ef8fdea4 Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 21:06:29 -0500 Subject: [PATCH 3/7] Debug add-dataset subcommand --- parcels_benchmarks/benchmark_setup.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/parcels_benchmarks/benchmark_setup.py b/parcels_benchmarks/benchmark_setup.py index 3143098..bc21def 100644 --- a/parcels_benchmarks/benchmark_setup.py +++ b/parcels_benchmarks/benchmark_setup.py @@ -75,7 +75,7 @@ def _create_pooch_registry(manifest: dict) -> dict[str, str | None]: def _get_pooch(manifest: dict, data_home: Path | None=None)->pooch.Pooch: - cache_dir = _cache_dire(data_home) + cache_dir = _cache_dir(data_home) registry = _create_pooch_registry(manifest) cache_dir.parent.mkdir(parents=True,exist_ok=True) return pooch.create( @@ -140,8 +140,8 @@ def download_all( dataset_folders[dataset] = folder return dataset_folders -@app.command("add-remote") -def add_remote_dataset( +@app.command("add-dataset") +def add_dataset( name: str = typer.Option(..., help="New dataset name to add to the manifest."), file: str = typer.Option(..., help="Zip filename available at data_url (e.g. 
Foo.zip)."), manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."), @@ -156,9 +156,9 @@ def add_remote_dataset( - append {name,file,known_hash} to the manifest """ m = _load_manifest(manifest) - by_name = _dataset_by_name(m) + datasets = _datasets_by_name(m) - if name in by_name: + if name in datasets: raise typer.BadParameter(f"Dataset {name!r} already exists in manifest.") # Also prevent duplicates by file @@ -179,7 +179,7 @@ def add_remote_dataset( path=cache_dir, processor=None, ) - typer.echo(f" Downloaded zip -> {zip_path}") + typer.echo(f" Downloaded zip -> {Path(result)}") digest = pooch.file_hash(Path(result)) known_hash = f"sha256:{digest}" @@ -192,7 +192,9 @@ def add_remote_dataset( path=cache_dir, processor=pooch.Unzip(), ) - typer.echo(f" Unzipped -> {folder}") + files = [Path(p) for p in result] + common_parent_dir = min(files, key=lambda p: len(p.parents)).parent + typer.echo(f" Unzipped -> {common_parent_dir}") # Append to manifest m["datasets"].append({"name": name, "file": file, "known_hash": known_hash}) From 0064ac3ac5edba9176f6e883c4b7995d8a0dc2e9 Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 21:08:15 -0500 Subject: [PATCH 4/7] Update section on adding new data --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3ff60ee..bc1c245 100644 --- a/README.md +++ b/README.md @@ -55,17 +55,17 @@ git push origin main ## Adding benchmarks Adding benchmarks for parcels typically involves adding a dataset and defining the benchmarks you want to run. -Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive. -Once your data is hosted, you can add an entry to the `parcels_benchmarks.benchmark_setup.DATA_FILES` list. 
Each entry has the following attributes

+### Adding new data
+Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive.
+Once your data is hosted in the shared SurfDrive, you can easily add your dataset to the benchmark dataset manifest using

```
-{
-    "name": str # Name of the dataset that you can reference in the benchmarks
-    "file": str, # Path, relative to the data_url, to the .zip file containing the benchmark data
-    "known_hash": str | None # Pooch hash of the zip file; set to None if it is unknown
-},
+pixi run benchmark-setup add-dataset --name "Name for your dataset" --file "Path to ZIP archive in the SurfDrive"
```
+During this process, the dataset will be downloaded and a complete entry will be added to the [parcels_benchmarks/benchmarks.json](./parcels_benchmarks/benchmarks.json) manifest file. Once updated, this file can be committed to this repository and contributed via a pull request.
+
+### Writing the benchmarks
 This repository uses [ASV](https://asv.readthedocs.io/) for running benchmarks. You can add benchmarks by including a python script in the `benchmarks/` subdirectory.
 Within each `benchmarks/*.py` file, we ask that you define a class for the set of benchmarks you plan to run for your dataset.
 You can use the existing benchmarks as a good starting point for writing your benchmarks.
To learn more about writing benchmarks compatible with ASV, see the [ASV "Writing Benchmarks" documentation](https://asv.readthedocs.io/en/latest/writing_benchmarks.html) From db64d35c666ae02fb70e90afc0a08e6dc65ff20e Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 21:09:07 -0500 Subject: [PATCH 5/7] Add ICON dataset --- parcels_benchmarks/benchmarks.json | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/parcels_benchmarks/benchmarks.json b/parcels_benchmarks/benchmarks.json index 57f0113..fb93639 100644 --- a/parcels_benchmarks/benchmarks.json +++ b/parcels_benchmarks/benchmarks.json @@ -1,16 +1,20 @@ { "data_url": "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=", "datasets": [ - { - "name": "MOi-curvilinear", - "file": "Parcels_Benchmarks_MOi_data.zip", - "known_hash": "sha256:f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885" - }, { "name": "FESOM-baroclinic-gyre", "file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip", "known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3" + }, + { + "name": "Global ICON Data", + "file": "Parcels_Benchmarks_ICON.zip", + "known_hash": "sha256:06e80941050d16b89ddce758fb6a1030e3facaba9d32bf9085f1e1e497731903" + }, + { + "name": "MOi-curvilinear", + "file": "Parcels_Benchmarks_MOi_data.zip", + "known_hash": "sha256:f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885" } ] } - From bd0027bf607701cf3af30315beed1d63fca80305 Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Tue, 20 Jan 2026 21:19:58 -0500 Subject: [PATCH 6/7] Fix typo in download_all --- parcels_benchmarks/benchmark_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels_benchmarks/benchmark_setup.py b/parcels_benchmarks/benchmark_setup.py index bc21def..b251597 100644 --- a/parcels_benchmarks/benchmark_setup.py +++ b/parcels_benchmarks/benchmark_setup.py @@ -126,7 +126,7 @@ def 
download_example_dataset(dataset: str, manifest_path: Path = DEFAULT_MANIFES @app.command("download-all") def download_all( - manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."), + manifest_path: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."), data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."), ) -> None: """Download all datasets listed in benchmarks manifest file.""" From 79b003cbfc2eb87c1b4e9a770d3145a1de84f7fe Mon Sep 17 00:00:00 2001 From: Joe Schoonover Date: Wed, 21 Jan 2026 08:48:48 -0500 Subject: [PATCH 7/7] Resolve bugs in download-all --- benchmarks/fesom2.py | 13 ++----------- parcels_benchmarks/benchmark_setup.py | 2 +- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/benchmarks/fesom2.py b/benchmarks/fesom2.py index 7c7094f..91b3d34 100644 --- a/benchmarks/fesom2.py +++ b/benchmarks/fesom2.py @@ -3,15 +3,12 @@ from parcels import ( Field, FieldSet, - Geographic, - GeographicPolar, Particle, ParticleSet, UxGrid, VectorField, ) -from parcels.kernels.advection import AdvectionEE -from parcels.interpolators import UxPiecewiseConstantFace +from parcels.kernels import AdvectionEE from parcels_benchmarks.benchmark_setup import download_example_dataset, PARCELS_DATADIR runtime=np.timedelta64(1, "D") @@ -47,13 +44,7 @@ def time_load_data(self,npart,integrator): def pset_execute(self,npart,integrator): ds = _load_ds(self.datapath) - grid = UxGrid(ds.uxgrid, z=ds.coords["nz"], mesh="flat") - U = Field(name="U", data=ds.u, grid=grid, interp_method=UxPiecewiseConstantFace) - V = Field(name="V", data=ds.v, grid=grid, interp_method=UxPiecewiseConstantFace) - U.units = GeographicPolar() - V.units = Geographic() - UV = VectorField(name="UV", U=U, V=V) - fieldset = FieldSet([UV, UV.U, UV.V]) + fieldset = FieldSet.from_fesom2(ds) lon = np.linspace(2.0,15.0,npart) lat = np.linspace(32.0,19.0,npart) diff --git a/parcels_benchmarks/benchmark_setup.py 
b/parcels_benchmarks/benchmark_setup.py index b251597..c937e6d 100644 --- a/parcels_benchmarks/benchmark_setup.py +++ b/parcels_benchmarks/benchmark_setup.py @@ -69,7 +69,7 @@ def _create_pooch_registry(manifest: dict) -> dict[str, str | None]: Hashes are set to None for all files. """ registry: dict[str, str | None] = {} - for data in manifests["datasets"]: + for data in manifest["datasets"]: registry[data["file"]] = data.get("known_hash") return registry