14 changes: 7 additions & 7 deletions README.md
@@ -55,17 +55,17 @@ git push origin main
## Adding benchmarks
Adding benchmarks for parcels typically involves adding a dataset and defining the benchmarks you want to run.

Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive.
Once your data is hosted, you can add an entry to the `parcels_benchmarks.benchmark_setup.DATA_FILES` list. Each entry has the following attributes

### Adding new data
Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive.
Once your data is hosted in the shared SurfDrive, you can add your dataset to the benchmark dataset manifest by running:
```
{
"name": str # Name of the dataset that you can reference in the benchmarks
"file": str, # Path, relative to the data_url, to the .zip file containing the benchmark data
"known_hash": str | None # Pooch hash of the zip file; set to None if it is unknown
},
pixi run benchmark-setup add-dataset --name "Name for your dataset" --file "Path to ZIP archive in the SurfDrive"
```

During this process, the dataset will be downloaded and a complete entry will be added to the [parcels_benchmarks/benchmarks.json](./parcels_benchmarks/benchmarks.json) manifest file. Once updated, this file can be committed to this repository and contributed via a pull request.
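
For reference, each manifest entry records the dataset name, the ZIP file hosted on the SurfDrive, and its hash. An existing entry looks like this:
```
{
    "name": "FESOM-baroclinic-gyre",
    "file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
    "known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
}
```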

### Writing the benchmarks
This repository uses [ASV](https://asv.readthedocs.io/) for running benchmarks. You can add benchmarks by including a python script in the `benchmarks/` subdirectory. Within each `benchmarks/*.py` file, we ask that you define a class for the set of benchmarks you plan to run for your dataset. You can use the existing benchmarks as a good starting point for writing your benchmarks.

To learn more about writing benchmarks compatible with ASV, see the [ASV "Writing Benchmarks" documentation](https://asv.readthedocs.io/en/latest/writing_benchmarks.html).
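
As a rough starting point, here is a minimal sketch of what such a benchmark class could look like (the class name, parameter values, and placeholder timing body are illustrative only; the `MOi-curvilinear` dataset name comes from the manifest, and the real benchmarks in `benchmarks/` are more complete):
```
from parcels_benchmarks.benchmark_setup import PARCELS_DATADIR, download_example_dataset


class MOiSuite:
    # ASV runs each time_* method once per combination of these parameters
    params = [1_000, 10_000]
    param_names = ["npart"]

    def setup(self, npart):
        # Download the dataset on first use; later runs reuse the pooch cache
        self.datapath = download_example_dataset("MOi-curvilinear", data_home=PARCELS_DATADIR)

    def time_load_data(self, npart):
        # Placeholder body: a real benchmark opens the files and builds a
        # parcels FieldSet / ParticleSet here, as the existing benchmarks do
        _ = sorted(self.datapath.glob("*"))
```
ASV reports the wall-clock time of each `time_*` method for every parameter combination.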
13 changes: 2 additions & 11 deletions benchmarks/fesom2.py
@@ -3,15 +3,12 @@
from parcels import (
Field,
FieldSet,
Geographic,
GeographicPolar,
Particle,
ParticleSet,
UxGrid,
VectorField,
)
from parcels.kernels.advection import AdvectionEE
from parcels.interpolators import UxPiecewiseConstantFace
from parcels.kernels import AdvectionEE
from parcels_benchmarks.benchmark_setup import download_example_dataset, PARCELS_DATADIR

runtime=np.timedelta64(1, "D")
@@ -47,13 +44,7 @@ def time_load_data(self,npart,integrator):

def pset_execute(self,npart,integrator):
ds = _load_ds(self.datapath)
grid = UxGrid(ds.uxgrid, z=ds.coords["nz"], mesh="flat")
U = Field(name="U", data=ds.u, grid=grid, interp_method=UxPiecewiseConstantFace)
V = Field(name="V", data=ds.v, grid=grid, interp_method=UxPiecewiseConstantFace)
U.units = GeographicPolar()
V.units = Geographic()
UV = VectorField(name="UV", U=U, V=V)
fieldset = FieldSet([UV, UV.U, UV.V])
fieldset = FieldSet.from_fesom2(ds)
Member

Fine for now, but we should probably (soon) change this to a workflow

ds = convert.fesom2_to_ugrid(ds)
fieldset = FieldSet.from_ugrid_conventions(ds)

which aligns better with the other model ingestion codes?

Contributor Author

Agreed; when that exists upstream in parcels, we'll switch to that.


lon = np.linspace(2.0,15.0,npart)
lat = np.linspace(32.0,19.0,npart)
226 changes: 164 additions & 62 deletions parcels_benchmarks/benchmark_setup.py
@@ -3,60 +3,89 @@
import json
import os
from pathlib import Path
from typing import Any
import pooch
import sys
import xarray as xr
import typer


app = typer.Typer(add_completion=False)

PARCELS_DATADIR = os.getenv("PARCELS_DATADIR", default=None)
if PARCELS_DATADIR is not None:
PARCELS_DATADIR = Path(PARCELS_DATADIR)
# When modifying existing datasets in a backwards incompatible way,
# make a new release in the repo and update the DATA_REPO_TAG to the new tag
BENCHMARK_DATA = [
{
"name": "MOi-curvilinear",
"file": "Parcels_Benchmarks_MOi_data.zip",
"known_hash": "f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885"
},
{
"name": "FESOM-baroclinic-gyre",
"file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
"known_hash": "8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
}
]

DATA_URL = "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files="

DATA_FILES = {}
for data in BENCHMARK_DATA:
DATA_FILES[data["name"]] = data["file"]

def _create_pooch_registry() -> dict[str, None]:

DEFAULT_MANIFEST = Path(__file__).with_name("benchmarks.json")

def _load_manifest(path: Path) -> dict:
if not path.is_file():
raise FileNotFoundError(f"Manifest not found: {path}")
with path.open("r", encoding="utf-8") as f:
manifest = json.load(f)

if "datasets" not in manifest or not isinstance(manifest["datasets"], list):
raise ValueError("Manifest must contain a top-level 'datasets' list")

# Provide default URL if omitted
manifest.setdefault(
"data_url",
"https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=",
)
return manifest

def _save_manifest(path: Path, manifest: dict[str, Any]) -> None:
# keep stable ordering by dataset name
manifest["datasets"] = sorted(manifest["datasets"], key=lambda d: d.get("name", ""))
with path.open("w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2)
f.write("\n")

def _cache_dir(data_home: Path | None) -> Path:
if data_home is None:
return Path(pooch.os_cache("parcels-benchmarks"))
return Path(data_home)

def _datasets_by_name(manifest: dict) -> dict[str, dict]:
out: dict[str, dict] = {}
for d in manifest["datasets"]:
name = d.get("name")
file = d.get("file")
known_hash = d.get("known_hash")
if not name or not file:
raise ValueError(f"Each dataset needs at least 'name' and 'file': {d}")
if name in out:
raise ValueError(f"Duplicate dataset name in manifest: {name}")
out[name] = {
"name": name,
"file": file,
"known_hash": known_hash,
}
return out

def _create_pooch_registry(manifest: dict) -> dict[str, str | None]:
"""Collapses the mapping of dataset names to filenames into a pooch registry.

Hashes are taken from each dataset's known_hash entry and may be None if unknown.
"""
registry: dict[str, None] = {}
for data in BENCHMARK_DATA:
registry[data["file"]] = data["known_hash"]
registry: dict[str, str | None] = {}
for data in manifest["datasets"]:
registry[data["file"]] = data.get("known_hash")
return registry


POOCH_REGISTRY = _create_pooch_registry()

def _get_pooch(data_home=None):
if data_home is None:
data_home = pooch.os_cache("parcels-benchmarks")

data_home.parent.mkdir(exist_ok=True)
def _get_pooch(manifest: dict, data_home: Path | None = None) -> pooch.Pooch:
cache_dir = _cache_dir(data_home)
registry = _create_pooch_registry(manifest)
cache_dir.parent.mkdir(parents=True, exist_ok=True)
return pooch.create(
path=data_home,
base_url=DATA_URL,
registry=POOCH_REGISTRY,
path=cache_dir,
base_url=manifest["data_url"],
registry=registry,
)

def download_example_dataset(dataset: str, data_home=None):
"""Load an example dataset from the parcels website.
def download_example_dataset(dataset: str, manifest_path: Path = DEFAULT_MANIFEST, data_home: Path | None = None) -> Path:
"""Load an example dataset listed in the provided manifest

This function provides quick access to a small number of example datasets
that are useful in documentation and testing in parcels.
@@ -65,6 +94,8 @@ def download_example_dataset(dataset: str, data_home=None):
----------
dataset : str
Name of the dataset to load.
manifest_path : Path
Fully qualified path to a parcels-benchmarks manifest file.
data_home : pathlike, optional
The directory in which to cache data. If not specified, defaults to wherever
pooch.os_cache("parcels-benchmarks") goes on your system.
@@ -74,13 +105,17 @@
dataset_folder : Path
Path to the folder containing the downloaded dataset files.
"""
manifest = _load_manifest(manifest_path)
datasets = _datasets_by_name(manifest)

# Dev note: `dataset` is assumed to be a folder name with netcdf files
if dataset not in DATA_FILES:
if dataset not in datasets:
raise ValueError(
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(DATA_FILES.keys())
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(datasets.keys())
)
odie = _get_pooch(data_home=data_home)
listing = odie.fetch(DATA_FILES[dataset],processor=pooch.Unzip())
odie = _get_pooch(manifest, data_home=data_home)
zip_name = datasets[dataset]["file"]
listing = odie.fetch(zip_name, processor=pooch.Unzip())

# as pooch currently returns a file listing while we want a dir,
# we find the common parent dir to all files
@@ -89,33 +124,100 @@
return common_parent_dir


def download_datasets(data_home=None):
"""Download all datasets listed in the config file to the specified location.
@app.command("download-all")
def download_all(
manifest_path: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."),
) -> None:
"""Download all datasets listed in benchmarks manifest file."""

Parameters
----------
data_home : pathlike, optional
The directory in which to cache data. If not specified, defaults to wherever
pooch.os_cache("parcels-benchmarks") goes on your system.
manifest = _load_manifest(manifest_path)
datasets = _datasets_by_name(manifest)

Returns
-------
dataset_folders : dict
Mapping of dataset names to paths to the folder containing the downloaded dataset files.
"""
dataset_folders = {}
for dataset in DATA_FILES:
folder = download_example_dataset(dataset, data_home=data_home)
dataset_folders: dict[str, Path] = {}
for dataset_name in datasets.keys():
folder = download_example_dataset(dataset_name, manifest_path=manifest_path, data_home=data_home)
dataset_folders[dataset_name] = folder
return dataset_folders

@app.command("add-dataset")
def add_dataset(
name: str = typer.Option(..., help="New dataset name to add to the manifest."),
file: str = typer.Option(..., help="Zip filename available at data_url (e.g. Foo.zip)."),
manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."),
) -> None:
"""
Download a NEW dataset whose zip exists at data_url but is not yet in the manifest.

We assume the sha256 is unknown ahead of time:
- download with known_hash=None
- compute sha256 of the downloaded zip
- append {name,file,known_hash} to the manifest
"""
m = _load_manifest(manifest)
datasets = _datasets_by_name(m)

if name in datasets:
raise typer.BadParameter(f"Dataset {name!r} already exists in manifest.")

# Also prevent duplicates by file
existing_files = {d.get("file") for d in m["datasets"]}
if file in existing_files:
raise typer.BadParameter(f"File {file!r} is already referenced in the manifest.")

base_url = m["data_url"]
cache_dir = _cache_dir(data_home)
url = f"{base_url}{file}"
cache_dir.mkdir(parents=True, exist_ok=True)

typer.echo(f"Downloading (no hash verification): {url}")
# Download the zip WITHOUT verifying hash.
result = pooch.retrieve(
url=url,
known_hash=None,
path=cache_dir,
processor=None,
)
typer.echo(f" Downloaded zip -> {Path(result)}")

digest = pooch.file_hash(Path(result))
known_hash = f"sha256:{digest}"
typer.echo(f" Computed {known_hash}")

typer.echo("Unzipping...")
result = pooch.retrieve(
url=url,
known_hash=known_hash,
path=cache_dir,
processor=pooch.Unzip(),
)
files = [Path(p) for p in result]
common_parent_dir = min(files, key=lambda p: len(p.parents)).parent
typer.echo(f" Unzipped -> {common_parent_dir}")

# Append to manifest
m["datasets"].append({"name": name, "file": file, "known_hash": known_hash})
_save_manifest(manifest, m)
typer.echo(f"Added {name!r} to {manifest}")

@app.command("list")
def list_datasets(
manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
) -> None:
"""
List datasets in the manifest.
"""
m = _load_manifest(manifest)
by_name = _datasets_by_name(m)
for name, entry in sorted(by_name.items(), key=lambda kv: kv[0]):
typer.echo(f"{name}: {entry['file']} ({entry.get('known_hash', 'no-hash')})")


def main() -> None:
app()

def main(argv=None) -> int:
folders = download_datasets(data_home=PARCELS_DATADIR)
print("Downloaded datasets:")
for name, folder in folders.items():
print(f" {name}: {folder}")

if __name__ == "__main__":
raise main()
main()

20 changes: 20 additions & 0 deletions parcels_benchmarks/benchmarks.json
@@ -0,0 +1,20 @@
{
"data_url": "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=",
"datasets": [
{
"name": "FESOM-baroclinic-gyre",
"file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
"known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
},
{
"name": "Global ICON Data",
"file": "Parcels_Benchmarks_ICON.zip",
"known_hash": "sha256:06e80941050d16b89ddce758fb6a1030e3facaba9d32bf9085f1e1e497731903"
},
{
"name": "MOi-curvilinear",
"file": "Parcels_Benchmarks_MOi_data.zip",
"known_hash": "sha256:f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885"
}
]
}
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -11,12 +11,22 @@ authors = [{ name = "Parcels team" }]
dependencies = [
"xarray",
"pooch",
"typer>=0.9",
]

# This creates an installable CLI command: `benchmark-setup ...`
[project.scripts]
benchmark-setup = "parcels_benchmarks.benchmark_setup:main"
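# Example invocations once the package is installed (subcommands are defined in
# benchmark_setup.py; the dataset name and ZIP filename below are placeholders):
#   benchmark-setup list
#   benchmark-setup download-all
#   benchmark-setup add-dataset --name "My dataset" --file "My_dataset.zip"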


[tool.setuptools.packages.find]
where = ["."]
include = ["parcels_benchmarks"]

# Include the benchmarks manifest in the wheel/sdist
[tool.setuptools.package-data]
parcels_benchmarks = ["*.json"]


[tool.ruff.lint]
select = [