14 changes: 7 additions & 7 deletions README.md
@@ -55,17 +55,17 @@ git push origin main
## Adding benchmarks
Adding benchmarks for parcels typically involves adding a dataset and defining the benchmarks you want to run.

Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive.
Once your data is hosted, you can add an entry to the `parcels_benchmarks.benchmark_setup.DATA_FILES` list. Each entry has the following attributes

### Adding new data
Data is hosted remotely on a SurfDrive managed by the Parcels developers. You will need to open an issue on this repository to start the process of getting your data hosted in the shared SurfDrive.
Once your data is hosted in the shared SurfDrive, you can add your dataset to the benchmark dataset manifest by running:
```
{
"name": str # Name of the dataset that you can reference in the benchmarks
"file": str, # Path, relative to the data_url, to the .zip file containing the benchmark data
"known_hash": str | None # Pooch hash of the zip file; set to None if it is unknown
},
pixi run benchmark-setup add-dataset --name "Name for your dataset" --file "Path to ZIP archive in the SurfDrive"
```

During this process, the dataset will be downloaded and a complete entry will be added to the [parcels_benchmarks/benchmarks.json](./parcels_benchmarks/benchmarks.json) manifest file. Once updated, this file can be committed to this repository and contributed via a pull request.
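
For reference, each manifest entry records the dataset name, the ZIP file hosted on the SurfDrive, and its hash. An existing entry looks like this:
```
{
    "name": "FESOM-baroclinic-gyre",
    "file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
    "known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
}
```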

### Writing the benchmarks
This repository uses [ASV](https://asv.readthedocs.io/) for running benchmarks. You can add benchmarks by including a python script in the `benchmarks/` subdirectory. Within each `benchmarks/*.py` file, we ask that you define a class for the set of benchmarks you plan to run for your dataset. You can use the existing benchmarks as a good starting point for writing your benchmarks.

To learn more about writing benchmarks compatible with ASV, see the [ASV "Writing Benchmarks" documentation](https://asv.readthedocs.io/en/latest/writing_benchmarks.html).
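
As a rough starting point, here is a minimal sketch of what such a benchmark class could look like (the class name, parameter values, and placeholder timing body are illustrative only; the `MOi-curvilinear` dataset name comes from the manifest, and the real benchmarks in `benchmarks/` are more complete):
```
from parcels_benchmarks.benchmark_setup import PARCELS_DATADIR, download_example_dataset


class MOiSuite:
    # ASV runs each time_* method once per combination of these parameters
    params = [1_000, 10_000]
    param_names = ["npart"]

    def setup(self, npart):
        # Download the dataset on first use; later runs reuse the pooch cache
        self.datapath = download_example_dataset("MOi-curvilinear", data_home=PARCELS_DATADIR)

    def time_load_data(self, npart):
        # Placeholder body: a real benchmark opens the files and builds a
        # parcels FieldSet / ParticleSet here, as the existing benchmarks do
        _ = sorted(self.datapath.glob("*"))
```
ASV reports the wall-clock time of each `time_*` method for every parameter combination.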
13 changes: 2 additions & 11 deletions benchmarks/fesom2.py
@@ -3,15 +3,12 @@
from parcels import (
Field,
FieldSet,
Geographic,
GeographicPolar,
Particle,
ParticleSet,
UxGrid,
VectorField,
)
from parcels.kernels.advection import AdvectionEE
from parcels.interpolators import UxPiecewiseConstantFace
from parcels.kernels import AdvectionEE
from parcels_benchmarks.benchmark_setup import download_example_dataset, PARCELS_DATADIR

runtime=np.timedelta64(1, "D")
@@ -47,13 +44,7 @@ def time_load_data(self,npart,integrator):

def pset_execute(self,npart,integrator):
ds = _load_ds(self.datapath)
grid = UxGrid(ds.uxgrid, z=ds.coords["nz"], mesh="flat")
U = Field(name="U", data=ds.u, grid=grid, interp_method=UxPiecewiseConstantFace)
V = Field(name="V", data=ds.v, grid=grid, interp_method=UxPiecewiseConstantFace)
U.units = GeographicPolar()
V.units = Geographic()
UV = VectorField(name="UV", U=U, V=V)
fieldset = FieldSet([UV, UV.U, UV.V])
fieldset = FieldSet.from_fesom2(ds)
Member

Fine for now, but we should probably (soon) change this to a workflow

ds = convert.fesom2_to_ugrid(ds)
fieldset = FieldSet.from_ugrid_conventions(ds)

which aligns better with the other model ingestion codes?

Contributor Author

Agreed; when that exists upstream in parcels, we'll switch to that.


lon = np.linspace(2.0,15.0,npart)
lat = np.linspace(32.0,19.0,npart)
226 changes: 164 additions & 62 deletions parcels_benchmarks/benchmark_setup.py
@@ -3,60 +3,89 @@
import json
import os
from pathlib import Path
from typing import Any
import pooch
import sys
import xarray as xr
import typer


app = typer.Typer(add_completion=False)

PARCELS_DATADIR = os.getenv("PARCELS_DATADIR", default=None)
if PARCELS_DATADIR is not None:
PARCELS_DATADIR = Path(PARCELS_DATADIR)
# When modifying existing datasets in a backwards incompatible way,
# make a new release in the repo and update the DATA_REPO_TAG to the new tag
BENCHMARK_DATA = [
{
"name": "MOi-curvilinear",
"file": "Parcels_Benchmarks_MOi_data.zip",
"known_hash": "f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885"
},
{
"name": "FESOM-baroclinic-gyre",
"file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
"known_hash": "8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
}
]

DATA_URL = "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files="

DATA_FILES = {}
for data in BENCHMARK_DATA:
DATA_FILES[data["name"]] = data["file"]

def _create_pooch_registry() -> dict[str, None]:

DEFAULT_MANIFEST = Path(__file__).with_name("benchmarks.json")

def _load_manifest(path: Path) -> dict:
if not path.is_file():
raise FileNotFoundError(f"Manifest not found: {path}")
with path.open("r", encoding="utf-8") as f:
manifest = json.load(f)

if "datasets" not in manifest or not isinstance(manifest["datasets"], list):
raise ValueError("Manifest must contain a top-level 'datasets' list")

# Provide default URL if omitted
manifest.setdefault(
"data_url",
"https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=",
)
return manifest

def _save_manifest(path: Path, manifest: dict[str, Any]) -> None:
# keep stable ordering by dataset name
manifest["datasets"] = sorted(manifest["datasets"], key=lambda d: d.get("name", ""))
with path.open("w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2)
f.write("\n")

def _cache_dir(data_home: Path | None) -> Path:
if data_home is None:
return Path(pooch.os_cache("parcels-benchmarks"))
return Path(data_home)

def _datasets_by_name(manifest: dict) -> dict[str, dict]:
out: dict[str, dict] = {}
for d in manifest["datasets"]:
name = d.get("name")
file = d.get("file")
known_hash = d.get("known_hash")
if not name or not file:
raise ValueError(f"Each dataset needs at least 'name' and 'file': {d}")
if name in out:
raise ValueError(f"Duplicate dataset name in manifest: {name}")
out[name] = {
"name": name,
"file": file,
"known_hash": known_hash,
}
return out

def _create_pooch_registry(manifest: dict) -> dict[str, str | None]:
"""Collapses the mapping of dataset names to filenames into a pooch registry.

Hashes are taken from each dataset's known_hash entry and may be None if unknown.
"""
registry: dict[str, None] = {}
for data in BENCHMARK_DATA:
registry[data["file"]] = data["known_hash"]
registry: dict[str, str | None] = {}
for data in manifest["datasets"]:
registry[data["file"]] = data.get("known_hash")
return registry


POOCH_REGISTRY = _create_pooch_registry()

def _get_pooch(data_home=None):
if data_home is None:
data_home = pooch.os_cache("parcels-benchmarks")

data_home.parent.mkdir(exist_ok=True)
def _get_pooch(manifest: dict, data_home: Path | None = None) -> pooch.Pooch:
cache_dir = _cache_dir(data_home)
registry = _create_pooch_registry(manifest)
cache_dir.parent.mkdir(parents=True, exist_ok=True)
return pooch.create(
path=data_home,
base_url=DATA_URL,
registry=POOCH_REGISTRY,
path=cache_dir,
base_url=manifest["data_url"],
registry=registry,
)

def download_example_dataset(dataset: str, data_home=None):
"""Load an example dataset from the parcels website.
def download_example_dataset(dataset: str, manifest_path: Path = DEFAULT_MANIFEST, data_home: Path | None = None) -> Path:
"""Load an example dataset listed in the provided manifest

This function provides quick access to a small number of example datasets
that are useful in documentation and testing in parcels.
@@ -65,6 +94,8 @@ def download_example_dataset(dataset: str, data_home=None):
----------
dataset : str
Name of the dataset to load.
manifest_path : Path
Fully qualified path to a parcels-benchmarks manifest file.
data_home : pathlike, optional
The directory in which to cache data. If not specified, defaults to wherever
pooch.os_cache("parcels-benchmarks") goes on your system.
@@ -74,13 +105,17 @@
dataset_folder : Path
Path to the folder containing the downloaded dataset files.
"""
manifest = _load_manifest(manifest_path)
datasets = _datasets_by_name(manifest)

# Dev note: `dataset` is assumed to be a folder name with netcdf files
if dataset not in DATA_FILES:
if dataset not in datasets:
raise ValueError(
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(DATA_FILES.keys())
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(datasets.keys())
)
odie = _get_pooch(data_home=data_home)
listing = odie.fetch(DATA_FILES[dataset],processor=pooch.Unzip())
odie = _get_pooch(manifest, data_home=data_home)
zip_name = datasets[dataset]["file"]
listing = odie.fetch(zip_name, processor=pooch.Unzip())

# as pooch currently returns a file listing while we want a dir,
# we find the common parent dir to all files
@@ -89,33 +124,100 @@
return common_parent_dir


def download_datasets(data_home=None):
"""Download all datasets listed in the config file to the specified location.
@app.command("download-all")
def download_all(
manifest_path: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."),
) -> None:
"""Download all datasets listed in benchmarks manifest file."""

Parameters
----------
data_home : pathlike, optional
The directory in which to cache data. If not specified, defaults to wherever
pooch.os_cache("parcels-benchmarks") goes on your system.
manifest = _load_manifest(manifest_path)
datasets = _datasets_by_name(manifest)

Returns
-------
dataset_folders : dict
Mapping of dataset names to paths to the folder containing the downloaded dataset files.
"""
dataset_folders = {}
for dataset in DATA_FILES:
folder = download_example_dataset(dataset, data_home=data_home)
dataset_folders: dict[str, Path] = {}
for dataset_name in datasets.keys():
folder = download_example_dataset(dataset_name, manifest_path=manifest_path, data_home=data_home)
dataset_folders[dataset_name] = folder
return dataset_folders

@app.command("add-dataset")
def add_dataset(
name: str = typer.Option(..., help="New dataset name to add to the manifest."),
file: str = typer.Option(..., help="Zip filename available at data_url (e.g. Foo.zip)."),
manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
data_home: Path | None = typer.Option(PARCELS_DATADIR, help="Override cache directory."),
) -> None:
"""
Download a NEW dataset whose zip exists at data_url but is not yet in the manifest.

We assume the sha256 is unknown ahead of time:
- download with known_hash=None
- compute sha256 of the downloaded zip
- append {name,file,known_hash} to the manifest
"""
m = _load_manifest(manifest)
datasets = _datasets_by_name(m)

if name in datasets:
raise typer.BadParameter(f"Dataset {name!r} already exists in manifest.")

# Also prevent duplicates by file
existing_files = {d.get("file") for d in m["datasets"]}
if file in existing_files:
raise typer.BadParameter(f"File {file!r} is already referenced in the manifest.")

base_url = m["data_url"]
cache_dir = _cache_dir(data_home)
url = f"{base_url}{file}"
cache_dir.mkdir(parents=True, exist_ok=True)

typer.echo(f"Downloading (no hash verification): {url}")
# Download the zip WITHOUT verifying hash.
result = pooch.retrieve(
url=url,
known_hash=None,
path=cache_dir,
processor=None,
)
typer.echo(f" Downloaded zip -> {Path(result)}")

digest = pooch.file_hash(Path(result))
known_hash = f"sha256:{digest}"
typer.echo(f" Computed {known_hash}")

typer.echo("Unzipping...")
result = pooch.retrieve(
url=url,
known_hash=known_hash,
path=cache_dir,
processor=pooch.Unzip(),
)
files = [Path(p) for p in result]
common_parent_dir = min(files, key=lambda p: len(p.parents)).parent
typer.echo(f" Unzipped -> {common_parent_dir}")

# Append to manifest
m["datasets"].append({"name": name, "file": file, "known_hash": known_hash})
_save_manifest(manifest, m)
typer.echo(f"Added {name!r} to {manifest}")

@app.command("list")
def list_datasets(
manifest: Path = typer.Option(DEFAULT_MANIFEST, help="Path to benchmarks manifest JSON."),
) -> None:
"""
List datasets in the manifest.
"""
m = _load_manifest(manifest)
by_name = _datasets_by_name(m)
for name, entry in sorted(by_name.items(), key=lambda kv: kv[0]):
typer.echo(f"{name}: {entry['file']} ({entry.get('known_hash', 'no-hash')})")


def main() -> None:
app()

def main(argv=None) -> int:
folders = download_datasets(data_home=PARCELS_DATADIR)
print("Downloaded datasets:")
for name, folder in folders.items():
print(f" {name}: {folder}")

if __name__ == "__main__":
raise main()
main()

20 changes: 20 additions & 0 deletions parcels_benchmarks/benchmarks.json
@@ -0,0 +1,20 @@
{
"data_url": "https://surfdrive.surf.nl/index.php/s/7xlfdOFaUGDEmpD/download?path=%2F&files=",
"datasets": [
{
"name": "FESOM-baroclinic-gyre",
"file": "Parcels_Benchmarks_FESOM-baroclinic-gyre_v2025.10.2.2.zip",
"known_hash": "sha256:8d849df2996e3cecf95344e6cde6ed873919d33d731b5fbed4ecacf1a57fbce3"
},
{
"name": "Global ICON Data",
"file": "Parcels_Benchmarks_ICON.zip",
"known_hash": "sha256:06e80941050d16b89ddce758fb6a1030e3facaba9d32bf9085f1e1e497731903"
},
{
"name": "MOi-curvilinear",
"file": "Parcels_Benchmarks_MOi_data.zip",
"known_hash": "sha256:f7816d872897c089eeb07a4e32b7fbcc96a0023ef01ac6c3792f88d8d8893885"
}
]
}
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -11,12 +11,22 @@ authors = [{ name = "Parcels team" }]
dependencies = [
"xarray",
"pooch",
"typer>=0.9",
]

# This creates an installable CLI command: `benchmark-setup ...`
[project.scripts]
benchmark-setup = "parcels_benchmarks.benchmark_setup:main"
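# Example invocations once the package is installed (subcommands are defined in
# benchmark_setup.py; the dataset name and ZIP filename below are placeholders):
#   benchmark-setup list
#   benchmark-setup download-all
#   benchmark-setup add-dataset --name "My dataset" --file "My_dataset.zip"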


[tool.setuptools.packages.find]
where = ["."]
include = ["parcels_benchmarks"]

# Include the benchmarks manifest in the wheel/sdist
[tool.setuptools.package-data]
parcels_benchmarks = ["*.json"]


[tool.ruff.lint]
select = [