diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 5855aef6c..5ace4600a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -4,4 +4,3 @@ updates: directory: "/" schedule: interval: "weekly" - target-branch: "devel" diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index cdaedc943..d91d0d0d0 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -9,7 +9,7 @@ jobs: if: ${{ github.repository_owner == 'deepmodeling' }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: diff --git a/.github/workflows/pyright.yml b/.github/workflows/pyright.yml index 8855438d1..9fbc470b8 100644 --- a/.github/workflows/pyright.yml +++ b/.github/workflows/pyright.yml @@ -13,7 +13,7 @@ jobs: with: python-version: '3.12' - run: pip install uv - - run: uv pip install --system -e .[amber,ase,pymatgen] 'rdkit<2025.3.3' openbabel-wheel - - uses: jakebailey/pyright-action@v2 + - run: uv pip install --system -e .[amber,ase,pymatgen] rdkit openbabel-wheel + - uses: jakebailey/pyright-action@v3 with: version: 1.1.363 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6398f3d19..fdb20593c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,8 +1,8 @@ name: Python package on: - - push - - pull_request +- push +- pull_request jobs: build: @@ -12,7 +12,7 @@ jobs: python-version: ["3.8", "3.12"] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 # set up conda - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 @@ -26,13 +26,15 @@ jobs: **/pyproject.toml cache-suffix: "py${{ matrix.python-version }}" - name: Install dependencies - run: uv pip install --system .[test,amber,ase,pymatgen] coverage ./tests/plugin 'rdkit<2025.3.3' openbabel-wheel 'numpy<2.3' + run: uv pip install --system .[test,amber,ase,pymatgen] coverage 
./tests/plugin rdkit openbabel-wheel - name: Test run: cd tests && coverage run --source=../dpdata -m unittest && cd .. && coverage combine tests/.coverage && coverage report - name: Run codecov uses: codecov/codecov-action@v5 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + with: + use_oidc: true pass: needs: [build] runs-on: ubuntu-latest @@ -42,3 +44,5 @@ jobs: uses: re-actors/alls-green@release/v1 with: jobs: ${{ toJSON(needs) }} +permissions: + id-token: write diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml index 8fad209f0..178f1c9e7 100644 --- a/.github/workflows/test_import.yml +++ b/.github/workflows/test_import.yml @@ -8,7 +8,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: '3.9' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d57d3235..07a6fddba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: # Python - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: v0.14.1 + rev: v0.15.2 hooks: - id: ruff args: ["--fix"] @@ -34,10 +34,18 @@ repos: hooks: - id: velin args: ["--write"] -# Python inside docs -- repo: https://github.com/asottile/blacken-docs - rev: 1.20.0 +# markdown +- repo: https://github.com/hukkin/mdformat + rev: 1.0.0 hooks: - - id: blacken-docs -ci: - autoupdate_branch: devel + - id: mdformat + exclude: "^tests/.*$" + additional_dependencies: + # - mdformat-myst==0.3.0 + # See https://github.com/executablebooks/mdformat-myst/issues/13 + - "git+https://github.com/njzjz-bothub/mdformat-myst@d9c414e#egg=mdformat-myst" + - mdformat-ruff==0.1.3 + - mdformat-web==0.2.0 + - mdformat-config==0.2.1 + - mdformat-beautysh==1.0.0 + - mdformat-gfm-alerts==2.0.0 diff --git a/AGENTS.md b/AGENTS.md index 6477b84ba..19d633b99 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,22 +7,26 @@ Always reference these instructions first and fallback to search or bash command ## Working Effectively - **Bootstrap and install the repository:** + - `cd /home/runner/work/dpdata/dpdata` (or wherever the repo is cloned) - `uv pip install -e .` -- installs dpdata in development mode with core dependencies (numpy, scipy, h5py, monty, wcmatch) - Test installation: `dpdata --version` -- should show version like "dpdata v0.1.dev2+..." - **Run tests:** + - `cd tests && python -m unittest discover` -- runs all 1826 tests in ~10 seconds. NEVER CANCEL. 
- `cd tests && python -m unittest test_.py` -- run specific test modules (individual modules take ~0.5 seconds) - `cd tests && coverage run --source=../dpdata -m unittest discover && coverage report` -- run tests with coverage - **Linting and formatting:** + - Install ruff: `uv pip install ruff` - `ruff check dpdata/` -- lint the main package (takes ~1 second) - `ruff format dpdata/` -- format code according to project style - `ruff check --fix dpdata/` -- auto-fix linting issues where possible - **Pre-commit hooks:** + - Install: `uv pip install pre-commit` - `pre-commit run --all-files` -- run all hooks on all files - Hooks include: ruff linting/formatting, trailing whitespace, end-of-file-fixer, yaml/json/toml checks @@ -30,15 +34,18 @@ Always reference these instructions first and fallback to search or bash command ## Validation - **Always test CLI functionality after making changes:** + - `dpdata --help` -- ensure CLI still works - `dpdata --version` -- verify version is correct - Test a basic conversion if sample data is available - **Always run linting before committing:** + - `ruff check dpdata/` -- ensure no new linting errors - `ruff format dpdata/` -- ensure code is properly formatted - **Run relevant tests for your changes:** + - For format-specific changes: `cd tests && python -m unittest test_*.py` - For core system changes: `cd tests && python -m unittest test_system*.py test_multisystems.py` - For CLI changes: `cd tests && python -m unittest test_cli.py` (if exists) @@ -46,12 +53,14 @@ Always reference these instructions first and fallback to search or bash command ## Build and Documentation - **Documentation:** + - `cd docs && make help` -- see all available build targets - `cd docs && make html` -- build HTML documentation (requires additional dependencies) - Documentation source is in `docs/` directory using Sphinx - **NOTE:** Full docs build requires additional dependencies like `deepmodeling-sphinx` that may not be readily available - **Package 
building:** + - Uses setuptools with pyproject.toml configuration - `uv pip install build && python -m build` -- create source and wheel distributions - Version is managed by setuptools_scm from git tags @@ -61,6 +70,7 @@ Always reference these instructions first and fallback to search or bash command The following are outputs from frequently run commands. Reference them instead of re-running to save time. ### Repository structure + ``` /home/runner/work/dpdata/dpdata/ ├── dpdata/ # Main package code @@ -82,6 +92,7 @@ The following are outputs from frequently run commands. Reference them instead o ``` ### Key dependencies + - Core: numpy>=1.14.3, scipy, h5py, monty, wcmatch - Optional: ase (ASE integration), parmed (AMBER), pymatgen (Materials Project), rdkit (molecular analysis) - Testing: unittest (built-in), coverage @@ -89,25 +100,30 @@ The following are outputs from frequently run commands. Reference them instead o - Docs: sphinx with various extensions ### Test timing expectations + - Full test suite: ~10 seconds (1826 tests). NEVER CANCEL. - Individual test modules: ~0.5 seconds - Linting with ruff: ~1 second - Documentation build: ~30 seconds ### Common workflows + 1. **Adding a new format:** + - Create module in `dpdata//` - Implement format classes inheriting from appropriate base classes - Add tests in `tests/test_*.py` - Register format in the plugin system -2. **Fixing bugs:** +1. **Fixing bugs:** + - Write test that reproduces the bug first - Make minimal fix to pass the test - Run full test suite to ensure no regressions - Run linting to ensure code style compliance -3. **CLI changes:** +1. **CLI changes:** + - Modify `dpdata/cli.py` - Test with `dpdata --help` and specific commands - Add/update tests if needed @@ -115,6 +131,7 @@ The following are outputs from frequently run commands. Reference them instead o ## Troubleshooting - **Installation timeouts:** Network timeouts during `uv pip install` are common. 
If this occurs, try: + - Individual package installation: `uv pip install numpy scipy h5py monty wcmatch` - Use `--timeout` option: `uv pip install --timeout 300 -e .` - Verify existing installation works: `dpdata --version` should work even if reinstall fails diff --git a/docs/systems/bond_order_system.md b/docs/systems/bond_order_system.md index 8d92d7249..7e786845d 100644 --- a/docs/systems/bond_order_system.md +++ b/docs/systems/bond_order_system.md @@ -1,6 +1,7 @@ - ## BondOrderSystem + A new class {class}`BondOrderSystem ` which inherits from class {class}`System ` is introduced in dpdata. This new class contains information of chemical bonds and formal charges (stored in `BondOrderSystem.data['bonds']`, `BondOrderSystem.data['formal_charges']`). Now BondOrderSystem can only read from .mol/.sdf formats, because of its dependency on rdkit (which means rdkit must be installed if you want to use this function). Other formats, such as pdb, must be converted to .mol/.sdf format (maybe with software like open babel). + ```python import dpdata @@ -11,8 +12,10 @@ system_2 = dpdata.BondOrderSystem( "tests/bond_order/methane.sdf", fmt="sdf" ) # read from .sdf file ``` + In sdf file, all molecules must be of the same topology (i.e. conformers of the same molecular configuration). `BondOrderSystem ` also supports initialize from a {class}`rdkit.Chem.rdchem.Mol` object directly. + ```python from rdkit import Chem from rdkit.Chem import AllChem @@ -25,12 +28,14 @@ system = dpdata.BondOrderSystem(rdkit_mol=mol) ``` ### Bond Order Assignment + The {class}`BondOrderSystem ` implements a more robust sanitize procedure for rdkit Mol, as defined in {class}`dpdata.rdkit.santizie.Sanitizer`. This class defines 3 level of sanitization process by: low, medium and high. (default is medium). -+ low: use `rdkit.Chem.SanitizeMol()` function to sanitize molecule. 
-+ medium: before using rdkit, the programm will first assign formal charge of each atom to avoid inappropriate valence exceptions. However, this mode requires the rightness of the bond order information in the given molecule. -+ high: the program will try to fix inappropriate bond orders in aromatic hetreocycles, phosphate, sulfate, carboxyl, nitro, nitrine, guanidine groups. If this procedure fails to sanitize the given molecule, the program will then try to call `obabel` to pre-process the mol and repeat the sanitization procedure. **That is to say, if you wan't to use this level of sanitization, please ensure `obabel` is installed in the environment.** -According to our test, our sanitization procedure can successfully read 4852 small molecules in the PDBBind-refined-set. It is necessary to point out that the in the molecule file (mol/sdf), the number of explicit hydrogens has to be correct. Thus, we recommend to use - `obabel xxx -O xxx -h` to pre-process the file. The reason why we do not implement this hydrogen-adding procedure in dpdata is that we can not ensure its correctness. + +- low: use `rdkit.Chem.SanitizeMol()` function to sanitize molecule. +- medium: before using rdkit, the program will first assign formal charge of each atom to avoid inappropriate valence exceptions. However, this mode requires the rightness of the bond order information in the given molecule. +- high: the program will try to fix inappropriate bond orders in aromatic heterocycles, phosphate, sulfate, carboxyl, nitro, nitrine, guanidine groups. If this procedure fails to sanitize the given molecule, the program will then try to call `obabel` to pre-process the mol and repeat the sanitization procedure. **That is to say, if you want to use this level of sanitization, please ensure `obabel` is installed in the environment.** + According to our test, our sanitization procedure can successfully read 4852 small molecules in the PDBBind-refined-set. 
It is necessary to point out that, in the molecule file (mol/sdf), the number of explicit hydrogens has to be correct. Thus, we recommend to use + `obabel xxx -O xxx -h` to pre-process the file. The reason why we do not implement this hydrogen-adding procedure in dpdata is that we can not ensure its correctness. ```python import dpdata @@ -38,8 +43,11 @@ import dpdata for sdf_file in glob.glob("bond_order/refined-set-ligands/obabel/*sdf"): syst = dpdata.BondOrderSystem(sdf_file, sanitize_level="high", verbose=False) ``` + ### Formal Charge Assignment + BondOrderSystem implement a method to assign formal charge for each atom based on the 8-electron rule (see below). Note that it only supports common elements in bio-system: B,C,N,O,P,S,As + ```python import dpdata diff --git a/docs/systems/mixed.md b/docs/systems/mixed.md index 25837ea6e..6052c38b3 100644 --- a/docs/systems/mixed.md +++ b/docs/systems/mixed.md @@ -10,6 +10,7 @@ This also helps to mixture the type information together for model training with Here are examples using `deepmd/npy/mixed` format: - Dump a MultiSystems into a mixed type numpy directory: + ```python import dpdata @@ -17,6 +18,7 @@ dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") ``` - Load a mixed type data into a MultiSystems: + ```python import dpdata diff --git a/docs/systems/multi.md b/docs/systems/multi.md index 20551c7e8..c8033ca89 100644 --- a/docs/systems/multi.md +++ b/docs/systems/multi.md @@ -1,15 +1,16 @@ # `MultiSystems` -The Class {class}`dpdata.MultiSystems` can read data from a dir which may contains many files of different systems, or from single xyz file which contains different systems. +The Class {class}`dpdata.MultiSystems` can read data from a dir which may contain many files of different systems, or from a single xyz file which contains different systems. 
-Use {meth}`dpdata.MultiSystems.from_dir` to read from a directory, {class}`dpdata.MultiSystems` will walk in the directory -Recursively and find all file with specific file_name. Supports all the file formats that {class}`dpdata.LabeledSystem` supports. +Use {meth}`dpdata.MultiSystems.from_dir` to read from a directory, {class}`dpdata.MultiSystems` will walk in the directory +recursively and find all files with specific file_name. Supports all the file formats that {class}`dpdata.LabeledSystem` supports. Use {meth}`dpdata.MultiSystems.from_file` to read from single file. Single-file support is available for the `quip/gap/xyz` and `ase/structure` formats. For example, for `quip/gap xyz` files, single .xyz file may contain many different configurations with different atom numbers and atom type. The following commands relating to {class}`dpdata.MultiSystems` may be useful. + ```python # load data @@ -40,6 +41,7 @@ xyz_multi_systems.to_deepmd_raw("./my_deepmd_data/") ``` You may also use the following code to parse muti-system: + ```python from dpdata import LabeledSystem, MultiSystems from glob import glob diff --git a/docs/systems/system.md b/docs/systems/system.md index 9f01fc40f..9d8684916 100644 --- a/docs/systems/system.md +++ b/docs/systems/system.md @@ -1,84 +1,103 @@ # `System` and `LabeledSystem` This section gives some examples on how dpdata works. Firstly one needs to import the module in a python 3.x compatible code. + ```python import dpdata ``` + The typicall workflow of `dpdata` is 1. Load data from vasp or lammps or deepmd-kit data files. -2. Manipulate data -3. Dump data to in a desired format - +1. Manipulate data +1. 
Dump data in a desired format ### Load data + ```python d_poscar = dpdata.System("POSCAR", fmt="vasp/poscar") ``` + or let dpdata infer the format (`vasp/poscar`) of the file from the file name extension + ```python d_poscar = dpdata.System("my.POSCAR") ``` + The number of atoms, atom types, coordinates are loaded from the `POSCAR` and stored to a data {class}`System ` called `d_poscar`. A data {class}`System ` (a concept used by [deepmd-kit](https://github.com/deepmodeling/deepmd-kit)) contains frames that has the same number of atoms of the same type. The order of the atoms should be consistent among the frames in one {class}`System `. It is noted that `POSCAR` only contains one frame. If the multiple frames stored in, for example, a `OUTCAR` is wanted, + ```python d_outcar = dpdata.LabeledSystem("OUTCAR") ``` + The labels provided in the `OUTCAR`, i.e. energies, forces and virials (if any), are loaded by {class}`LabeledSystem `. It is noted that the forces of atoms are always assumed to exist. {class}`LabeledSystem ` is a derived class of {class}`System `. The {class}`System ` or {class}`LabeledSystem ` can be constructed from the [supported file formats](../formats.rst) with the `format key` in the table passed to argument `fmt`. 
- - ### Access data + These properties stored in {class}`System ` and {class}`LabeledSystem ` can be accessed by operator `[]` with the key of the property supplied, for example + ```python coords = d_outcar["coords"] ``` -Available properties are (nframe: number of frames in the system, natoms: total number of atoms in the system) -| key | type | dimension | are labels | description -| --- | --- | --- | --- | --- -| 'atom_names' | list of str | ntypes | False | The name of each atom type -| 'atom_numbs' | list of int | ntypes | False | The number of atoms of each atom type -| 'atom_types' | np.ndarray | natoms | False | Array assigning type to each atom -| 'cells' | np.ndarray | nframes x 3 x 3 | False | The cell tensor of each frame -| 'coords' | np.ndarray | nframes x natoms x 3 | False | The atom coordinates -| 'energies' | np.ndarray | nframes | True | The frame energies -| 'forces' | np.ndarray | nframes x natoms x 3 | True | The atom forces -| 'virials' | np.ndarray | nframes x 3 x 3 | True | The virial tensor of each frame +Available properties are (nframe: number of frames in the system, natoms: total number of atoms in the system) +| key | type | dimension | are labels | description | +| ------------ | ----------- | -------------------- | ---------- | ------------------------------------- | +| 'atom_names' | list of str | ntypes | False | The name of each atom type | +| 'atom_numbs' | list of int | ntypes | False | The number of atoms of each atom type | +| 'atom_types' | np.ndarray | natoms | False | Array assigning type to each atom | +| 'cells' | np.ndarray | nframes x 3 x 3 | False | The cell tensor of each frame | +| 'coords' | np.ndarray | nframes x natoms x 3 | False | The atom coordinates | +| 'energies' | np.ndarray | nframes | True | The frame energies | +| 'forces' | np.ndarray | nframes x natoms x 3 | True | The atom forces | +| 'virials' | np.ndarray | nframes x 3 x 3 | True | The virial tensor of each frame | ### Dump data + The data stored 
in {class}`System ` or {class}`LabeledSystem ` can be dumped in 'lammps/lmp' or 'vasp/poscar' format, for example: + ```python d_outcar.to("lammps/lmp", "conf.lmp", frame_idx=0) ``` + The first frames of `d_outcar` will be dumped to 'conf.lmp' + ```python d_outcar.to("vasp/poscar", "POSCAR", frame_idx=-1) ``` + The last frames of `d_outcar` will be dumped to 'POSCAR'. The data stored in `LabeledSystem` can be dumped to deepmd-kit raw format, for example + ```python d_outcar.to("deepmd/raw", "dpmd_raw") ``` + Or a simpler command: + ```python dpdata.LabeledSystem("OUTCAR").to("deepmd/raw", "dpmd_raw") ``` + Frame selection can be implemented by + ```python dpdata.LabeledSystem("OUTCAR").sub_system([0, -1]).to("deepmd/raw", "dpmd_raw") ``` -by which only the first and last frames are dumped to `dpmd_raw`. +by which only the first and last frames are dumped to `dpmd_raw`. ### replicate + dpdata will create a super cell of the current atom configuration. + ```python dpdata.System("./POSCAR").replicate( ( @@ -88,11 +107,13 @@ dpdata.System("./POSCAR").replicate( ) ) ``` -tuple(1,2,3) means don't copy atom configuration in x direction, make 2 copys in y direction, make 3 copys in z direction. +tuple(1,2,3) means don't copy atom configuration in x direction, make 2 copies in y direction, make 3 copies in z direction. ### perturb + By the following example, each frame of the original system (`dpdata.System('./POSCAR')`) is perturbed to generate three new frames. For each frame, the cell is perturbed by 5% and the atom positions are perturbed by 0.6 Angstrom. `atom_pert_style` indicates that the perturbation to the atom positions is subject to normal distribution. Other available options to `atom_pert_style` are`uniform` (uniform in a ball), and `const` (uniform on a sphere). 
+ ```python perturbed_system = dpdata.System("./POSCAR").perturb( pert_num=3, @@ -104,7 +125,9 @@ print(perturbed_system.data) ``` ### replace + By the following example, Random 8 Hf atoms in the system will be replaced by Zr atoms with the atom postion unchanged. + ```python s = dpdata.System("tests/poscars/POSCAR.P42nmc", fmt="vasp/poscar") s.replace("Hf", "Zr", 8) diff --git a/dpdata/cp2k/output.py b/dpdata/cp2k/output.py index bd827595e..bf575f728 100644 --- a/dpdata/cp2k/output.py +++ b/dpdata/cp2k/output.py @@ -65,7 +65,7 @@ def __next__(self): # assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') # assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') assert math.isclose( - log_info_dict["energies"], xyz_info_dict["energies"], abs_tol=1.0e-6 + log_info_dict["energies"][0], xyz_info_dict["energies"][0], abs_tol=1.0e-6 ), ( log_info_dict["energies"], xyz_info_dict["energies"], diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 38e8b386b..734b6a730 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -1,15 +1,128 @@ from __future__ import annotations import copy +import math import numpy as np import dpdata +from dpdata.data_type import Axis from .comp import dump as comp_dump from .comp import to_system_data as comp_to_system_data +def _pad_to(sys_data, target_natoms, dtypes): + """Pad system data dict so that NATOMS dimension becomes target_natoms. + + Virtual atoms get real_atom_types = -1, and all other per-atom data is + padded with zeros. + + Parameters + ---------- + sys_data : dict + System data dict, already in mixed-type format. + target_natoms : int + Target number of atoms after padding. + dtypes : tuple[DataType, ...] + Registered data types to iterate for generic per-atom padding. 
+ """ + natoms = sys_data["atom_types"].shape[0] + npad = target_natoms - natoms + if npad <= 0: + return + nframes = sys_data["coords"].shape[0] + + # Pad atom_types (all MIXED_TOKEN = 0) + sys_data["atom_types"] = np.concatenate( + [sys_data["atom_types"], np.zeros(npad, dtype=int)] + ) + sys_data["atom_numbs"] = [target_natoms] + + # Pad real_atom_types with -1 (virtual atom sentinel) + sys_data["real_atom_types"] = np.concatenate( + [ + sys_data["real_atom_types"], + -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), + ], + axis=1, + ) + + # Pad coords and all other per-atom data generically + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "cells", + "real_atom_names", + "real_atom_types", + "nopbc", + } + for dtype in dtypes: + if dtype.name in reserved: + continue + if dtype.name not in sys_data: + continue + if not ( + len(dtype.shape) >= 2 + and dtype.shape[0] == Axis.NFRAMES + and Axis.NATOMS in dtype.shape + ): + continue + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + arr = sys_data[dtype.name] + pad_width = [(0, 0)] * len(arr.shape) + pad_width[axis_natoms] = (0, npad) + sys_data[dtype.name] = np.pad( + arr, pad_width, mode="constant", constant_values=0 + ) + + +def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): + """Strip virtual atoms (type -1) from a group of frames. + + Parameters + ---------- + atom_types_row : np.ndarray + 1-D array of atom type indices for the group (same for all frames). + coords : np.ndarray + Coordinates array, shape (nframes, natoms_padded, 3). + extra_data : dict + Dict of {name: array} for this group, arrays already frame-sliced. + dtypes : tuple[DataType, ...] + Registered data types. + + Returns + ------- + atom_types : np.ndarray + Atom types with virtual atoms removed. + coords : np.ndarray + Coords with virtual atoms removed. + extra_data : dict + Extra data with virtual atoms removed. 
+ """ + real_mask = atom_types_row >= 0 + if real_mask.all(): + return atom_types_row, coords, extra_data + + atom_types = atom_types_row[real_mask] + coords = coords[:, real_mask, :] + + stripped = {} + for name, arr in extra_data.items(): + for dtype in dtypes: + if dtype.name == name and Axis.NATOMS in dtype.shape: + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + idx = [slice(None)] * len(arr.shape) + idx[axis_natoms] = real_mask + arr = arr[tuple(idx)] + break + stripped[name] = arr + + return atom_types, coords, stripped + + def to_system_data(folder, type_map=None, labels=True): data = comp_to_system_data(folder, type_map, labels) # data is empty @@ -26,7 +139,11 @@ def to_system_data(folder, type_map=None, labels=True): index_map = None all_real_atom_types_concat = data.pop("real_atom_types").astype(int) if index_map is not None: - all_real_atom_types_concat = index_map[all_real_atom_types_concat] + # Preserve -1 (virtual atom sentinel) during remapping + valid = all_real_atom_types_concat >= 0 + remapped = np.full_like(all_real_atom_types_concat, -1) + remapped[valid] = index_map[all_real_atom_types_concat[valid]] + all_real_atom_types_concat = remapped all_cells_concat = data["cells"] all_coords_concat = data["coords"] @@ -60,10 +177,6 @@ def to_system_data(folder, type_map=None, labels=True): while True: if all_real_atom_types_concat.size == 0: break - temp_atom_numbs = [ - np.count_nonzero(all_real_atom_types_concat[0] == i) - for i in range(len(data["atom_names"])) - ] # temp_formula = formula(data['atom_names'], temp_atom_numbs) temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) @@ -71,20 +184,37 @@ def to_system_data(folder, type_map=None, labels=True): rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) ] + + # Extract data for this group + group_atom_types = all_real_atom_types_concat[0] + 
group_coords = all_coords_concat[temp_idx] + group_extra = {} + for name in extra_data: + group_extra[name] = extra_data[name][temp_idx] + extra_data[name] = extra_data[name][rest_idx] + + # Strip virtual atoms (type -1) introduced by padding + group_atom_types, group_coords, group_extra = _strip_virtual_atoms( + group_atom_types, group_coords, group_extra, dtypes + ) + + temp_atom_numbs = [ + np.count_nonzero(group_atom_types == i) + for i in range(len(data["atom_names"])) + ] + temp_data = data.copy() temp_data["atom_names"] = data["atom_names"].copy() temp_data["atom_numbs"] = temp_atom_numbs - temp_data["atom_types"] = all_real_atom_types_concat[0] + temp_data["atom_types"] = group_atom_types all_real_atom_types_concat = all_real_atom_types_concat[rest_idx] temp_data["cells"] = all_cells_concat[temp_idx] all_cells_concat = all_cells_concat[rest_idx] - temp_data["coords"] = all_coords_concat[temp_idx] + temp_data["coords"] = group_coords all_coords_concat = all_coords_concat[rest_idx] - for name in extra_data: - all_dtype_concat = extra_data[name] - temp_data[name] = all_dtype_concat[temp_idx] - extra_data[name] = all_dtype_concat[rest_idx] + for name in group_extra: + temp_data[name] = group_extra[name] data_list.append(temp_data) return data_list @@ -109,7 +239,7 @@ def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True): comp_dump(folder, data, set_size, comp_prec, remove_sets) -def mix_system(*system, type_map, **kwargs): +def mix_system(*system, type_map, atom_numb_pad=None, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. Parameters @@ -118,6 +248,11 @@ def mix_system(*system, type_map, **kwargs): The systems to mix type_map : list of str Maps atom type to name + atom_numb_pad : int, optional + If provided, pad atom counts to the next multiple of this number + using virtual atoms (type -1 in real_atom_types). 
This reduces the + number of subdirectories when systems have many different atom counts. + For example, atom_numb_pad=8 groups systems into multiples of 8. **kwargs : dict Other parameters @@ -129,21 +264,28 @@ def mix_system(*system, type_map, **kwargs): mixed_systems = {} temp_systems = {} atom_numbs_frame_index = {} # index of frames in cur sys + # Use LabeledSystem DTYPES as superset for generic per-atom padding + dtypes = dpdata.system.LabeledSystem.DTYPES for sys in system: tmp_sys = sys.copy() natom = tmp_sys.get_natoms() tmp_sys.convert_to_mixed_type(type_map=type_map) - if str(natom) not in atom_numbs_frame_index: - atom_numbs_frame_index[str(natom)] = 0 - atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes() - if str(natom) not in temp_systems or not temp_systems[str(natom)]: - temp_systems[str(natom)] = tmp_sys + if atom_numb_pad is not None and atom_numb_pad > 1: + padded_natom = math.ceil(natom / atom_numb_pad) * atom_numb_pad + _pad_to(tmp_sys.data, padded_natom, dtypes) + group_key = str(padded_natom) + else: + group_key = str(natom) + if group_key not in atom_numbs_frame_index: + atom_numbs_frame_index[group_key] = 0 + atom_numbs_frame_index[group_key] += tmp_sys.get_nframes() + if group_key not in temp_systems or not temp_systems[group_key]: + temp_systems[group_key] = tmp_sys else: - temp_systems[str(natom)].append(tmp_sys) - for natom in temp_systems: - if atom_numbs_frame_index[natom] > 0: - sys_name = f"{natom}" - mixed_systems[sys_name] = temp_systems[natom] + temp_systems[group_key].append(tmp_sys) + for natom_key in temp_systems: + if atom_numbs_frame_index[natom_key] > 0: + mixed_systems[natom_key] = temp_systems[natom_key] return mixed_systems diff --git a/dpdata/lmdb/__init__.py b/dpdata/lmdb/__init__.py new file mode 100644 index 000000000..53a3e8f0e --- /dev/null +++ b/dpdata/lmdb/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .format import LMDBFormat + +__all__ = ["LMDBFormat"] diff --git 
a/dpdata/lmdb/format.py b/dpdata/lmdb/format.py new file mode 100644 index 000000000..9b518be6b --- /dev/null +++ b/dpdata/lmdb/format.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import os + +import lmdb +import msgpack +import msgpack_numpy as m +import numpy as np + +from dpdata.format import Format + +m.patch() + + +class LMDBError(Exception): + """Base class for LMDB errors.""" + + +class LMDBMetadataError(LMDBError): + """Metadata not found in LMDB.""" + + +class LMDBFrameError(LMDBError): + """Frame data not found in LMDB.""" + + +class LMDBFormat(Format): + """ + Class for handling the LMDB format, which stores atomic configurations in a + Lightning Memory-Mapped Database (LMDB). + + This format is optimized for machine learning workflows where fast, random + access to a large number of frames is required. All frames from multiple + systems (with potentially different numbers of atoms) are stored in a + single LMDB database file. + + Both single systems and multiple systems are supported via the standard + ``dpdata`` APIs. 
+ + Examples + -------- + **Saving a single LabeledSystem** + + >>> import dpdata + >>> system = dpdata.LabeledSystem("path/to/input.vasp", fmt="vasp/outcar") + >>> system.to("lmdb", "my_single_system.lmdb") + + **Loading a single LabeledSystem** + + >>> loaded_system = dpdata.LabeledSystem("my_single_system.lmdb", fmt="lmdb") + + **Saving multiple systems to a single LMDB database** + + >>> import dpdata + >>> system_1 = dpdata.LabeledSystem("path/to/system1/OUTCAR", fmt="vasp/outcar") + >>> system_2 = dpdata.LabeledSystem("path/to/system2/OUTCAR", fmt="vasp/outcar") + >>> multi_systems_obj = dpdata.MultiSystems(system_1, system_2) + >>> multi_systems_obj.to("lmdb", "my_multi_system_db.lmdb") + + **Loading multiple systems from a single LMDB database** + + >>> import dpdata + >>> loaded_multi_systems = dpdata.MultiSystems.from_file("my_multi_system_db.lmdb", fmt="lmdb") + """ + + def to_multi_systems( + self, formulas, directory, map_size=1000000000, frame_idx_fmt="012d", **kwargs + ): + """Implement MultiSystems.to for LMDB format. + + Parameters + ---------- + formulas : list[str] + list of formulas + directory : str + directory of system + map_size : int, optional + Maximum size of the LMDB database in bytes. Default is 1GB. + frame_idx_fmt : str, optional + The format string used to encode the frame index as a key. Default is "012d". 
+ **kwargs : dict + other parameters + + Yields + ------ + tuple + (self, formula) to be used by to_system + """ + self._frame_idx_fmt = frame_idx_fmt + self._global_frame_idx = 0 + self._system_info = [] + os.makedirs(directory, exist_ok=True) + with lmdb.open(directory, map_size=map_size) as env: + with env.begin(write=True) as txn: + self._txn = txn + for ff in formulas: + yield (self, ff) + # Finalize metadata + metadata = { + "nframes": self._global_frame_idx, + "system_info": self._system_info, + "frame_idx_fmt": self._frame_idx_fmt, + } + txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) + self._txn = None + + def _dump_to_txn(self, data, txn, formula, dtypes): + from dpdata.data_type import Axis + + nframes = data["coords"].shape[0] + + # Identify symbolic shapes and frame-dependent keys + data_shapes = {} + frame_dependent_keys = [] + for dt in dtypes: + if dt.name in data: + if dt.shape is not None: + data_shapes[dt.name] = [ + s.value if isinstance(s, Axis) else s for s in dt.shape + ] + if Axis.NFRAMES in dt.shape: + frame_dependent_keys.append(dt.name) + else: + data_shapes[dt.name] = None + + # Record system info + # natoms needs to be extracted from data + if "atom_numbs" in data: + natoms_list = data["atom_numbs"] + else: + # Fallback for systems without atom_numbs (should not happen in valid dpdata systems) + natoms_list = [] + + self._system_info.append( + { + "formula": formula, + "natoms": natoms_list, + "nframes": nframes, + "start_idx": self._global_frame_idx, + "data_shapes": data_shapes, + "frame_dependent_keys": frame_dependent_keys, + } + ) + + for i in range(nframes): + frame_data = {} + for key, val in data.items(): + if key in frame_dependent_keys: + frame_data[key] = val[i] + else: + frame_data[key] = val + + key = f"{self._global_frame_idx:{self._frame_idx_fmt}}".encode("ascii") + value = msgpack.packb(frame_data, use_bin_type=True) + txn.put(key, value) + self._global_frame_idx += 1 + + def to_labeled_system(self, 
data, file_name, **kwargs): + """Save a single LabeledSystem to an LMDB database.""" + from dpdata.system import LabeledSystem + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, LabeledSystem.DTYPES) + else: + # Single system call: use to_multi_systems logic + # Infer formula from data if possible, or use default + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_labeled_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def to_system(self, data, file_name, **kwargs): + """Save a single System to an LMDB database.""" + from dpdata.system import System + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, System.DTYPES) + else: + # Single system call + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def from_multi_systems(self, file_name, map_size=1000000000, **kwargs): + """Load multiple systems from a single LMDB database. + + Parameters + ---------- + file_name : str + The path to the LMDB database directory. + map_size : int, optional + Maximum size of the LMDB database in bytes. 
+ **kwargs : dict + other parameters + + Yields + ------ + dict + data dictionary for each system + """ + from dpdata.data_type import Axis, DataType + from dpdata.system import LabeledSystem, System + + with lmdb.open(file_name, readonly=True) as env: + with env.begin() as txn: + metadata_packed = txn.get(b"__metadata__") + if metadata_packed is None: + raise LMDBMetadataError("LMDB database does not contain metadata.") + metadata = msgpack.unpackb(metadata_packed, raw=False) + frame_idx_fmt = metadata.get("frame_idx_fmt", "012d") + + for sys_info in metadata["system_info"]: + system_frames = [] + start_idx = sys_info["start_idx"] + nframes = sys_info["nframes"] + data_shapes = sys_info.get("data_shapes", {}) + frame_dependent_keys = sys_info.get("frame_dependent_keys", []) + + for i in range(start_idx, start_idx + nframes): + key = f"{i:{frame_idx_fmt}}".encode("ascii") + value = txn.get(key) + if value is None: + raise LMDBFrameError(f"Frame data not found for key: {key}") + frame_data = msgpack.unpackb(value, raw=False) + system_frames.append(frame_data) + + # Aggregate data for one system + first_frame = system_frames[0] + is_labeled = "energies" in first_frame + cls = LabeledSystem if is_labeled else System + + # Auto-register unknown data types + existing_dt_names = [dt.name for dt in cls.DTYPES] + new_dts = [] + axis_map = {a.value: a for a in Axis} + for key, val in first_frame.items(): + if key not in existing_dt_names and key in data_shapes: + shape_raw = data_shapes[key] + if shape_raw is not None: + shape = tuple([axis_map.get(s, s) for s in shape_raw]) + else: + shape = None + + v_arr = np.array(val) + new_dts.append( + DataType(key, type(v_arr), shape=shape, required=False) + ) + + if new_dts: + cls.register_data_type(*new_dts) + + agg_data = {} + for key, val in first_frame.items(): + if key in frame_dependent_keys: + agg_data[key] = np.array([d[key] for d in system_frames]) + else: + agg_data[key] = val + + yield agg_data + + def 
from_labeled_system(self, file_name, **kwargs): + """Load data for a single LabeledSystem from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) + + def from_system(self, file_name, **kwargs): + """Load data for a single System from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 2726e1d46..860f52d02 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -117,6 +117,12 @@ class DeePMDMixedFormat(Format): >>> import dpdata >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") + Dump with ``atom_numb_pad`` to reduce the number of subdirectories. + Systems are padded with virtual atoms (type -1) so that atom counts are + rounded up to the nearest multiple of the given number: + + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) + Load a mixed type data into a MultiSystems: >>> import dpdata @@ -156,7 +162,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): file_name, type_map=type_map, labels=True ) - def mix_system(self, *system, type_map, **kwargs): + def mix_system(self, *system, type_map, atom_numb_pad=None, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. Parameters @@ -165,6 +171,13 @@ def mix_system(self, *system, type_map, **kwargs): The systems to mix type_map : list of str Maps atom type to name + atom_numb_pad : int, optional + If provided, pad atom counts to the next multiple of this number + using virtual atoms (type -1 in real_atom_types). This reduces the + number of subdirectories when systems have many different atom counts. 
+ For example, ``atom_numb_pad=8`` groups systems into multiples of 8: + a 5-atom system is padded to 8, a 9-atom system is padded to 16, etc. + Virtual atoms are transparently removed when loading the data back. **kwargs : dict other parameters @@ -172,8 +185,17 @@ def mix_system(self, *system, type_map, **kwargs): ------- mixed_systems: dict dict of mixed system with key 'atom_numbs' + + Examples + -------- + Dump with padding so that atom counts are rounded up to multiples of 8: + + >>> import dpdata + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) """ - return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs) + return dpdata.deepmd.mixed.mix_system( + *system, type_map=type_map, atom_numb_pad=atom_numb_pad, **kwargs + ) def from_multi_systems(self, directory, **kwargs): register_spin() diff --git a/dpdata/plugins/lmdb.py b/dpdata/plugins/lmdb.py new file mode 100644 index 000000000..8391c1fae --- /dev/null +++ b/dpdata/plugins/lmdb.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from dpdata.format import Format +from dpdata.lmdb.format import LMDBFormat + +Format.register("lmdb")(LMDBFormat) diff --git a/dpdata/system.py b/dpdata/system.py index 4c8f350a2..64cacf243 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1369,6 +1369,8 @@ def __init__(self, *systems, type_map=None): Maps atom type to name """ self.systems: dict[str, System] = {} + # short name to name + self._short_name_map: dict[str, str] = {} if type_map is not None: self.atom_names: list[str] = type_map else: @@ -1443,6 +1445,8 @@ def __getitem__(self, key): """Returns proerty stored in System by key or by idx.""" if isinstance(key, int): return list(self.systems.values())[key] + if key in self._short_name_map: + return self.systems[self._short_name_map[key]] return self.systems[key] def __len__(self): @@ -1524,6 +1528,7 @@ def __append(self, system: System): self.systems[formula].append(system) else: self.systems[formula] = 
system.copy() + self._short_name_map[system.short_name] = formula def check_atom_names(self, system: System): """Make atom_names in all systems equal, prevent inconsistent atom_types.""" @@ -1536,11 +1541,14 @@ def check_atom_names(self, system: System): self.atom_names.extend(new_in_system) # Add this atom_name to each system, and change their names new_systems = {} + new_short_name_map = {} for each_system in self.systems.values(): each_system.add_atom_names(new_in_system) each_system.sort_atom_names(type_map=self.atom_names) new_systems[each_system.formula] = each_system + new_short_name_map[each_system.short_name] = each_system.formula self.systems = new_systems + self._short_name_map = new_short_name_map if len(new_in_self): # Previous atom_name not in this system system.add_atom_names(new_in_self) diff --git a/pyproject.toml b/pyproject.toml index 52c47804e..683001bb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ dependencies = [ 'scipy', 'h5py', 'wcmatch', + 'lmdb', + 'msgpack-numpy', 'importlib_metadata>=1.4; python_version < "3.8"', 'typing_extensions; python_version < "3.8"', ] diff --git a/skills/dpdata-cli/README.md b/skills/dpdata-cli/README.md new file mode 100644 index 000000000..3c28e020d --- /dev/null +++ b/skills/dpdata-cli/README.md @@ -0,0 +1,71 @@ +# dpdata CLI Agent Skill + +An Agent Skill for using [dpdata](https://github.com/deepmodeling/dpdata) CLI to convert and manipulate atomic simulation data formats. 
+ +## Installation + +To install this skill, provide the skill's GitHub URL to your agent (e.g., OpenClaw): + +```text +Install the dpdata-cli skill from https://github.com/deepmodeling/dpdata/tree/devel/skills/dpdata-cli +``` + +Or manually copy it to your agent's skills directory: + +```bash +cp -r skills/dpdata-cli /path/to/your/agent/skills/ +``` + +## Usage + +Once installed, ask your AI agent to work with dpdata: + +```text +Convert my VASP OUTCAR to DeePMD-kit format +``` + +```text +Convert LAMMPS dump file to VASP POSCAR +``` + +## Features + +- **Format Conversion**: Convert between 50+ DFT/MD formats +- **DeePMD-kit Support**: Prepare training data for machine learning potentials +- **Auto-detection**: Automatic format detection for common file types +- **Multi-system Support**: Handle directories with multiple systems + +## Supported Formats + +The set of supported formats changes between releases. See [Formats Reference (stable)](https://docs.deepmodeling.com/projects/dpdata/en/stable/formats.html) for the latest list. + +### DeePMD-kit + +`deepmd/raw`, `deepmd/comp`, `deepmd/npy`, `deepmd/npy/mixed`, `deepmd/hdf5` + +### VASP + +`vasp/poscar`, `vasp/contcar`, `vasp/outcar`, `vasp/xml` + +### LAMMPS + +`lammps/lmp`, `lammps/dump` + +### ABACUS + +`stru`, `abacus/scf`, `abacus/md`, `abacus/relax` + +### And many more... + +QE, CP2K, Gaussian, ORCA, PSI4, FHI-aims, SIESTA, PWmat, AMBER, GROMACS, ASE, pymatgen, XYZ, etc.
+ +## Requirements + +- [uv](https://docs.astral.sh/uv/) for running dpdata via `uvx` + +## References + +- [dpdata Documentation](https://docs.deepmodeling.com/projects/dpdata/) +- [CLI Reference](https://docs.deepmodeling.com/projects/dpdata/en/stable/cli.html) +- [Formats Reference](https://docs.deepmodeling.com/projects/dpdata/en/stable/formats.html) +- [GitHub Repository](https://github.com/deepmodeling/dpdata) diff --git a/skills/dpdata-cli/SKILL.md b/skills/dpdata-cli/SKILL.md new file mode 100644 index 000000000..fd18301a3 --- /dev/null +++ b/skills/dpdata-cli/SKILL.md @@ -0,0 +1,188 @@ +--- +name: dpdata-cli +description: Convert and manipulate atomic simulation data formats using dpdata CLI. Use when converting between DFT/MD output formats (VASP, LAMMPS, QE, CP2K, Gaussian, ABACUS, etc.), preparing training data for DeePMD-kit, or working with DeePMD formats. Supports 50+ formats including deepmd/raw, deepmd/comp, deepmd/npy, deepmd/hdf5. +compatibility: Requires uvx (uv) for running dpdata +metadata: + author: njzjz-bot + version: '1.0' + repository: https://github.com/deepmodeling/dpdata +--- + +# dpdata CLI + +dpdata is a tool for manipulating multiple atomic simulation data formats. This skill enables format conversion between the outputs of various DFT/MD software packages via the command line.
+ +## Quick Start + +Run dpdata via uvx: + +```bash +uvx dpdata [options] +``` + +## Command Line Usage + +```text +dpdata: Manipulating multiple atomic simulation data formats +usage: dpdata [-h] [--to_file TO_FILE] [--from_format FROM_FORMAT] + [--to_format TO_FORMAT] [--no-labeled] [--multi] + [--type-map TYPE_MAP [TYPE_MAP ...]] [--version] + from_file +``` + +### Arguments + +| Argument | Description | +| --------------------- | ----------------------------------------------------- | +| `from_file` | Read data from a file (positional) | +| `--to_file`, `-O` | Dump data to a file | +| `--from_format`, `-i` | Format of from_file (default: "auto") | +| `--to_format`, `-o` | Format of to_file | +| `--no-labeled`, `-n` | Labels aren't provided (default: False) | +| `--multi`, `-m` | System contains multiple directories (default: False) | +| `--type-map`, `-t` | Type map for atom types | +| `--version` | Show dpdata version and exit | + +## Common Examples + +### Convert VASP OUTCAR to deepmd format + +```bash +uvx dpdata OUTCAR -i vasp/outcar -O deepmd_data -o deepmd/raw +``` + +### Convert LAMMPS dump to VASP POSCAR + +```bash +uvx dpdata dump.lammps -i lammps/dump -O POSCAR -o vasp/poscar +``` + +### Convert with type map + +```bash +uvx dpdata OUTCAR -i vasp/outcar -O deepmd_data -o deepmd/raw -t C H O N +``` + +### Convert multiple systems + +```bash +uvx dpdata data_dir -i vasp/outcar -O output_dir -o deepmd/comp --multi +``` + +### Convert to deepmd/npy (compressed format) + +```bash +uvx dpdata OUTCAR -i vasp/outcar -O deepmd_npy -o deepmd/npy +``` + +### Convert to deepmd/hdf5 + +```bash +uvx dpdata OUTCAR -i vasp/outcar -O data.h5 -o deepmd/hdf5 +``` + +## Supported Formats + +Formats may be updated. 
For the complete and latest list, see: + +- [Formats Reference (stable)](https://docs.deepmodeling.com/projects/dpdata/en/stable/formats.html) + +### DeePMD-kit Formats + +| Format Name | Description | +| ---------------------------- | ---------------------------------- | +| `deepmd/raw` | DeePMD-kit raw text format | +| `deepmd/comp` / `deepmd/npy` | DeePMD-kit compressed numpy format | +| `deepmd/npy/mixed` | DeePMD-kit mixed type format | +| `deepmd/hdf5` | DeePMD-kit HDF5 format | + +### VASP Formats + +| Format Name | Description | +| ----------------------------------------------------- | -------------------- | +| `vasp/poscar` / `vasp/contcar` / `poscar` / `contcar` | VASP structure files | +| `vasp/outcar` / `outcar` | VASP OUTCAR output | +| `vasp/xml` / `xml` | VASP XML output | +| `vasp/string` | VASP string format | + +### LAMMPS Formats + +| Format Name | Description | +| ---------------------- | ---------------- | +| `lammps/lmp` / `lmp` | LAMMPS data file | +| `lammps/dump` / `dump` | LAMMPS dump file | + +### ABACUS Formats + +| Format Name | Description | +| -------------------------------------------------------- | --------------------- | +| `stru` / `abacus/stru` | ABACUS structure file | +| `abacus/lcao/scf` / `abacus/pw/scf` / `abacus/scf` | ABACUS SCF output | +| `abacus/lcao/md` / `abacus/pw/md` / `abacus/md` | ABACUS MD output | +| `abacus/lcao/relax` / `abacus/pw/relax` / `abacus/relax` | ABACUS relax output | + +### Quantum ESPRESSO Formats + +| Format Name | Description | +| ------------ | ---------------- | +| `qe/cp/traj` | QE CP trajectory | +| `qe/pw/scf` | QE PWscf output | + +### CP2K Formats + +| Format Name | Description | +| ------------------ | ---------------- | +| `cp2k/output` | CP2K output | +| `cp2k/aimd_output` | CP2K AIMD output | + +### Gaussian Formats + +| Format Name | Description | +| --------------- | ----------------------------- | +| `gaussian/log` | Gaussian log file | +| `gaussian/fchk` | Gaussian formatted 
checkpoint | +| `gaussian/md` | Gaussian MD output | +| `gaussian/gjf` | Gaussian input file | + +### Other Formats + +| Format Name | Description | +| ------------------------------------------------------------------- | --------------------- | +| `xyz` | XYZ format | +| `mace/xyz` / `nequip/xyz` / `gpumd/xyz` / `extxyz` / `quip/gap/xyz` | Extended XYZ variants | +| `ase/structure` | ASE structure format | +| `ase/traj` | ASE trajectory | +| `pymatgen/structure` | pymatgen structure | +| `pymatgen/molecule` | pymatgen molecule | +| `gromacs/gro` / `gro` | GROMACS gro file | +| `siesta/output` | SIESTA output | +| `siesta/aimd_output` | SIESTA AIMD output | +| `pwmat/output` / `pwmat/mlmd` / `pwmat/movement` | PWmat output | +| `pwmat/final.config` / `pwmat/atom.config` | PWmat config | +| `orca/spout` | ORCA output | +| `psi4/out` | PSI4 output | +| `dftbplus` | DFTB+ output | +| `fhi_aims/output` / `fhi_aims/md` | FHI-aims output | +| `amber/md` | AMBER MD | +| `n2p2` | n2p2 format | +| `mol_file` / `mol` | MOL file | +| `sdf_file` / `sdf` | SDF file | +| `openmx/md` | OpenMX MD | +| `sqm/out` | SQM output | +| `sqm/in` | SQM input | +| `list` | List format | +| `3dmol` | 3Dmol visualization | + +## Tips + +1. **Auto-detection**: Use `-i auto` (default) to let dpdata detect format automatically +1. **Type mapping**: Use `-t` to specify atom type order for deepmd formats +1. **Multi-system**: Use `--multi` for directories containing multiple systems +1. 
**Compressed output**: Use `deepmd/npy` or `deepmd/hdf5` for smaller file sizes + +## References + +- [dpdata Documentation](https://docs.deepmodeling.com/projects/dpdata/) +- [CLI Reference](https://docs.deepmodeling.com/projects/dpdata/en/stable/cli.html) +- [Formats Reference](https://docs.deepmodeling.com/projects/dpdata/en/stable/formats.html) +- [GitHub Repository](https://github.com/deepmodeling/dpdata) diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index bd8036876..d5b0dec64 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -597,3 +597,338 @@ def test_aparam_exists(self): self.systems[formula].data["aparam"], decimal=self.places, ) + + +class TestMixedMultiSystemsPadding( + unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC +): + """Test round-trip with atom_numb_pad. + + C1H4 (5 atoms) and C1H3 (4 atoms) are both padded to 8 atoms, + so only 1 subfolder should be created. + """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad", atom_numb_pad=8) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad"): + shutil.rmtree("tmp.deepmd.mixed.pad") + + def test_single_subfolder(self): + """Both 4-atom and 5-atom systems padded to 8 -> 1 subfolder.""" + subdirs = [ + d + for d in os.listdir("tmp.deepmd.mixed.pad") + if 
os.path.isdir(os.path.join("tmp.deepmd.mixed.pad", d)) + ] + self.assertEqual(len(subdirs), 1) + self.assertEqual(subdirs[0], "8") + + def test_padded_virtual_atoms(self): + """Verify on-disk real atom count matches loaded natoms, and virtual + atoms have type -1 with zero coords and forces. + """ + loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()} + mixed_sets = glob("tmp.deepmd.mixed.pad/*/set.*") + self.assertGreater(len(mixed_sets), 0) + for s in mixed_sets: + rat = np.load(os.path.join(s, "real_atom_types.npy")) + coord = np.load(os.path.join(s, "coord.npy")) + force = np.load(os.path.join(s, "force.npy")) + padded_natoms = rat.shape[1] + for ii in range(rat.shape[0]): + row = rat[ii] + n_real = int(np.sum(row >= 0)) + # on-disk real atom count must match one of the loaded systems + self.assertIn(n_real, loaded_natoms.values()) + # real atoms first, then virtual atoms + np.testing.assert_array_equal(row[:n_real] >= 0, True) + np.testing.assert_array_equal(row[n_real:], -1) + # virtual atom coords and forces must be zero + coord_frame = coord[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(coord_frame[n_real:], 0.0) + force_frame = force[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(force_frame[n_real:], 0.0) + + def test_loaded_natoms(self): + """Loaded systems should have original (unpadded) atom counts.""" + for formula, sys in self.systems.systems.items(): + if "H4" in formula: + self.assertEqual(sys.get_natoms(), 5) + elif "H3" in formula: + self.assertEqual(sys.get_natoms(), 4) + # no virtual atoms should remain in loaded data + self.assertTrue(np.all(sys.data["atom_types"] >= 0)) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingMultipleGroups( + unittest.TestCase, CompLabeledMultiSys, 
MultiSystems, MSAllIsNoPBC +): + """Test padding with systems that span multiple padded groups. + + With atom_numb_pad=4: C1H3 (4 atoms) -> 4, C1H4 (5 atoms) -> 8. + Two subfolders should be created. + """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad2", atom_numb_pad=4) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad2", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad2"): + shutil.rmtree("tmp.deepmd.mixed.pad2") + + def test_two_subfolders(self): + """4-atom -> 4, 5-atom -> 8 => 2 subfolders.""" + subdirs = sorted( + d + for d in os.listdir("tmp.deepmd.mixed.pad2") + if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad2", d)) + ) + self.assertEqual(len(subdirs), 2) + self.assertIn("4", subdirs) + self.assertIn("8", subdirs) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingTypeMap( + unittest.TestCase, CompLabeledMultiSys, MSAllIsNoPBC +): + """Test padding + custom type_map on reload. + + This verifies the index_map bug fix for -1 values in real_atom_types. 
+ """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.tm", atom_numb_pad=8) + + new_type_map = ["H", "C"] + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad.tm", + fmt="deepmd/npy/mixed", + type_map=new_type_map, + ) + + # Apply same type_map to original for comparison + for kk in [ii.formula for ii in self.ms]: + self.ms[kk].apply_type_map(new_type_map) + tmp_ss = self.ms.systems.pop(kk) + self.ms.systems[tmp_ss.formula] = tmp_ss + + self.ms_1 = self.ms + self.ms_2 = self.systems + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad.tm"): + shutil.rmtree("tmp.deepmd.mixed.pad.tm") + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingAparam( + unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC +): + """Test padding with custom per-atom data (aparam).""" + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + new_datatypes = [ + DataType( + "fparam", + np.ndarray, + shape=(Axis.NFRAMES, 2), + required=False, + ), + DataType( + "aparam", + np.ndarray, + shape=(Axis.NFRAMES, Axis.NATOMS, 3), + required=False, + ), + ] + for datatype in new_datatypes: + dpdata.System.register_data_type(datatype) + dpdata.LabeledSystem.register_data_type(datatype) + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + 
# C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + tmp_data_1 = system_1.data.copy() + nframes_1 = tmp_data_1["coords"].shape[0] + natoms_1 = tmp_data_1["atom_types"].shape[0] + tmp_data_1["fparam"] = np.random.random([nframes_1, 2]) + tmp_data_1["aparam"] = np.random.random([nframes_1, natoms_1, 3]) + system_1_with_params = dpdata.LabeledSystem(data=tmp_data_1) + + tmp_data_2 = system_2.data.copy() + nframes_2 = tmp_data_2["coords"].shape[0] + natoms_2 = tmp_data_2["atom_types"].shape[0] + tmp_data_2["fparam"] = np.random.random([nframes_2, 2]) + tmp_data_2["aparam"] = np.random.random([nframes_2, natoms_2, 3]) + system_2_with_params = dpdata.LabeledSystem(data=tmp_data_2) + + self.ms = dpdata.MultiSystems(system_1_with_params, system_2_with_params) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.ap", atom_numb_pad=8) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad.ap", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad.ap"): + shutil.rmtree("tmp.deepmd.mixed.pad.ap") + + def test_single_subfolder(self): + subdirs = [ + d + for d in os.listdir("tmp.deepmd.mixed.pad.ap") + if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad.ap", d)) + ] + self.assertEqual(len(subdirs), 1) + + def test_fparam_preserved(self): + for formula in self.system_names: + if formula in self.ms.systems and formula in self.systems.systems: + np.testing.assert_almost_equal( + self.ms[formula].data["fparam"], + self.systems[formula].data["fparam"], + decimal=self.places, + ) + + def test_aparam_preserved(self): + """Per-atom aparam should be correctly padded and unpadded.""" + for formula in self.system_names: + if formula in self.ms.systems and formula in 
self.systems.systems: + np.testing.assert_almost_equal( + self.ms[formula].data["aparam"], + self.systems[formula].data["aparam"], + decimal=self.places, + ) + + def test_virtual_atoms_zero_on_disk(self): + """Verify virtual atoms have zero aparam on disk.""" + loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()} + mixed_sets = glob("tmp.deepmd.mixed.pad.ap/*/set.*") + self.assertGreater(len(mixed_sets), 0) + for s in mixed_sets: + rat = np.load(os.path.join(s, "real_atom_types.npy")) + aparam = np.load(os.path.join(s, "aparam.npy")) + padded_natoms = rat.shape[1] + for ii in range(rat.shape[0]): + row = rat[ii] + n_real = int(np.sum(row >= 0)) + self.assertIn(n_real, loaded_natoms.values()) + # aparam shape on disk: (nframes, padded_natoms * 3) + aparam_frame = aparam[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(aparam_frame[n_real:], 0.0) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) diff --git a/tests/test_lmdb.py b/tests/test_lmdb.py new file mode 100644 index 000000000..ee651edce --- /dev/null +++ b/tests/test_lmdb.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import os +import shutil +import unittest + +import lmdb +import msgpack +import msgpack_numpy as m +import numpy as np +from comp_sys import ( + CompLabeledMultiSys, + CompLabeledSys, + CompSys, + IsPBC, + MSAllIsNoPBC, +) +from context import dpdata + +from dpdata.lmdb.format import LMDBFrameError, LMDBMetadataError + + +class TestLMDBLabeledSystem(unittest.TestCase, CompLabeledSys, IsPBC): + def setUp(self): + self.system_1 = dpdata.LabeledSystem("poscars/OUTCAR.h2o.md", fmt="vasp/outcar") + self.lmdb_path = "tmp_labeled.lmdb" + self.system_1.to("lmdb", self.lmdb_path) + self.system_2 = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + self.places = 6 + self.e_places 
= 6 + self.f_places = 6 + self.v_places = 6 + + def tearDown(self): + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + +class TestLMDBSystem(unittest.TestCase, CompSys, IsPBC): + def setUp(self): + self.system_1 = dpdata.System("poscars/POSCAR.h2o.md", fmt="vasp/poscar") + self.lmdb_path = "tmp_system.lmdb" + self.system_1.to("lmdb", self.lmdb_path) + self.system_2 = dpdata.System(self.lmdb_path, fmt="lmdb") + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + def tearDown(self): + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + +class TestLMDBMultiSystems(unittest.TestCase, CompLabeledMultiSys, MSAllIsNoPBC): + def setUp(self): + self.lmdb_path = "tmp_multi.lmdb" + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_reordered.gaussianlog", fmt="gaussian/log" + ) + system_3 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms_1 = dpdata.MultiSystems(system_1, system_2, system_3) + + # Standard API + self.ms_1.to("lmdb", self.lmdb_path) + + # Standard API + self.ms_2 = dpdata.MultiSystems.from_file(self.lmdb_path, fmt="lmdb") + + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + def tearDown(self): + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + +class TestLMDBErrorHandling(unittest.TestCase): + def setUp(self): + self.lmdb_path_missing_metadata = "tmp_missing_metadata.lmdb" + self.lmdb_path_missing_frame = "tmp_missing_frame.lmdb" + + # Ensure cleanup in case of previous test failures + if os.path.exists(self.lmdb_path_missing_metadata): + shutil.rmtree(self.lmdb_path_missing_metadata) + if os.path.exists(self.lmdb_path_missing_frame): + shutil.rmtree(self.lmdb_path_missing_frame) + + # For test_load_missing_frame_data, create a valid LMDB environment + # and write metadata, but no actual frames. 
+ env = lmdb.open(self.lmdb_path_missing_frame, map_size=1000000000) + with env.begin(write=True) as txn: + metadata = { + "nframes": 1, + "system_info": [ + { + "formula": "H2O", + "natoms": [1, 2], + "nframes": 1, + "start_idx": 0, + } + ], + } + m.patch() # Ensure numpy patching for metadata + txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) + env.close() + + def tearDown(self): + if os.path.exists(self.lmdb_path_missing_metadata): + shutil.rmtree(self.lmdb_path_missing_metadata) + if os.path.exists(self.lmdb_path_missing_frame): + shutil.rmtree(self.lmdb_path_missing_frame) + + def test_load_missing_metadata(self): + # Create a valid, empty LMDB environment, then test for missing metadata + lmdb.open( + self.lmdb_path_missing_metadata, map_size=1000000000 + ).close() # Creates empty LMDB environment + + with self.assertRaisesRegex( + LMDBMetadataError, "LMDB database does not contain metadata." + ): + # Standard API + dpdata.MultiSystems.from_file(self.lmdb_path_missing_metadata, fmt="lmdb") + + def test_load_missing_frame_data(self): + with self.assertRaisesRegex( + LMDBFrameError, "Frame data not found for key: b'000000000000'" + ): + # Standard API + dpdata.MultiSystems.from_file(self.lmdb_path_missing_frame, fmt="lmdb") + + +class TestLMDBConfig(unittest.TestCase): + def setUp(self): + self.lmdb_path = "tmp_config.lmdb" + self.system = dpdata.LabeledSystem("poscars/OUTCAR.h2o.md", fmt="vasp/outcar") + + def tearDown(self): + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + def test_custom_frame_idx_fmt(self): + fmt = "06d" + # Standard API with custom kwarg + ms = dpdata.MultiSystems(self.system) + ms.to("lmdb", self.lmdb_path, frame_idx_fmt=fmt) + + # 1. 
Verify key format in database + with lmdb.open(self.lmdb_path, readonly=True) as env: + with env.begin() as txn: + # Frame 0 should be "000000" + self.assertIsNotNone(txn.get(b"000000")) + # Frame 0 should NOT be "000000000000" + self.assertIsNone(txn.get(b"000000000000")) + + # 2. Verify loading works automatically via standard API + system_loaded = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + self.assertEqual(len(system_loaded), len(self.system)) + np.testing.assert_allclose( + system_loaded.data["coords"], self.system.data["coords"] + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_lmdb_custom_dtype.py b/tests/test_lmdb_custom_dtype.py new file mode 100644 index 000000000..4786a30c7 --- /dev/null +++ b/tests/test_lmdb_custom_dtype.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import os +import shutil +import unittest + +import numpy as np + +import dpdata +from dpdata.data_type import Axis, DataType + + +class TestLMDBCustomDType(unittest.TestCase): + def setUp(self): + self.original_system_dtypes = dpdata.System.DTYPES + self.original_labeled_system_dtypes = dpdata.LabeledSystem.DTYPES + + # Register custom data types as optional + self.dt_frame = DataType( + "frame_data", np.ndarray, shape=(Axis.NFRAMES, 2), required=False + ) + self.dt_static = DataType("static_data", np.ndarray, shape=(2,), required=False) + + dpdata.System.register_data_type(self.dt_frame, self.dt_static) + dpdata.LabeledSystem.register_data_type(self.dt_frame, self.dt_static) + + self.lmdb_path = "tmp_custom_dtype.lmdb" + + # Create a system with custom data + # Assuming running from tests/ directory + try: + self.system = dpdata.LabeledSystem( + "poscars/OUTCAR.h2o.md", fmt="vasp/outcar" + ) + except FileNotFoundError: + self.system = dpdata.LabeledSystem( + "tests/poscars/OUTCAR.h2o.md", fmt="vasp/outcar" + ) + + nframes = self.system.get_nframes() + self.system.data["frame_data"] = np.random.rand(nframes, 2) + 
self.system.data["static_data"] = np.array([1.0, 2.0]) + self.system.check_data() + + def tearDown(self): + dpdata.System.DTYPES = self.original_system_dtypes + dpdata.LabeledSystem.DTYPES = self.original_labeled_system_dtypes + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + def test_custom_dtype_preservation(self): + self.system.to("lmdb", self.lmdb_path) + system_loaded = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + + np.testing.assert_allclose( + system_loaded.data["frame_data"], self.system.data["frame_data"] + ) + np.testing.assert_allclose( + system_loaded.data["static_data"], self.system.data["static_data"] + ) + + def test_multi_systems_custom_dtype(self): + ms = dpdata.MultiSystems(self.system) + ms.to("lmdb", self.lmdb_path) + ms_loaded = dpdata.MultiSystems.from_file(self.lmdb_path, fmt="lmdb") + + system_loaded = list(ms_loaded.systems.values())[0] + np.testing.assert_allclose( + system_loaded.data["frame_data"], self.system.data["frame_data"] + ) + np.testing.assert_allclose( + system_loaded.data["static_data"], self.system.data["static_data"] + ) + + def test_custom_dtype_auto_registration(self): + # Save with custom data types registered + self.system.to("lmdb", self.lmdb_path) + + # Simulate a clean session by unregistering the custom types + dpdata.System.DTYPES = self.original_system_dtypes + dpdata.LabeledSystem.DTYPES = self.original_labeled_system_dtypes + + # Verify they are currently missing + self.assertNotIn("frame_data", [dt.name for dt in dpdata.LabeledSystem.DTYPES]) + + # Load from LMDB - should trigger auto-registration + system_loaded = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + + # Verify data is loaded and types are registered + self.assertIn("frame_data", [dt.name for dt in dpdata.LabeledSystem.DTYPES]) + self.assertIn("static_data", [dt.name for dt in dpdata.LabeledSystem.DTYPES]) + + np.testing.assert_allclose( + system_loaded.data["frame_data"], self.system.data["frame_data"] + ) + 
np.testing.assert_allclose( + system_loaded.data["static_data"], self.system.data["static_data"] + ) + + +class TestLMDBFparamAparam(unittest.TestCase): + def setUp(self): + self.original_system_dtypes = dpdata.System.DTYPES + self.original_labeled_system_dtypes = dpdata.LabeledSystem.DTYPES + + new_datatypes = [ + DataType( + "fparam", + np.ndarray, + shape=(Axis.NFRAMES, 2), + required=False, + ), + DataType( + "aparam", + np.ndarray, + shape=(Axis.NFRAMES, Axis.NATOMS, 3), + required=False, + ), + ] + + for datatype in new_datatypes: + dpdata.System.register_data_type(datatype) + dpdata.LabeledSystem.register_data_type(datatype) + + self.lmdb_path = "tmp_fparam_aparam.lmdb" + + try: + self.system = dpdata.LabeledSystem( + "poscars/OUTCAR.h2o.md", fmt="vasp/outcar" + ) + except FileNotFoundError: + self.system = dpdata.LabeledSystem( + "tests/poscars/OUTCAR.h2o.md", fmt="vasp/outcar" + ) + + nframes = self.system.get_nframes() + natoms = self.system.get_natoms() + self.system.data["fparam"] = np.random.rand(nframes, 2) + self.system.data["aparam"] = np.random.rand(nframes, natoms, 3) + self.system.check_data() + + def tearDown(self): + dpdata.System.DTYPES = self.original_system_dtypes + dpdata.LabeledSystem.DTYPES = self.original_labeled_system_dtypes + if os.path.exists(self.lmdb_path): + shutil.rmtree(self.lmdb_path) + + def test_fparam_aparam_preservation(self): + self.system.to("lmdb", self.lmdb_path) + system_loaded = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + + np.testing.assert_allclose( + system_loaded.data["fparam"], self.system.data["fparam"] + ) + np.testing.assert_allclose( + system_loaded.data["aparam"], self.system.data["aparam"] + ) + + def test_fparam_aparam_auto_registration(self): + # Save with fparam/aparam registered + self.system.to("lmdb", self.lmdb_path) + + # Simulate a clean session by restoring original DTYPES + dpdata.System.DTYPES = self.original_system_dtypes + dpdata.LabeledSystem.DTYPES = self.original_labeled_system_dtypes 
+ + # Load from LMDB + system_loaded = dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + + # Verify auto-registration and data correctness + self.assertIn("fparam", [dt.name for dt in dpdata.LabeledSystem.DTYPES]) + self.assertIn("aparam", [dt.name for dt in dpdata.LabeledSystem.DTYPES]) + + np.testing.assert_allclose( + system_loaded.data["fparam"], self.system.data["fparam"] + ) + np.testing.assert_allclose( + system_loaded.data["aparam"], self.system.data["aparam"] + ) + + def test_symbolic_axis_natoms_preservation(self): + # 1. Save system with aparam (which uses Axis.NATOMS) + self.system.to("lmdb", self.lmdb_path) + + # 2. Simulate new session + dpdata.System.DTYPES = self.original_system_dtypes + dpdata.LabeledSystem.DTYPES = self.original_labeled_system_dtypes + + # 3. Load triggers auto-registration + dpdata.LabeledSystem(self.lmdb_path, fmt="lmdb") + + # 4. Find the newly registered DataType for 'aparam' + aparam_dt = next( + dt for dt in dpdata.LabeledSystem.DTYPES if dt.name == "aparam" + ) + + # 5. Assert that it contains Axis.NATOMS, not a fixed integer + self.assertIn(Axis.NATOMS, aparam_dt.shape) + + # 6. 
Functional verification + data_diff = { + "atom_numbs": [5], + "atom_names": ["H"], + "atom_types": np.array([0, 0, 0, 0, 0]), + "coords": np.random.rand(1, 5, 3), + "cells": np.random.rand(1, 3, 3), + "orig": np.array([0, 0, 0]), + "energies": np.array([1.0]), + "forces": np.random.rand(1, 5, 3), + "aparam": np.random.rand(1, 5, 3), + } + try: + dpdata.LabeledSystem(data=data_diff) + except dpdata.data_type.DataError as e: + self.fail(f"DataError raised despite symbolic NATOMS: {e}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_multisystems.py b/tests/test_multisystems.py index 88d4593a1..fd9b278dd 100644 --- a/tests/test_multisystems.py +++ b/tests/test_multisystems.py @@ -219,6 +219,8 @@ def test_long_filename1(self): ms = dpdata.MultiSystems(system) with tempfile.TemporaryDirectory() as tmpdir: ms.to_deepmd_npy(tmpdir) + # test visiting system by short_name + ms[system.short_name] def test_long_filename2(self): system = dpdata.System( @@ -234,6 +236,8 @@ def test_long_filename2(self): ms = dpdata.MultiSystems(system) with tempfile.TemporaryDirectory() as tmpdir: ms.to_deepmd_npy(tmpdir) + # test visiting system by short_name + ms[system.short_name] if __name__ == "__main__":