Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
expand_sys_str,
j_loader,
)
from deepmd.dpmodel.utils.lmdb_data import (
is_lmdb,
)
from deepmd.loggers.loggers import (
set_log_handles,
)
Expand Down Expand Up @@ -109,10 +112,36 @@ def prepare_trainer_input_single(
)
training_systems = training_dataset_params["systems"]
trn_patterns = training_dataset_params.get("rglob_patterns", None)
training_systems = process_systems(training_systems, patterns=trn_patterns)
training_systems = process_systems(
training_systems,
patterns=trn_patterns,
fmt=training_dataset_params.get("format", None),
out_fmt=training_dataset_params.get(
"out_format", training_dataset_params.get("output_format", None)
),
)
if len(training_systems) == 1 and is_lmdb(training_systems[0]):
raise NotImplementedError(
"Paddle backend does not support LMDB training data yet. "
"Set training_data.out_format to 'deepmd/hdf5' when using "
"training_data.format for automatic conversion."
)
if validation_systems is not None:
val_patterns = validation_dataset_params.get("rglob_patterns", None)
validation_systems = process_systems(validation_systems, val_patterns)
validation_systems = process_systems(
validation_systems,
val_patterns,
fmt=validation_dataset_params.get("format", None),
out_fmt=validation_dataset_params.get(
"out_format", validation_dataset_params.get("output_format", None)
),
)
if len(validation_systems) == 1 and is_lmdb(validation_systems[0]):
raise NotImplementedError(
"Paddle backend does not support LMDB validation data yet. "
"Set validation_data.out_format to 'deepmd/hdf5' when using "
"validation_data.format for automatic conversion."
)
Comment on lines +123 to +144

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Block any LMDB-resolved dataset for Paddle, not only single-path LMDB.

Line 123 and Line 139 only reject len(...) == 1 LMDB results, but the message says Paddle does not support LMDB data yet. LMDB lists can bypass this guard and fail later in less clear ways.

Suggested fix
-        if len(training_systems) == 1 and is_lmdb(training_systems[0]):
+        if any(is_lmdb(path) for path in training_systems):
             raise NotImplementedError(
                 "Paddle backend does not support LMDB training data yet. "
                 "Set training_data.out_format to 'deepmd/hdf5' when using "
                 "training_data.format for automatic conversion."
             )
@@
-            if len(validation_systems) == 1 and is_lmdb(validation_systems[0]):
+            if any(is_lmdb(path) for path in validation_systems):
                 raise NotImplementedError(
                     "Paddle backend does not support LMDB validation data yet. "
                     "Set validation_data.out_format to 'deepmd/hdf5' when using "
                     "validation_data.format for automatic conversion."
                 )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepmd/pd/entrypoints/main.py` around lines 123 - 144, The current LMDB
validation checks for both training_systems and validation_systems only reject
LMDB when the result is a single system (len(...) == 1), but the error messages
indicate that Paddle does not support LMDB data in general. Remove the len(...)
== 1 condition from both the training_systems check (around line 123) and the
validation_systems check (around line 139) so that any LMDB dataset is rejected
regardless of whether it's a single system or multiple systems in the list. This
ensures that any LMDB-resolved dataset triggers the NotImplementedError with a
clear message, preventing less clear failures downstream.


# stat files
stat_file_path_single = data_dict_single.get("stat_file", None)
Expand Down
86 changes: 53 additions & 33 deletions deepmd/pt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,24 @@ def prepare_trainer_input_single(
def _make_dp_loader_set(
systems: str | list[str],
dataset_params: dict[str, Any],
) -> DpLoaderSet:
"""Create a DpLoaderSet from systems with pattern expansion."""
) -> DpLoaderSet | LmdbDataset:
"""Create a dataset from systems with pattern expansion/conversion."""
patterns = dataset_params.get("rglob_patterns", None)
systems = process_systems(systems, patterns=patterns)
systems = process_systems(
systems,
patterns=patterns,
fmt=dataset_params.get("format", None),
out_fmt=dataset_params.get(
"out_format", dataset_params.get("output_format", None)
),
)
if len(systems) == 1 and is_lmdb(systems[0]):
return LmdbDataset(
systems[0],
model_params_single["type_map"],
dataset_params["batch_size"],
auto_prob_style=dataset_params.get("auto_prob", None),
)
return DpLoaderSet(
Comment on lines +197 to 204

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Inspect PT data loader implementations/usages for LMDB awareness.
fd -t f --extension py | xargs rg -n -C3 '\bclass\s+DpLoaderSet\b|\bclass\s+LmdbDataset\b|\bis_lmdb\s*\('

# Check whether tests cover multi-system conversion to LMDB in PT paths.
rg -n -C3 'format|out_format|output_format|lmdb|DpLoaderSet|LmdbDataset' source/tests deepmd/pt

Repository: deepmodeling/deepmd-kit

Length of output: 50380


🏁 Script executed:

# Check DpLoaderSet implementation in PT
head -150 deepmd/pt/utils/dataloader.py | tail -100

# Check pt_expt's validation that rejects multi-LMDB
sed -n '80,95p' deepmd/pt_expt/entrypoints/main.py

# Verify if pt should have same check as pt_expt
diff -u <(sed -n '80,95p' deepmd/pt_expt/entrypoints/main.py) <(sed -n '190,205p' deepmd/pt/entrypoints/main.py) || true

Repository: deepmodeling/deepmd-kit

Length of output: 4467


🏁 Script executed:

# Check DeepmdDataSetForLoader to see if it handles LMDB
rg -A 20 "class DeepmdDataSetForLoader" deepmd/pt/

# Check if there's any LMDB handling in DpLoaderSet or construct_dataset
rg -B 5 -A 10 "def construct_dataset" deepmd/pt/utils/dataloader.py

# Check what process_systems actually returns for multi-LMDB cases
rg -B 3 -A 10 "def process_systems" deepmd/utils/data_system.py | head -60

Repository: deepmodeling/deepmd-kit

Length of output: 2444


🏁 Script executed:

# Check if DeepmdData can handle LMDB paths
rg -B 5 -A 25 "class DeepmdData" deepmd/pt/utils/dataset.py | head -50

# Check if DeepmdData/DeepmdDataSystem supports LMDB directly
rg "LMDB\|lmdb" deepmd/pt/utils/dataset.py

Repository: deepmodeling/deepmd-kit

Length of output: 1015


🏁 Script executed:

# Check DeepmdData implementation - is it from pt or common
rg -l "class DeepmdData" deepmd/

# Find where DeepmdData is defined
fd -t f -name "*.py" | xargs rg "^class DeepmdData" | head -5

# Check if DeepmdData supports LMDB
rg -B 2 -A 20 "def __init__" deepmd/utils/data.py | grep -A 20 "class DeepmdData" | head -40

Repository: deepmodeling/deepmd-kit

Length of output: 642


🏁 Script executed:

# Check DeepmdData __init__ to see what it accepts
sed -n '/^class DeepmdData/,/^class /p' deepmd/utils/data.py | head -80

# Check if there's any LMDB support in DeepmdData
rg "LMDB|lmdb" deepmd/utils/data.py

# Check if process_systems can return LMDB paths and how DpLoaderSet handles them
sed -n '1330,1360p' deepmd/utils/data_system.py

Repository: deepmodeling/deepmd-kit

Length of output: 4291


🏁 Script executed:

# Verify DeepmdData cannot handle LMDB by checking what happens if we pass LMDB path
rg -B 5 -A 30 "def __init__" deepmd/utils/data.py | grep -A 30 "class DeepmdData" | tail -40

# See what error DeepmdData would raise for LMDB
rg "is_dir\|set_prefix" deepmd/utils/data.py | head -10

# Confirm pt_expt validation function and compare intent
sed -n '75,100p' deepmd/pt_expt/entrypoints/main.py

Repository: deepmodeling/deepmd-kit

Length of output: 1077


🏁 Script executed:

# Check what error DeepmdData actually raises when given an LMDB path
rg -A 5 "is_dir" deepmd/utils/data.py | head -20

# Verify if there are tests for multi-LMDB conversion to see current behavior
rg -l "process_systems.*lmdb\|LMDB.*process_systems" source/tests/ | head -5

# Check if pt/entrypoints/main.py should have the _detect_lmdb_path guard like pt_expt
sed -n '190,210p' deepmd/pt/entrypoints/main.py

Repository: deepmodeling/deepmd-kit

Length of output: 1179


Add LMDB validation guard to prevent silent failures when multi-system conversion produces LMDB paths.

The check at line 197 only routes the single-LMDB case to LmdbDataset. If process_systems(...) returns multiple LMDB paths (e.g., from multi-input conversion with default LMDB output), they fall through to DpLoaderSet, which passes them to DeepmdData. Since DeepmdData expects set.* subdirectories and will fail with a confusing "No set.* found" error when given LMDB paths, add an explicit guard matching the one in deepmd/pt_expt/entrypoints/main.py lines 84-89:

if isinstance(systems, list) and any(isinstance(s, str) and is_lmdb(s) for s in systems):
    raise ValueError(
        "LMDB datasets must be passed as a scalar 'systems' string "
        "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
        "with LMDB paths are not supported."
    )

Place this check before the len(systems) == 1 condition to catch and clarify the error early.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepmd/pt/entrypoints/main.py` around lines 197 - 204, Add a validation guard
before the existing condition that checks `len(systems) == 1 and
is_lmdb(systems[0])` to prevent multiple LMDB paths from being passed to
DpLoaderSet. The guard should use `isinstance(systems, list)` combined with
`any(isinstance(s, str) and is_lmdb(s) for s in systems)` to detect when systems
is a list containing LMDB paths and raise a clear ValueError message explaining
that LMDB datasets must be passed as a scalar string rather than as a list.

systems,
dataset_params["batch_size"],
Expand All @@ -196,7 +210,11 @@ def _make_dp_loader_set(
)

# LMDB path: single string → LmdbDataset
if isinstance(training_systems, str) and is_lmdb(training_systems):
if (
training_dataset_params.get("format", None) is None
and isinstance(training_systems, str)
and is_lmdb(training_systems)
):
auto_prob = training_dataset_params.get("auto_prob", None)
train_data_single = LmdbDataset(
training_systems,
Expand All @@ -206,6 +224,7 @@ def _make_dp_loader_set(
)
if (
validation_systems is not None
and validation_dataset_params.get("format", None) is None
and isinstance(validation_systems, str)
and is_lmdb(validation_systems)
):
Expand Down Expand Up @@ -397,50 +416,51 @@ def train(
"Calculate neighbor statistics... (add --skip-neighbor-stat to skip this step)"
)

if not multi_task:
type_map = config["model"].get("type_map")
training_systems = config["training"]["training_data"].get("systems")
def _get_neighbor_stat_data_from_params(
dataset_params: dict[str, Any],
type_map: list[str] | None,
) -> Any:
training_systems = dataset_params.get("systems")
if (
training_systems is not None
dataset_params.get("format", None) is None
and training_systems is not None
and isinstance(training_systems, str)
and is_lmdb(training_systems)
):
systems = [training_systems]
else:
systems = process_systems(
training_systems,
patterns=dataset_params.get("rglob_patterns", None),
fmt=dataset_params.get("format", None),
out_fmt=dataset_params.get(
"out_format", dataset_params.get("output_format", None)
),
)
if len(systems) == 1 and is_lmdb(systems[0]):
from deepmd.dpmodel.utils.lmdb_data import (
make_neighbor_stat_data,
)

train_data = make_neighbor_stat_data(training_systems, type_map)
else:
train_data = get_data(
config["training"]["training_data"], 0, type_map, None
)
return make_neighbor_stat_data(systems[0], type_map)
return get_data(dataset_params, 0, type_map, None)

if not multi_task:
type_map = config["model"].get("type_map")
train_data = _get_neighbor_stat_data_from_params(
config["training"]["training_data"], type_map
)
config["model"], min_nbor_dist = BaseModel.update_sel(
train_data, type_map, config["model"]
)
else:
min_nbor_dist = {}
for model_item in config["model"]["model_dict"]:
type_map = config["model"]["model_dict"][model_item].get("type_map")
training_systems = config["training"]["data_dict"][model_item][
"training_data"
].get("systems")
if (
training_systems is not None
and isinstance(training_systems, str)
and is_lmdb(training_systems)
):
from deepmd.dpmodel.utils.lmdb_data import (
make_neighbor_stat_data,
)

train_data = make_neighbor_stat_data(training_systems, type_map)
else:
train_data = get_data(
config["training"]["data_dict"][model_item]["training_data"],
0,
type_map,
None,
)
train_data = _get_neighbor_stat_data_from_params(
config["training"]["data_dict"][model_item]["training_data"],
type_map,
)
config["model"]["model_dict"][model_item], min_nbor_dist[model_item] = (
BaseModel.update_sel(
train_data, type_map, config["model"]["model_dict"][model_item]
Expand Down
38 changes: 36 additions & 2 deletions deepmd/pt_expt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,31 @@ def _get_neighbor_stat_data(
``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for
npy/HDF5 directories.
"""
lmdb_path = _detect_lmdb_path(dataset_params.get("systems"))
lmdb_path = (
None
if dataset_params.get("format", None) is not None
else _detect_lmdb_path(dataset_params.get("systems"))
)
if lmdb_path is not None:
from deepmd.dpmodel.utils.lmdb_data import (
make_neighbor_stat_data,
)

return make_neighbor_stat_data(lmdb_path, type_map)
systems = process_systems(
dataset_params["systems"],
patterns=dataset_params.get("rglob_patterns", None),
fmt=dataset_params.get("format", None),
out_fmt=dataset_params.get(
"out_format", dataset_params.get("output_format", None)
),
)
if len(systems) == 1 and is_lmdb(systems[0]):
from deepmd.dpmodel.utils.lmdb_data import (
make_neighbor_stat_data,
)

return make_neighbor_stat_data(systems[0], type_map)
return get_data(dataset_params, 0, type_map, None)
Comment on lines +118 to 132

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Re-apply scalar-LMDB validation after format-based conversion.

_detect_lmdb_path explicitly disallows LMDB in list-form systems, but when format is set the code skips that check and only handles the single-LMDB case. If process_systems(...) returns multiple LMDB paths, this now falls through to generic paths (get_data / DeepmdDataSystem) instead of raising the intended clear error.

Suggested fix pattern
     systems = process_systems(
         dataset_params["systems"],
         patterns=dataset_params.get("rglob_patterns", None),
         fmt=dataset_params.get("format", None),
         out_fmt=dataset_params.get(
             "out_format", dataset_params.get("output_format", None)
         ),
     )
+    lmdb_systems = [s for s in systems if is_lmdb(s)]
+    if lmdb_systems and not (len(systems) == 1 and len(lmdb_systems) == 1):
+        raise ValueError(
+            "LMDB datasets must be passed as a scalar 'systems' string "
+            "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
+            "with LMDB paths are not supported."
+        )
     if len(systems) == 1 and is_lmdb(systems[0]):
         ...

Apply the same guard in both _get_neighbor_stat_data and _build_data_system.

Also applies to: 168-175

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepmd/pt_expt/entrypoints/main.py` around lines 118 - 132, The current code
in _get_neighbor_stat_data only validates the single-LMDB case with `if
len(systems) == 1 and is_lmdb(systems[0])`, but when format-based conversion
produces multiple LMDB paths, this check is skipped and execution falls through
to get_data() instead of raising an appropriate error for list-form LMDB
systems. Add validation guards in both _get_neighbor_stat_data and
_build_data_system functions to ensure that after process_systems() is called,
if any LMDB systems are returned, they are validated to not be in list form
(similar to what _detect_lmdb_path does), and raise a clear error before
reaching the fallback get_data() or DeepmdDataSystem paths.



Expand All @@ -126,7 +144,11 @@ def _build_data_system(
:class:`DeepmdDataSystem` path with system expansion.
"""
systems_raw = dataset_params["systems"]
lmdb_path = _detect_lmdb_path(systems_raw)
lmdb_path = (
None
if dataset_params.get("format", None) is not None
else _detect_lmdb_path(systems_raw)
)
if lmdb_path is not None:
return LmdbDataSystem(
lmdb_path=lmdb_path,
Expand All @@ -138,7 +160,19 @@ def _build_data_system(
systems = process_systems(
systems_raw,
patterns=dataset_params.get("rglob_patterns", None),
fmt=dataset_params.get("format", None),
out_fmt=dataset_params.get(
"out_format", dataset_params.get("output_format", None)
),
)
if len(systems) == 1 and is_lmdb(systems[0]):
return LmdbDataSystem(
lmdb_path=systems[0],
type_map=type_map,
batch_size=dataset_params["batch_size"],
auto_prob_style=dataset_params.get("auto_prob"),
seed=seed,
)
return DeepmdDataSystem(
systems=systems,
batch_size=dataset_params["batch_size"],
Expand Down
54 changes: 54 additions & 0 deletions deepmd/utils/argcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -4745,6 +4745,19 @@ def training_data_args() -> list[
doc_patterns = (
"The customized patterns used in `rglob` to collect all training systems. "
)
doc_format = (
"The input data format passed to dpdata for automatic conversion. "
"If this key is not set, `systems` must already point to DeePMD data. "
"If this key is set to a non-DeePMD format, each selected input path is "
"loaded by dpdata and converted before training. Use dpdata format names "
"such as `extxyz`, `ase/structure`, `ase/traj`, or `auto`."
)
doc_out_format = (
"The output data format passed to dpdata for automatic conversion. "
"When `format` requests conversion from a non-DeePMD format, this key "
"defaults to `lmdb`. Use a DeePMD format supported by dpdata, such as "
"`lmdb`, `deepmd/hdf5`, or `deepmd/npy`."
)
doc_batch_size = f'This key can be \n\n\
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
Expand Down Expand Up @@ -4783,6 +4796,20 @@ def training_data_args() -> list[
default=None,
doc=doc_patterns + doc_only_pt_supported,
),
Argument(
"format",
[str, None],
optional=True,
doc=doc_format,
),
Argument(
"out_format",
[str, None],
optional=True,
default="lmdb",
doc=doc_out_format,
alias=["output_format"],
),
Argument(
"batch_size",
[list[int], int, str],
Expand Down Expand Up @@ -4842,6 +4869,19 @@ def validation_data_args() -> list[
doc_patterns = (
"The customized patterns used in `rglob` to collect all validation systems. "
)
doc_format = (
"The input data format passed to dpdata for automatic conversion. "
"If this key is not set, `systems` must already point to DeePMD data. "
"If this key is set to a non-DeePMD format, each selected input path is "
"loaded by dpdata and converted before validation. Use dpdata format names "
"such as `extxyz`, `ase/structure`, `ase/traj`, or `auto`."
)
doc_out_format = (
"The output data format passed to dpdata for automatic conversion. "
"When `format` requests conversion from a non-DeePMD format, this key "
"defaults to `lmdb`. Use a DeePMD format supported by dpdata, such as "
"`lmdb`, `deepmd/hdf5`, or `deepmd/npy`."
)
doc_batch_size = f'This key can be \n\n\
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
Expand All @@ -4867,6 +4907,20 @@ def validation_data_args() -> list[
default=None,
doc=doc_patterns + doc_only_pt_supported,
),
Argument(
"format",
[str, None],
optional=True,
doc=doc_format,
),
Argument(
"out_format",
[str, None],
optional=True,
default="lmdb",
doc=doc_out_format,
alias=["output_format"],
),
Argument(
"batch_size",
[list[int], int, str],
Expand Down
Loading
Loading