-
Notifications
You must be signed in to change notification settings - Fork 629
feat(data): add dpdata format conversion #5565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -183,10 +183,24 @@ def prepare_trainer_input_single( | |
| def _make_dp_loader_set( | ||
| systems: str | list[str], | ||
| dataset_params: dict[str, Any], | ||
| ) -> DpLoaderSet: | ||
| """Create a DpLoaderSet from systems with pattern expansion.""" | ||
| ) -> DpLoaderSet | LmdbDataset: | ||
| """Create a dataset from systems with pattern expansion/conversion.""" | ||
| patterns = dataset_params.get("rglob_patterns", None) | ||
| systems = process_systems(systems, patterns=patterns) | ||
| systems = process_systems( | ||
| systems, | ||
| patterns=patterns, | ||
| fmt=dataset_params.get("format", None), | ||
| out_fmt=dataset_params.get( | ||
| "out_format", dataset_params.get("output_format", None) | ||
| ), | ||
| ) | ||
| if len(systems) == 1 and is_lmdb(systems[0]): | ||
| return LmdbDataset( | ||
| systems[0], | ||
| model_params_single["type_map"], | ||
| dataset_params["batch_size"], | ||
| auto_prob_style=dataset_params.get("auto_prob", None), | ||
| ) | ||
| return DpLoaderSet( | ||
|
Comment on lines
+197
to
204
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
set -euo pipefail
# Inspect PT data loader implementations/usages for LMDB awareness.
fd -t f --extension py | xargs rg -n -C3 '\bclass\s+DpLoaderSet\b|\bclass\s+LmdbDataset\b|\bis_lmdb\s*\('
# Check whether tests cover multi-system conversion to LMDB in PT paths.
rg -n -C3 'format|out_format|output_format|lmdb|DpLoaderSet|LmdbDataset' source/tests deepmd/ptRepository: deepmodeling/deepmd-kit Length of output: 50380 🏁 Script executed: # Check DpLoaderSet implementation in PT
head -150 deepmd/pt/utils/dataloader.py | tail -100
# Check pt_expt's validation that rejects multi-LMDB
sed -n '80,95p' deepmd/pt_expt/entrypoints/main.py
# Verify if pt should have same check as pt_expt
diff -u <(sed -n '80,95p' deepmd/pt_expt/entrypoints/main.py) <(sed -n '190,205p' deepmd/pt/entrypoints/main.py) || trueRepository: deepmodeling/deepmd-kit Length of output: 4467 🏁 Script executed: # Check DeepmdDataSetForLoader to see if it handles LMDB
rg -A 20 "class DeepmdDataSetForLoader" deepmd/pt/
# Check if there's any LMDB handling in DpLoaderSet or construct_dataset
rg -B 5 -A 10 "def construct_dataset" deepmd/pt/utils/dataloader.py
# Check what process_systems actually returns for multi-LMDB cases
rg -B 3 -A 10 "def process_systems" deepmd/utils/data_system.py | head -60Repository: deepmodeling/deepmd-kit Length of output: 2444 🏁 Script executed: # Check if DeepmdData can handle LMDB paths
rg -B 5 -A 25 "class DeepmdData" deepmd/pt/utils/dataset.py | head -50
# Check if DeepmdData/DeepmdDataSystem supports LMDB directly
rg "LMDB\|lmdb" deepmd/pt/utils/dataset.pyRepository: deepmodeling/deepmd-kit Length of output: 1015 🏁 Script executed: # Check DeepmdData implementation - is it from pt or common
rg -l "class DeepmdData" deepmd/
# Find where DeepmdData is defined
fd -t f -name "*.py" | xargs rg "^class DeepmdData" | head -5
# Check if DeepmdData supports LMDB
rg -B 2 -A 20 "def __init__" deepmd/utils/data.py | grep -A 20 "class DeepmdData" | head -40Repository: deepmodeling/deepmd-kit Length of output: 642 🏁 Script executed: # Check DeepmdData __init__ to see what it accepts
sed -n '/^class DeepmdData/,/^class /p' deepmd/utils/data.py | head -80
# Check if there's any LMDB support in DeepmdData
rg "LMDB|lmdb" deepmd/utils/data.py
# Check if process_systems can return LMDB paths and how DpLoaderSet handles them
sed -n '1330,1360p' deepmd/utils/data_system.pyRepository: deepmodeling/deepmd-kit Length of output: 4291 🏁 Script executed: # Verify DeepmdData cannot handle LMDB by checking what happens if we pass LMDB path
rg -B 5 -A 30 "def __init__" deepmd/utils/data.py | grep -A 30 "class DeepmdData" | tail -40
# See what error DeepmdData would raise for LMDB
rg "is_dir\|set_prefix" deepmd/utils/data.py | head -10
# Confirm pt_expt validation function and compare intent
sed -n '75,100p' deepmd/pt_expt/entrypoints/main.pyRepository: deepmodeling/deepmd-kit Length of output: 1077 🏁 Script executed: # Check what error DeepmdData actually raises when given an LMDB path
rg -A 5 "is_dir" deepmd/utils/data.py | head -20
# Verify if there are tests for multi-LMDB conversion to see current behavior
rg -l "process_systems.*lmdb\|LMDB.*process_systems" source/tests/ | head -5
# Check if pt/entrypoints/main.py should have the _detect_lmdb_path guard like pt_expt
sed -n '190,210p' deepmd/pt/entrypoints/main.pyRepository: deepmodeling/deepmd-kit Length of output: 1179 Add LMDB validation guard to prevent silent failures when multi-system conversion produces LMDB paths. The check at line 197 only routes the single-LMDB case to if isinstance(systems, list) and any(isinstance(s, str) and is_lmdb(s) for s in systems):
raise ValueError(
"LMDB datasets must be passed as a scalar 'systems' string "
"(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
"with LMDB paths are not supported."
)Place this check before the 🤖 Prompt for AI Agents |
||
| systems, | ||
| dataset_params["batch_size"], | ||
|
|
@@ -196,7 +210,11 @@ def _make_dp_loader_set( | |
| ) | ||
|
|
||
| # LMDB path: single string → LmdbDataset | ||
| if isinstance(training_systems, str) and is_lmdb(training_systems): | ||
| if ( | ||
| training_dataset_params.get("format", None) is None | ||
| and isinstance(training_systems, str) | ||
| and is_lmdb(training_systems) | ||
| ): | ||
| auto_prob = training_dataset_params.get("auto_prob", None) | ||
| train_data_single = LmdbDataset( | ||
| training_systems, | ||
|
|
@@ -206,6 +224,7 @@ def _make_dp_loader_set( | |
| ) | ||
| if ( | ||
| validation_systems is not None | ||
| and validation_dataset_params.get("format", None) is None | ||
| and isinstance(validation_systems, str) | ||
| and is_lmdb(validation_systems) | ||
| ): | ||
|
|
@@ -397,50 +416,51 @@ def train( | |
| "Calculate neighbor statistics... (add --skip-neighbor-stat to skip this step)" | ||
| ) | ||
|
|
||
| if not multi_task: | ||
| type_map = config["model"].get("type_map") | ||
| training_systems = config["training"]["training_data"].get("systems") | ||
| def _get_neighbor_stat_data_from_params( | ||
| dataset_params: dict[str, Any], | ||
| type_map: list[str] | None, | ||
| ) -> Any: | ||
| training_systems = dataset_params.get("systems") | ||
| if ( | ||
| training_systems is not None | ||
| dataset_params.get("format", None) is None | ||
| and training_systems is not None | ||
| and isinstance(training_systems, str) | ||
| and is_lmdb(training_systems) | ||
| ): | ||
| systems = [training_systems] | ||
| else: | ||
| systems = process_systems( | ||
| training_systems, | ||
| patterns=dataset_params.get("rglob_patterns", None), | ||
| fmt=dataset_params.get("format", None), | ||
| out_fmt=dataset_params.get( | ||
| "out_format", dataset_params.get("output_format", None) | ||
| ), | ||
| ) | ||
| if len(systems) == 1 and is_lmdb(systems[0]): | ||
| from deepmd.dpmodel.utils.lmdb_data import ( | ||
| make_neighbor_stat_data, | ||
| ) | ||
|
|
||
| train_data = make_neighbor_stat_data(training_systems, type_map) | ||
| else: | ||
| train_data = get_data( | ||
| config["training"]["training_data"], 0, type_map, None | ||
| ) | ||
| return make_neighbor_stat_data(systems[0], type_map) | ||
| return get_data(dataset_params, 0, type_map, None) | ||
|
|
||
| if not multi_task: | ||
| type_map = config["model"].get("type_map") | ||
| train_data = _get_neighbor_stat_data_from_params( | ||
| config["training"]["training_data"], type_map | ||
| ) | ||
| config["model"], min_nbor_dist = BaseModel.update_sel( | ||
| train_data, type_map, config["model"] | ||
| ) | ||
| else: | ||
| min_nbor_dist = {} | ||
| for model_item in config["model"]["model_dict"]: | ||
| type_map = config["model"]["model_dict"][model_item].get("type_map") | ||
| training_systems = config["training"]["data_dict"][model_item][ | ||
| "training_data" | ||
| ].get("systems") | ||
| if ( | ||
| training_systems is not None | ||
| and isinstance(training_systems, str) | ||
| and is_lmdb(training_systems) | ||
| ): | ||
| from deepmd.dpmodel.utils.lmdb_data import ( | ||
| make_neighbor_stat_data, | ||
| ) | ||
|
|
||
| train_data = make_neighbor_stat_data(training_systems, type_map) | ||
| else: | ||
| train_data = get_data( | ||
| config["training"]["data_dict"][model_item]["training_data"], | ||
| 0, | ||
| type_map, | ||
| None, | ||
| ) | ||
| train_data = _get_neighbor_stat_data_from_params( | ||
| config["training"]["data_dict"][model_item]["training_data"], | ||
| type_map, | ||
| ) | ||
| config["model"]["model_dict"][model_item], min_nbor_dist[model_item] = ( | ||
| BaseModel.update_sel( | ||
| train_data, type_map, config["model"]["model_dict"][model_item] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -104,13 +104,31 @@ def _get_neighbor_stat_data( | |
| ``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for | ||
| npy/HDF5 directories. | ||
| """ | ||
| lmdb_path = _detect_lmdb_path(dataset_params.get("systems")) | ||
| lmdb_path = ( | ||
| None | ||
| if dataset_params.get("format", None) is not None | ||
| else _detect_lmdb_path(dataset_params.get("systems")) | ||
| ) | ||
| if lmdb_path is not None: | ||
| from deepmd.dpmodel.utils.lmdb_data import ( | ||
| make_neighbor_stat_data, | ||
| ) | ||
|
|
||
| return make_neighbor_stat_data(lmdb_path, type_map) | ||
| systems = process_systems( | ||
| dataset_params["systems"], | ||
| patterns=dataset_params.get("rglob_patterns", None), | ||
| fmt=dataset_params.get("format", None), | ||
| out_fmt=dataset_params.get( | ||
| "out_format", dataset_params.get("output_format", None) | ||
| ), | ||
| ) | ||
| if len(systems) == 1 and is_lmdb(systems[0]): | ||
| from deepmd.dpmodel.utils.lmdb_data import ( | ||
| make_neighbor_stat_data, | ||
| ) | ||
|
|
||
| return make_neighbor_stat_data(systems[0], type_map) | ||
| return get_data(dataset_params, 0, type_map, None) | ||
|
Comment on lines
+118
to
132
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Re-apply scalar-LMDB validation after format-based conversion.
Suggested fix pattern systems = process_systems(
dataset_params["systems"],
patterns=dataset_params.get("rglob_patterns", None),
fmt=dataset_params.get("format", None),
out_fmt=dataset_params.get(
"out_format", dataset_params.get("output_format", None)
),
)
+ lmdb_systems = [s for s in systems if is_lmdb(s)]
+ if lmdb_systems and not (len(systems) == 1 and len(lmdb_systems) == 1):
+ raise ValueError(
+ "LMDB datasets must be passed as a scalar 'systems' string "
+ "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
+ "with LMDB paths are not supported."
+ )
if len(systems) == 1 and is_lmdb(systems[0]):
...Apply the same guard in both Also applies to: 168-175 🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
|
|
@@ -126,7 +144,11 @@ def _build_data_system( | |
| :class:`DeepmdDataSystem` path with system expansion. | ||
| """ | ||
| systems_raw = dataset_params["systems"] | ||
| lmdb_path = _detect_lmdb_path(systems_raw) | ||
| lmdb_path = ( | ||
| None | ||
| if dataset_params.get("format", None) is not None | ||
| else _detect_lmdb_path(systems_raw) | ||
| ) | ||
| if lmdb_path is not None: | ||
| return LmdbDataSystem( | ||
| lmdb_path=lmdb_path, | ||
|
|
@@ -138,7 +160,19 @@ def _build_data_system( | |
| systems = process_systems( | ||
| systems_raw, | ||
| patterns=dataset_params.get("rglob_patterns", None), | ||
| fmt=dataset_params.get("format", None), | ||
| out_fmt=dataset_params.get( | ||
| "out_format", dataset_params.get("output_format", None) | ||
| ), | ||
| ) | ||
| if len(systems) == 1 and is_lmdb(systems[0]): | ||
| return LmdbDataSystem( | ||
| lmdb_path=systems[0], | ||
| type_map=type_map, | ||
| batch_size=dataset_params["batch_size"], | ||
| auto_prob_style=dataset_params.get("auto_prob"), | ||
| seed=seed, | ||
| ) | ||
| return DeepmdDataSystem( | ||
| systems=systems, | ||
| batch_size=dataset_params["batch_size"], | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Block any LMDB-resolved dataset for Paddle, not only single-path LMDB.
Line 123 and Line 139 only reject
len(...) == 1LMDB results, but the message says Paddle does not support LMDB data yet. LMDB lists can bypass this guard and fail later in less clear ways.Suggested fix
🤖 Prompt for AI Agents