SEUyishu committed on
Commit
0187c48
·
verified ·
1 Parent(s): 5aaf4b5

Update mcp_output/mcp_plugin/mcp_service.py

Browse files
Files changed (1) hide show
  1. mcp_output/mcp_plugin/mcp_service.py +85 -10
mcp_output/mcp_plugin/mcp_service.py CHANGED
@@ -229,6 +229,35 @@ def _safe_extract_tar(archive: tarfile.TarFile, destination: str) -> None:
229
  shutil.copyfileobj(src, dst)
230
 
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  # ============================================================================
233
  # 会话管理工具
234
  # ============================================================================
@@ -1097,7 +1126,10 @@ def process_structure_data(
1097
  target_index: int = 0,
1098
  graph_max_radius: float = 8.0,
1099
  graph_max_neighbors: int = 12,
1100
- reprocess: bool = False
 
 
 
1101
  ) -> dict:
1102
  """
1103
  Process atomic structure data into graph format.
@@ -1111,6 +1143,9 @@ def process_structure_data(
1111
  graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
1112
  graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
1113
  reprocess (bool): Whether to reprocess data even if processed files exist.
 
 
 
1114
 
1115
  Returns:
1116
  dict: Contains processing status and dataset information.
@@ -1127,6 +1162,8 @@ def process_structure_data(
1127
 
1128
  # If structure_contents provided, create temp directory
1129
  temp_dir = None
 
 
1130
  if structure_contents is not None:
1131
  if targets_csv is None:
1132
  return {"success": False, "error": "targets_csv is required when providing structure_contents"}
@@ -1136,7 +1173,8 @@ def process_structure_data(
1136
 
1137
  # Write structure files
1138
  for filename, content in structure_contents.items():
1139
- filepath = os.path.join(temp_dir, filename)
 
1140
  with open(filepath, 'w') as f:
1141
  f.write(content)
1142
 
@@ -1145,20 +1183,33 @@ def process_structure_data(
1145
  f.write(targets_csv)
1146
 
1147
  data_path = temp_dir
 
 
 
 
 
 
 
 
 
 
1148
 
1149
  if data_path is None:
1150
  return {"success": False, "error": "Either data_path or structure_contents must be provided"}
1151
 
1152
  if not os.path.exists(data_path):
1153
  return {"success": False, "error": f"Data path not found: {data_path}"}
 
 
 
1154
 
1155
  processing_args = {
1156
  "dataset_type": "inmemory",
1157
  "data_path": data_path,
1158
  "target_path": "targets.csv",
1159
- "dictionary_source": "default",
1160
- "dictionary_path": "atom_dict.json",
1161
- "data_format": "json",
1162
  "verbose": "True",
1163
  "graph_max_radius": graph_max_radius,
1164
  "graph_max_neighbors": graph_max_neighbors,
@@ -1187,6 +1238,7 @@ def process_structure_data(
1187
  "data_path": data_path,
1188
  "is_temporary": temp_dir is not None
1189
  }
 
1190
 
1191
  # Note: Don't delete temp_dir yet, it may be needed for training
1192
  if temp_dir:
@@ -1819,7 +1871,10 @@ def process_session_data(
1819
  target_index: int = 0,
1820
  graph_max_radius: float = 8.0,
1821
  graph_max_neighbors: int = 12,
1822
- reprocess: bool = True
 
 
 
1823
  ) -> dict:
1824
  """
1825
  Process all uploaded structure files in a session into graph format.
@@ -1830,6 +1885,9 @@ def process_session_data(
1830
  graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
1831
  graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
1832
  reprocess (bool): Force reprocessing even if already processed (default: True).
 
 
 
1833
 
1834
  Returns:
1835
  dict: Processing status and dataset statistics.
@@ -1851,20 +1909,36 @@ def process_session_data(
1851
  "error": "targets.csv not found. Please upload targets using upload_targets first."
1852
  }
1853
 
1854
- files = [f for f in os.listdir(data_path) if f != "targets.csv" and not f.startswith('.')]
 
 
 
 
1855
  if len(files) == 0:
1856
  return {
1857
  "success": False,
1858
  "error": "No structure files found. Please upload structure files first."
1859
  }
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  processing_args = {
1862
  "dataset_type": "inmemory",
1863
  "data_path": data_path,
1864
  "target_path": "targets.csv",
1865
- "dictionary_source": "default",
1866
- "dictionary_path": "atom_dict.json",
1867
- "data_format": "json",
1868
  "verbose": "True",
1869
  "graph_max_radius": graph_max_radius,
1870
  "graph_max_neighbors": graph_max_neighbors,
@@ -1897,6 +1971,7 @@ def process_session_data(
1897
  "avg_edges_per_structure": float(np.mean(num_edges_list)),
1898
  "num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
1899
  },
 
1900
  "ready_for_training": True,
1901
  "next_step": "Use train_session_model to train a model on this data."
1902
  }
 
229
  shutil.copyfileobj(src, dst)
230
 
231
 
232
+ def _infer_structure_format(structure_files: List[str]) -> Optional[str]:
233
+ """Guess the structure file extension for ASE parsing."""
234
+ normalized: Dict[str, int] = {}
235
+ for file_name in structure_files:
236
+ root, ext = os.path.splitext(file_name)
237
+ if ext:
238
+ ext = ext.lower().lstrip('.')
239
+ else:
240
+ upper_name = os.path.basename(file_name).upper()
241
+ if upper_name in {"POSCAR", "CONTCAR"}:
242
+ ext = "vasp"
243
+ else:
244
+ ext = ""
245
+ if not ext:
246
+ continue
247
+ normalized[ext] = normalized.get(ext, 0) + 1
248
+ if not normalized:
249
+ return None
250
+ if len(normalized) == 1:
251
+ return next(iter(normalized))
252
+ # Prefer common solid-state formats
253
+ priority = ["cif", "vasp", "poscar", "xyz", "json"]
254
+ sorted_items = sorted(normalized.items(), key=lambda item: (-item[1], priority.index(item[0]) if item[0] in priority else 99))
255
+ top_ext, top_count = sorted_items[0]
256
+ if len(structure_files) == top_count:
257
+ return top_ext
258
+ return None
259
+
260
+
261
  # ============================================================================
262
  # 会话管理工具
263
  # ============================================================================
 
1126
  target_index: int = 0,
1127
  graph_max_radius: float = 8.0,
1128
  graph_max_neighbors: int = 12,
1129
+ reprocess: bool = False,
1130
+ data_format: Optional[str] = None,
1131
+ dictionary_source: str = "default",
1132
+ dictionary_path: str = "atom_dict.json"
1133
  ) -> dict:
1134
  """
1135
  Process atomic structure data into graph format.
 
1143
  graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
1144
  graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
1145
  reprocess (bool): Whether to reprocess data even if processed files exist.
1146
+ data_format (str, optional): Explicit structure file format (e.g., 'cif', 'vasp', 'xyz', 'json').
1147
+ dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
1148
+ dictionary_path (str): Relative path to atom dictionary when dictionary_source='provided'.
1149
 
1150
  Returns:
1151
  dict: Contains processing status and dataset information.
 
1162
 
1163
  # If structure_contents provided, create temp directory
1164
  temp_dir = None
1165
+ determined_format = data_format.lower() if isinstance(data_format, str) else None
1166
+
1167
  if structure_contents is not None:
1168
  if targets_csv is None:
1169
  return {"success": False, "error": "targets_csv is required when providing structure_contents"}
 
1173
 
1174
  # Write structure files
1175
  for filename, content in structure_contents.items():
1176
+ clean_name = _normalize_filename(filename)
1177
+ filepath = _safe_join(temp_dir, clean_name)
1178
  with open(filepath, 'w') as f:
1179
  f.write(content)
1180
 
 
1183
  f.write(targets_csv)
1184
 
1185
  data_path = temp_dir
1186
+
1187
+ if determined_format is None:
1188
+ detected = _infer_structure_format(list(structure_contents.keys()))
1189
+ if detected:
1190
+ determined_format = detected
1191
+ elif determined_format is None:
1192
+ determined_format = _infer_structure_format([
1193
+ f for f in os.listdir(data_path)
1194
+ if os.path.isfile(os.path.join(data_path, f)) and f.lower() != "targets.csv"
1195
+ ])
1196
 
1197
  if data_path is None:
1198
  return {"success": False, "error": "Either data_path or structure_contents must be provided"}
1199
 
1200
  if not os.path.exists(data_path):
1201
  return {"success": False, "error": f"Data path not found: {data_path}"}
1202
+
1203
+ if determined_format is None:
1204
+ determined_format = "json"
1205
 
1206
  processing_args = {
1207
  "dataset_type": "inmemory",
1208
  "data_path": data_path,
1209
  "target_path": "targets.csv",
1210
+ "dictionary_source": dictionary_source,
1211
+ "dictionary_path": dictionary_path,
1212
+ "data_format": determined_format,
1213
  "verbose": "True",
1214
  "graph_max_radius": graph_max_radius,
1215
  "graph_max_neighbors": graph_max_neighbors,
 
1238
  "data_path": data_path,
1239
  "is_temporary": temp_dir is not None
1240
  }
1241
+ result["data_format"] = determined_format
1242
 
1243
  # Note: Don't delete temp_dir yet, it may be needed for training
1244
  if temp_dir:
 
1871
  target_index: int = 0,
1872
  graph_max_radius: float = 8.0,
1873
  graph_max_neighbors: int = 12,
1874
+ reprocess: bool = True,
1875
+ data_format: Optional[str] = None,
1876
+ dictionary_source: str = "default",
1877
+ dictionary_path: str = "atom_dict.json"
1878
  ) -> dict:
1879
  """
1880
  Process all uploaded structure files in a session into graph format.
 
1885
  graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
1886
  graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
1887
  reprocess (bool): Force reprocessing even if already processed (default: True).
1888
+ data_format (str, optional): Explicit structure format ('cif', 'vasp', 'xyz', 'json', ...). Use 'auto' to infer.
1889
+ dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
1890
+ dictionary_path (str): Path to atom dictionary within the session data when using 'provided'.
1891
 
1892
  Returns:
1893
  dict: Processing status and dataset statistics.
 
1909
  "error": "targets.csv not found. Please upload targets using upload_targets first."
1910
  }
1911
 
1912
+ skip_files = {"targets.csv", "id.json", "atom_dict.json"}
1913
+ files = [
1914
+ f for f in os.listdir(data_path)
1915
+ if f not in skip_files and not f.startswith('.')
1916
+ ]
1917
  if len(files) == 0:
1918
  return {
1919
  "success": False,
1920
  "error": "No structure files found. Please upload structure files first."
1921
  }
1922
 
1923
+ determined_format = data_format.lower() if isinstance(data_format, str) else None
1924
+ if determined_format in {"auto", "infer", ""}:
1925
+ determined_format = None
1926
+
1927
+ if determined_format is None:
1928
+ detected = _infer_structure_format(files)
1929
+ if detected:
1930
+ determined_format = detected
1931
+
1932
+ if determined_format is None:
1933
+ determined_format = "json"
1934
+
1935
  processing_args = {
1936
  "dataset_type": "inmemory",
1937
  "data_path": data_path,
1938
  "target_path": "targets.csv",
1939
+ "dictionary_source": dictionary_source,
1940
+ "dictionary_path": dictionary_path,
1941
+ "data_format": determined_format,
1942
  "verbose": "True",
1943
  "graph_max_radius": graph_max_radius,
1944
  "graph_max_neighbors": graph_max_neighbors,
 
1971
  "avg_edges_per_structure": float(np.mean(num_edges_list)),
1972
  "num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
1973
  },
1974
+ "data_format": determined_format,
1975
  "ready_for_training": True,
1976
  "next_step": "Use train_session_model to train a model on this data."
1977
  }