Spaces:
Sleeping
Sleeping
Update mcp_output/mcp_plugin/mcp_service.py
Browse files
mcp_output/mcp_plugin/mcp_service.py
CHANGED
|
@@ -229,6 +229,35 @@ def _safe_extract_tar(archive: tarfile.TarFile, destination: str) -> None:
|
|
| 229 |
shutil.copyfileobj(src, dst)
|
| 230 |
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
# ============================================================================
|
| 233 |
# 会话管理工具
|
| 234 |
# ============================================================================
|
|
@@ -1097,7 +1126,10 @@ def process_structure_data(
|
|
| 1097 |
target_index: int = 0,
|
| 1098 |
graph_max_radius: float = 8.0,
|
| 1099 |
graph_max_neighbors: int = 12,
|
| 1100 |
-
reprocess: bool = False
|
|
|
|
|
|
|
|
|
|
| 1101 |
) -> dict:
|
| 1102 |
"""
|
| 1103 |
Process atomic structure data into graph format.
|
|
@@ -1111,6 +1143,9 @@ def process_structure_data(
|
|
| 1111 |
graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
|
| 1112 |
graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
|
| 1113 |
reprocess (bool): Whether to reprocess data even if processed files exist.
|
|
|
|
|
|
|
|
|
|
| 1114 |
|
| 1115 |
Returns:
|
| 1116 |
dict: Contains processing status and dataset information.
|
|
@@ -1127,6 +1162,8 @@ def process_structure_data(
|
|
| 1127 |
|
| 1128 |
# If structure_contents provided, create temp directory
|
| 1129 |
temp_dir = None
|
|
|
|
|
|
|
| 1130 |
if structure_contents is not None:
|
| 1131 |
if targets_csv is None:
|
| 1132 |
return {"success": False, "error": "targets_csv is required when providing structure_contents"}
|
|
@@ -1136,7 +1173,8 @@ def process_structure_data(
|
|
| 1136 |
|
| 1137 |
# Write structure files
|
| 1138 |
for filename, content in structure_contents.items():
|
| 1139 |
-
|
|
|
|
| 1140 |
with open(filepath, 'w') as f:
|
| 1141 |
f.write(content)
|
| 1142 |
|
|
@@ -1145,20 +1183,33 @@ def process_structure_data(
|
|
| 1145 |
f.write(targets_csv)
|
| 1146 |
|
| 1147 |
data_path = temp_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1148 |
|
| 1149 |
if data_path is None:
|
| 1150 |
return {"success": False, "error": "Either data_path or structure_contents must be provided"}
|
| 1151 |
|
| 1152 |
if not os.path.exists(data_path):
|
| 1153 |
return {"success": False, "error": f"Data path not found: {data_path}"}
|
|
|
|
|
|
|
|
|
|
| 1154 |
|
| 1155 |
processing_args = {
|
| 1156 |
"dataset_type": "inmemory",
|
| 1157 |
"data_path": data_path,
|
| 1158 |
"target_path": "targets.csv",
|
| 1159 |
-
"dictionary_source":
|
| 1160 |
-
"dictionary_path":
|
| 1161 |
-
"data_format":
|
| 1162 |
"verbose": "True",
|
| 1163 |
"graph_max_radius": graph_max_radius,
|
| 1164 |
"graph_max_neighbors": graph_max_neighbors,
|
|
@@ -1187,6 +1238,7 @@ def process_structure_data(
|
|
| 1187 |
"data_path": data_path,
|
| 1188 |
"is_temporary": temp_dir is not None
|
| 1189 |
}
|
|
|
|
| 1190 |
|
| 1191 |
# Note: Don't delete temp_dir yet, it may be needed for training
|
| 1192 |
if temp_dir:
|
|
@@ -1819,7 +1871,10 @@ def process_session_data(
|
|
| 1819 |
target_index: int = 0,
|
| 1820 |
graph_max_radius: float = 8.0,
|
| 1821 |
graph_max_neighbors: int = 12,
|
| 1822 |
-
reprocess: bool = True
|
|
|
|
|
|
|
|
|
|
| 1823 |
) -> dict:
|
| 1824 |
"""
|
| 1825 |
Process all uploaded structure files in a session into graph format.
|
|
@@ -1830,6 +1885,9 @@ def process_session_data(
|
|
| 1830 |
graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
|
| 1831 |
graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
|
| 1832 |
reprocess (bool): Force reprocessing even if already processed (default: True).
|
|
|
|
|
|
|
|
|
|
| 1833 |
|
| 1834 |
Returns:
|
| 1835 |
dict: Processing status and dataset statistics.
|
|
@@ -1851,20 +1909,36 @@ def process_session_data(
|
|
| 1851 |
"error": "targets.csv not found. Please upload targets using upload_targets first."
|
| 1852 |
}
|
| 1853 |
|
| 1854 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1855 |
if len(files) == 0:
|
| 1856 |
return {
|
| 1857 |
"success": False,
|
| 1858 |
"error": "No structure files found. Please upload structure files first."
|
| 1859 |
}
|
| 1860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1861 |
processing_args = {
|
| 1862 |
"dataset_type": "inmemory",
|
| 1863 |
"data_path": data_path,
|
| 1864 |
"target_path": "targets.csv",
|
| 1865 |
-
"dictionary_source":
|
| 1866 |
-
"dictionary_path":
|
| 1867 |
-
"data_format":
|
| 1868 |
"verbose": "True",
|
| 1869 |
"graph_max_radius": graph_max_radius,
|
| 1870 |
"graph_max_neighbors": graph_max_neighbors,
|
|
@@ -1897,6 +1971,7 @@ def process_session_data(
|
|
| 1897 |
"avg_edges_per_structure": float(np.mean(num_edges_list)),
|
| 1898 |
"num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
|
| 1899 |
},
|
|
|
|
| 1900 |
"ready_for_training": True,
|
| 1901 |
"next_step": "Use train_session_model to train a model on this data."
|
| 1902 |
}
|
|
|
|
| 229 |
shutil.copyfileobj(src, dst)
|
| 230 |
|
| 231 |
|
| 232 |
+
def _infer_structure_format(structure_files: List[str]) -> Optional[str]:
|
| 233 |
+
"""Guess the structure file extension for ASE parsing."""
|
| 234 |
+
normalized: Dict[str, int] = {}
|
| 235 |
+
for file_name in structure_files:
|
| 236 |
+
root, ext = os.path.splitext(file_name)
|
| 237 |
+
if ext:
|
| 238 |
+
ext = ext.lower().lstrip('.')
|
| 239 |
+
else:
|
| 240 |
+
upper_name = os.path.basename(file_name).upper()
|
| 241 |
+
if upper_name in {"POSCAR", "CONTCAR"}:
|
| 242 |
+
ext = "vasp"
|
| 243 |
+
else:
|
| 244 |
+
ext = ""
|
| 245 |
+
if not ext:
|
| 246 |
+
continue
|
| 247 |
+
normalized[ext] = normalized.get(ext, 0) + 1
|
| 248 |
+
if not normalized:
|
| 249 |
+
return None
|
| 250 |
+
if len(normalized) == 1:
|
| 251 |
+
return next(iter(normalized))
|
| 252 |
+
# Prefer common solid-state formats
|
| 253 |
+
priority = ["cif", "vasp", "poscar", "xyz", "json"]
|
| 254 |
+
sorted_items = sorted(normalized.items(), key=lambda item: (-item[1], priority.index(item[0]) if item[0] in priority else 99))
|
| 255 |
+
top_ext, top_count = sorted_items[0]
|
| 256 |
+
if len(structure_files) == top_count:
|
| 257 |
+
return top_ext
|
| 258 |
+
return None
|
| 259 |
+
|
| 260 |
+
|
| 261 |
# ============================================================================
|
| 262 |
# 会话管理工具
|
| 263 |
# ============================================================================
|
|
|
|
| 1126 |
target_index: int = 0,
|
| 1127 |
graph_max_radius: float = 8.0,
|
| 1128 |
graph_max_neighbors: int = 12,
|
| 1129 |
+
reprocess: bool = False,
|
| 1130 |
+
data_format: Optional[str] = None,
|
| 1131 |
+
dictionary_source: str = "default",
|
| 1132 |
+
dictionary_path: str = "atom_dict.json"
|
| 1133 |
) -> dict:
|
| 1134 |
"""
|
| 1135 |
Process atomic structure data into graph format.
|
|
|
|
| 1143 |
graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
|
| 1144 |
graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
|
| 1145 |
reprocess (bool): Whether to reprocess data even if processed files exist.
|
| 1146 |
+
data_format (str, optional): Explicit structure file format (e.g., 'cif', 'vasp', 'xyz', 'json').
|
| 1147 |
+
dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
|
| 1148 |
+
dictionary_path (str): Relative path to atom dictionary when dictionary_source='provided'.
|
| 1149 |
|
| 1150 |
Returns:
|
| 1151 |
dict: Contains processing status and dataset information.
|
|
|
|
| 1162 |
|
| 1163 |
# If structure_contents provided, create temp directory
|
| 1164 |
temp_dir = None
|
| 1165 |
+
determined_format = data_format.lower() if isinstance(data_format, str) else None
|
| 1166 |
+
|
| 1167 |
if structure_contents is not None:
|
| 1168 |
if targets_csv is None:
|
| 1169 |
return {"success": False, "error": "targets_csv is required when providing structure_contents"}
|
|
|
|
| 1173 |
|
| 1174 |
# Write structure files
|
| 1175 |
for filename, content in structure_contents.items():
|
| 1176 |
+
clean_name = _normalize_filename(filename)
|
| 1177 |
+
filepath = _safe_join(temp_dir, clean_name)
|
| 1178 |
with open(filepath, 'w') as f:
|
| 1179 |
f.write(content)
|
| 1180 |
|
|
|
|
| 1183 |
f.write(targets_csv)
|
| 1184 |
|
| 1185 |
data_path = temp_dir
|
| 1186 |
+
|
| 1187 |
+
if determined_format is None:
|
| 1188 |
+
detected = _infer_structure_format(list(structure_contents.keys()))
|
| 1189 |
+
if detected:
|
| 1190 |
+
determined_format = detected
|
| 1191 |
+
elif determined_format is None:
|
| 1192 |
+
determined_format = _infer_structure_format([
|
| 1193 |
+
f for f in os.listdir(data_path)
|
| 1194 |
+
if os.path.isfile(os.path.join(data_path, f)) and f.lower() != "targets.csv"
|
| 1195 |
+
])
|
| 1196 |
|
| 1197 |
if data_path is None:
|
| 1198 |
return {"success": False, "error": "Either data_path or structure_contents must be provided"}
|
| 1199 |
|
| 1200 |
if not os.path.exists(data_path):
|
| 1201 |
return {"success": False, "error": f"Data path not found: {data_path}"}
|
| 1202 |
+
|
| 1203 |
+
if determined_format is None:
|
| 1204 |
+
determined_format = "json"
|
| 1205 |
|
| 1206 |
processing_args = {
|
| 1207 |
"dataset_type": "inmemory",
|
| 1208 |
"data_path": data_path,
|
| 1209 |
"target_path": "targets.csv",
|
| 1210 |
+
"dictionary_source": dictionary_source,
|
| 1211 |
+
"dictionary_path": dictionary_path,
|
| 1212 |
+
"data_format": determined_format,
|
| 1213 |
"verbose": "True",
|
| 1214 |
"graph_max_radius": graph_max_radius,
|
| 1215 |
"graph_max_neighbors": graph_max_neighbors,
|
|
|
|
| 1238 |
"data_path": data_path,
|
| 1239 |
"is_temporary": temp_dir is not None
|
| 1240 |
}
|
| 1241 |
+
result["data_format"] = determined_format
|
| 1242 |
|
| 1243 |
# Note: Don't delete temp_dir yet, it may be needed for training
|
| 1244 |
if temp_dir:
|
|
|
|
| 1871 |
target_index: int = 0,
|
| 1872 |
graph_max_radius: float = 8.0,
|
| 1873 |
graph_max_neighbors: int = 12,
|
| 1874 |
+
reprocess: bool = True,
|
| 1875 |
+
data_format: Optional[str] = None,
|
| 1876 |
+
dictionary_source: str = "default",
|
| 1877 |
+
dictionary_path: str = "atom_dict.json"
|
| 1878 |
) -> dict:
|
| 1879 |
"""
|
| 1880 |
Process all uploaded structure files in a session into graph format.
|
|
|
|
| 1885 |
graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
|
| 1886 |
graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
|
| 1887 |
reprocess (bool): Force reprocessing even if already processed (default: True).
|
| 1888 |
+
data_format (str, optional): Explicit structure format ('cif', 'vasp', 'xyz', 'json', ...). Use 'auto' to infer.
|
| 1889 |
+
dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
|
| 1890 |
+
dictionary_path (str): Path to atom dictionary within the session data when using 'provided'.
|
| 1891 |
|
| 1892 |
Returns:
|
| 1893 |
dict: Processing status and dataset statistics.
|
|
|
|
| 1909 |
"error": "targets.csv not found. Please upload targets using upload_targets first."
|
| 1910 |
}
|
| 1911 |
|
| 1912 |
+
skip_files = {"targets.csv", "id.json", "atom_dict.json"}
|
| 1913 |
+
files = [
|
| 1914 |
+
f for f in os.listdir(data_path)
|
| 1915 |
+
if f not in skip_files and not f.startswith('.')
|
| 1916 |
+
]
|
| 1917 |
if len(files) == 0:
|
| 1918 |
return {
|
| 1919 |
"success": False,
|
| 1920 |
"error": "No structure files found. Please upload structure files first."
|
| 1921 |
}
|
| 1922 |
|
| 1923 |
+
determined_format = data_format.lower() if isinstance(data_format, str) else None
|
| 1924 |
+
if determined_format in {"auto", "infer", ""}:
|
| 1925 |
+
determined_format = None
|
| 1926 |
+
|
| 1927 |
+
if determined_format is None:
|
| 1928 |
+
detected = _infer_structure_format(files)
|
| 1929 |
+
if detected:
|
| 1930 |
+
determined_format = detected
|
| 1931 |
+
|
| 1932 |
+
if determined_format is None:
|
| 1933 |
+
determined_format = "json"
|
| 1934 |
+
|
| 1935 |
processing_args = {
|
| 1936 |
"dataset_type": "inmemory",
|
| 1937 |
"data_path": data_path,
|
| 1938 |
"target_path": "targets.csv",
|
| 1939 |
+
"dictionary_source": dictionary_source,
|
| 1940 |
+
"dictionary_path": dictionary_path,
|
| 1941 |
+
"data_format": determined_format,
|
| 1942 |
"verbose": "True",
|
| 1943 |
"graph_max_radius": graph_max_radius,
|
| 1944 |
"graph_max_neighbors": graph_max_neighbors,
|
|
|
|
| 1971 |
"avg_edges_per_structure": float(np.mean(num_edges_list)),
|
| 1972 |
"num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
|
| 1973 |
},
|
| 1974 |
+
"data_format": determined_format,
|
| 1975 |
"ready_for_training": True,
|
| 1976 |
"next_step": "Use train_session_model to train a model on this data."
|
| 1977 |
}
|