broadfield-dev committed on
Commit
ff3a113
·
verified ·
1 Parent(s): 2fcce3e

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +90 -210
processor.py CHANGED
@@ -19,10 +19,6 @@ class DatasetCommandCenter:
19
  # ==========================================
20
 
21
  def get_dataset_metadata(self, dataset_id):
22
- """
23
- Fetches available Configs (subsets), Splits, and License info
24
- without downloading the actual data rows.
25
- """
26
  configs = ['default']
27
  splits = ['train', 'test', 'validation']
28
  license_name = "unknown"
@@ -31,30 +27,22 @@ class DatasetCommandCenter:
31
  # 1. Fetch Configs
32
  try:
33
  found_configs = get_dataset_config_names(dataset_id, token=self.token)
34
- if found_configs:
35
- configs = found_configs
36
- except Exception:
37
- pass # Keep default
38
 
39
- # 2. Fetch Metadata (Splits & License)
40
  try:
41
  selected = configs[0]
42
- # This API call can fail on some datasets, so we wrap it safely
43
  infos = get_dataset_infos(dataset_id, token=self.token)
44
-
45
  info = None
46
- if selected in infos:
47
- info = infos[selected]
48
- elif 'default' in infos:
49
- info = infos['default']
50
- elif infos:
51
- info = list(infos.values())[0]
52
 
53
  if info:
54
  splits = list(info.splits.keys())
55
  license_name = info.license or "unknown"
56
- except Exception:
57
- pass # Keep defaults if metadata fails
58
 
59
  return {
60
  "status": "success",
@@ -66,176 +54,119 @@ class DatasetCommandCenter:
66
  return {"status": "error", "message": str(e)}
67
 
68
  def get_splits_for_config(self, dataset_id, config_name):
69
- """
70
- Updates the Split dropdown when the user changes the Config.
71
- """
72
  try:
73
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
74
- if config_name in infos:
75
- splits = list(infos[config_name].splits.keys())
76
- elif len(infos) > 0:
77
- splits = list(infos.values())[0].splits.keys()
78
- else:
79
- splits = ['train', 'test']
80
  return {"status": "success", "splits": splits}
81
  except:
82
  return {"status": "success", "splits": ['train', 'test', 'validation']}
83
 
84
  def _flatten_object(self, obj, parent_key='', sep='.'):
85
- """
86
- Recursively finds all keys in nested dicts or JSON strings
87
- to populate the 'Simple Path' dropdown in the UI.
88
- """
89
  items = {}
90
-
91
- # Transparently parse JSON strings
92
  if isinstance(obj, str):
93
  s = obj.strip()
94
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
95
- try:
96
- obj = json.loads(s)
97
- except:
98
- pass # Keep as string if parse fails
99
 
100
  if isinstance(obj, dict):
101
  for k, v in obj.items():
102
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
103
  items.update(self._flatten_object(v, new_key, sep=sep))
104
  elif isinstance(obj, list):
105
- # We mark lists but do not recurse infinitely
106
- new_key = f"{parent_key}" if parent_key else "list_content"
107
- items[new_key] = "List"
108
  else:
109
- # Leaf node
110
  items[parent_key] = type(obj).__name__
111
-
112
  return items
113
 
114
  def inspect_dataset(self, dataset_id, config, split):
115
- """
116
- Scans the first 10 rows to build a Schema Tree for the UI.
117
- """
118
  try:
119
  conf = config if config != 'default' else None
120
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
121
 
122
  sample_rows = []
123
  available_paths = set()
124
- schema_map = {} # Used for List Mode detection
125
 
126
  for i, row in enumerate(ds_stream):
127
  if i >= 10: break
128
 
129
- # 1. Clean row for UI Preview (convert objects to strings)
130
- clean_row = {}
131
- for k, v in row.items():
132
- if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
133
- clean_row[k] = str(v)
134
- else:
135
- clean_row[k] = v
136
  sample_rows.append(clean_row)
137
 
138
- # 2. Deep Flattening for "Simple Path" dropdowns
139
  flattened = self._flatten_object(row)
140
  available_paths.update(flattened.keys())
141
 
142
- # 3. Top Level Analysis for "List Mode" detection
143
  for k, v in row.items():
144
- if k not in schema_map:
145
- schema_map[k] = {"type": "Object"}
146
-
147
  val = v
148
  if isinstance(val, str):
149
  try: val = json.loads(val)
150
  except: pass
151
-
152
- if isinstance(val, list):
153
- schema_map[k]["type"] = "List"
154
 
155
- # Reconstruct Schema Tree for UI grouping
156
  sorted_paths = sorted(list(available_paths))
157
  schema_tree = {}
158
  for path in sorted_paths:
159
  root = path.split('.')[0]
160
- if root not in schema_tree:
161
- schema_tree[root] = []
162
  schema_tree[root].append(path)
163
 
164
  return {
165
  "status": "success",
166
  "samples": sample_rows,
167
- "schema_tree": schema_tree, # Used by Simple Path Dropdown
168
- "schema": schema_map, # Used by List Mode Dropdown
169
  "dataset_id": dataset_id
170
  }
171
  except Exception as e:
172
  return {"status": "error", "message": str(e)}
173
 
174
  # ==========================================
175
- # 2. CORE EXTRACTION LOGIC
176
  # ==========================================
177
 
178
  def _get_value_by_path(self, obj, path):
179
- """
180
- Navigates dot notation (meta.user.id), automatically parsing
181
- JSON strings if encountered along the path.
182
- """
183
  if not path: return obj
184
  keys = path.split('.')
185
  current = obj
186
 
187
  for key in keys:
188
- # Auto-parse JSON string if encountered
189
  if isinstance(current, str):
190
  s = current.strip()
191
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
192
- try:
193
- current = json.loads(s)
194
- except:
195
- pass
196
 
197
  if isinstance(current, dict) and key in current:
198
  current = current[key]
199
  else:
200
- return None # Path broken
201
  return current
202
 
203
  def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
204
- """
205
- Logic for: FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
206
- """
207
  data = row.get(source_col)
208
-
209
- # Parse if string
210
  if isinstance(data, str):
211
- try:
212
- data = json.loads(data)
213
- except:
214
- return None
215
-
216
- if not isinstance(data, list):
217
- return None
218
 
219
  matched_item = None
220
  for item in data:
221
- # String comparison for safety
222
  if str(item.get(filter_key, '')) == str(filter_val):
223
  matched_item = item
224
  break
225
 
226
  if matched_item:
227
  return self._get_value_by_path(matched_item, target_path)
228
-
229
  return None
230
 
231
  def _apply_projection(self, row, recipe):
232
- """
233
- Builds the new row based on the recipe.
234
- Raises ValueError if user Python code fails (Fail Fast).
235
- """
236
  new_row = {}
237
-
238
- # Setup Eval Context (Variables available in Python Mode)
239
  eval_context = row.copy()
240
  eval_context['row'] = row
241
  eval_context['json'] = json
@@ -248,169 +179,118 @@ class DatasetCommandCenter:
248
  try:
249
  if t_type == 'simple':
250
  new_row[target_col] = self._get_value_by_path(row, col_def['source'])
251
-
252
  elif t_type == 'list_search':
253
  new_row[target_col] = self._extract_from_list_logic(
254
- row,
255
- col_def['source'],
256
- col_def['filter_key'],
257
- col_def['filter_val'],
258
- col_def['target_key']
259
  )
260
-
261
  elif t_type == 'python':
262
- # Execute user code
263
- expression = col_def['expression']
264
- val = eval(expression, {}, eval_context)
265
  new_row[target_col] = val
266
-
267
  except Exception as e:
268
- # Fail Fast: Stop the generator immediately if a column fails
269
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
270
 
271
  return new_row
272
 
 
 
 
 
 
 
 
 
 
 
 
273
  # ==========================================
274
- # 3. DOCUMENTATION (MODEL CARD)
275
  # ==========================================
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  def _generate_card(self, source_id, target_id, recipe, license_name):
278
- """
279
- Creates a README.md for the new dataset.
280
- """
281
  card_data = DatasetCardData(
282
  language="en",
283
  license=license_name,
284
- tags=["dataset-command-center", "etl", "generated-dataset"],
285
  base_model=source_id,
286
  )
287
-
288
  content = f"""
289
  # {target_id.split('/')[-1]}
290
-
291
  This dataset is a transformation of [{source_id}](https://huggingface.co/datasets/{source_id}).
292
- It was generated using the **Hugging Face Dataset Command Center**.
293
-
294
- ## Transformation Recipe
295
-
296
- The following operations were applied to the source data:
297
-
298
- | Target Column | Operation Type | Logic |
299
- |---------------|----------------|-------|
300
  """
301
  for col in recipe['columns']:
302
- c_type = col.get('type', 'simple')
303
- c_name = col['name']
304
-
305
- logic = "-"
306
- if c_type == 'simple':
307
- logic = f"Mapped from `{col.get('source')}`"
308
- elif c_type == 'list_search':
309
- logic = f"Extracted `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
310
- elif c_type == 'python':
311
- logic = f"Python: `{col.get('expression')}`"
312
-
313
- content += f"| **{c_name}** | {c_type} | {logic} |\n"
314
-
315
- if recipe.get('filter_rule'):
316
- content += f"\n### Row Filtering\n**Filter Applied:** `{recipe['filter_rule']}`\n"
317
-
318
- content += f"\n## Original License\nThis dataset inherits the license: `{license_name}` from the source."
319
-
320
- card = DatasetCard.from_template(card_data, content=content)
321
- return card
322
-
323
- # ==========================================
324
- # 4. EXECUTION
325
- # ==========================================
326
 
327
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
328
- logger.info(f"Job started: {source_id} -> {target_id}")
329
  conf = config if config != 'default' else None
330
 
331
  def gen():
332
  ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
333
  count = 0
334
  for i, row in enumerate(ds_stream):
335
- if max_rows and count >= int(max_rows):
336
- break
337
 
338
- # 1. Filter
339
  if recipe.get('filter_rule'):
340
  try:
341
  ctx = row.copy()
342
  ctx['row'] = row
343
  ctx['json'] = json
344
  ctx['re'] = re
345
- if not eval(recipe['filter_rule'], {}, ctx):
346
- continue
347
  except Exception as e:
348
- raise ValueError(f"Filter crashed on row {i}: {e}")
349
 
350
- # 2. Projection
351
  try:
352
  yield self._apply_projection(row, recipe)
353
  count += 1
354
- except ValueError as ve:
355
- # Pass the specific column error up
356
- raise ve
357
- except Exception as e:
358
- raise ValueError(f"Unexpected crash on row {i}: {e}")
359
 
360
  try:
361
- # 1. Process & Push Data
362
  new_dataset = datasets.Dataset.from_generator(gen)
363
  new_dataset.push_to_hub(target_id, token=self.token)
364
-
365
- # 2. Generate & Push Card
366
  try:
367
  card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
368
  card.push_to_hub(target_id, token=self.token)
369
- except Exception as e:
370
- logger.error(f"Failed to push Dataset Card: {e}")
371
-
372
  return {"status": "success", "rows_processed": len(new_dataset)}
373
-
374
  except Exception as e:
375
  logger.error(f"Job Failed: {e}")
376
- return {"status": "failed", "error": str(e)}
377
-
378
- # ==========================================
379
- # 5. PREVIEW
380
- # ==========================================
381
-
382
- def preview_transform(self, dataset_id, config, split, recipe):
383
- conf = config if config != 'default' else None
384
-
385
- try:
386
- ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
387
- processed = []
388
-
389
- for i, row in enumerate(ds_stream):
390
- if len(processed) >= 5: break
391
-
392
- # Check Filter
393
- passed = True
394
- if recipe.get('filter_rule'):
395
- try:
396
- ctx = row.copy()
397
- ctx['row'] = row
398
- ctx['json'] = json
399
- ctx['re'] = re
400
- if not eval(recipe['filter_rule'], {}, ctx):
401
- passed = False
402
- except:
403
- passed = False # Skip invalid rows in preview
404
-
405
- if passed:
406
- try:
407
- new_row = self._apply_projection(row, recipe)
408
- processed.append(new_row)
409
- except Exception as e:
410
- # In preview, we want to see the error, not crash
411
- processed.append({"_preview_error": f"Error: {str(e)}"})
412
-
413
- return processed
414
- except Exception as e:
415
- # Return global error if loading fails
416
- raise e
 
19
  # ==========================================
20
 
21
  def get_dataset_metadata(self, dataset_id):
 
 
 
 
22
  configs = ['default']
23
  splits = ['train', 'test', 'validation']
24
  license_name = "unknown"
 
27
  # 1. Fetch Configs
28
  try:
29
  found_configs = get_dataset_config_names(dataset_id, token=self.token)
30
+ if found_configs: configs = found_configs
31
+ except: pass
 
 
32
 
33
+ # 2. Fetch Metadata
34
  try:
35
  selected = configs[0]
 
36
  infos = get_dataset_infos(dataset_id, token=self.token)
 
37
  info = None
38
+ if selected in infos: info = infos[selected]
39
+ elif 'default' in infos: info = infos['default']
40
+ elif infos: info = list(infos.values())[0]
 
 
 
41
 
42
  if info:
43
  splits = list(info.splits.keys())
44
  license_name = info.license or "unknown"
45
+ except: pass
 
46
 
47
  return {
48
  "status": "success",
 
54
  return {"status": "error", "message": str(e)}
55
 
56
  def get_splits_for_config(self, dataset_id, config_name):
 
 
 
57
  try:
58
  infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
59
+ splits = list(infos[config_name].splits.keys())
 
 
 
 
 
60
  return {"status": "success", "splits": splits}
61
  except:
62
  return {"status": "success", "splits": ['train', 'test', 'validation']}
63
 
64
  def _flatten_object(self, obj, parent_key='', sep='.'):
65
+ """Recursively finds keys for the UI dropdowns."""
 
 
 
66
  items = {}
 
 
67
  if isinstance(obj, str):
68
  s = obj.strip()
69
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
70
+ try: obj = json.loads(s)
71
+ except: pass
 
 
72
 
73
  if isinstance(obj, dict):
74
  for k, v in obj.items():
75
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
76
  items.update(self._flatten_object(v, new_key, sep=sep))
77
  elif isinstance(obj, list):
78
+ items[parent_key or "list"] = "List"
 
 
79
  else:
 
80
  items[parent_key] = type(obj).__name__
 
81
  return items
82
 
83
  def inspect_dataset(self, dataset_id, config, split):
 
 
 
84
  try:
85
  conf = config if config != 'default' else None
86
  ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
87
 
88
  sample_rows = []
89
  available_paths = set()
90
+ schema_map = {}
91
 
92
  for i, row in enumerate(ds_stream):
93
  if i >= 10: break
94
 
95
+ # Clean row for UI (No objects)
96
+ clean_row = self._sanitize_for_json(row)
 
 
 
 
 
97
  sample_rows.append(clean_row)
98
 
99
+ # Schema Discovery
100
  flattened = self._flatten_object(row)
101
  available_paths.update(flattened.keys())
102
 
103
+ # List Mode Detection
104
  for k, v in row.items():
105
+ if k not in schema_map: schema_map[k] = {"type": "Object"}
 
 
106
  val = v
107
  if isinstance(val, str):
108
  try: val = json.loads(val)
109
  except: pass
110
+ if isinstance(val, list): schema_map[k]["type"] = "List"
 
 
111
 
 
112
  sorted_paths = sorted(list(available_paths))
113
  schema_tree = {}
114
  for path in sorted_paths:
115
  root = path.split('.')[0]
116
+ if root not in schema_tree: schema_tree[root] = []
 
117
  schema_tree[root].append(path)
118
 
119
  return {
120
  "status": "success",
121
  "samples": sample_rows,
122
+ "schema_tree": schema_tree,
123
+ "schema": schema_map,
124
  "dataset_id": dataset_id
125
  }
126
  except Exception as e:
127
  return {"status": "error", "message": str(e)}
128
 
129
  # ==========================================
130
+ # 2. CORE LOGIC
131
  # ==========================================
132
 
133
  def _get_value_by_path(self, obj, path):
 
 
 
 
134
  if not path: return obj
135
  keys = path.split('.')
136
  current = obj
137
 
138
  for key in keys:
 
139
  if isinstance(current, str):
140
  s = current.strip()
141
  if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
142
+ try: current = json.loads(s)
143
+ except: pass
 
 
144
 
145
  if isinstance(current, dict) and key in current:
146
  current = current[key]
147
  else:
148
+ return None
149
  return current
150
 
151
  def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
 
 
 
152
  data = row.get(source_col)
 
 
153
  if isinstance(data, str):
154
+ try: data = json.loads(data)
155
+ except: return None
156
+ if not isinstance(data, list): return None
 
 
 
 
157
 
158
  matched_item = None
159
  for item in data:
 
160
  if str(item.get(filter_key, '')) == str(filter_val):
161
  matched_item = item
162
  break
163
 
164
  if matched_item:
165
  return self._get_value_by_path(matched_item, target_path)
 
166
  return None
167
 
168
  def _apply_projection(self, row, recipe):
 
 
 
 
169
  new_row = {}
 
 
170
  eval_context = row.copy()
171
  eval_context['row'] = row
172
  eval_context['json'] = json
 
179
  try:
180
  if t_type == 'simple':
181
  new_row[target_col] = self._get_value_by_path(row, col_def['source'])
 
182
  elif t_type == 'list_search':
183
  new_row[target_col] = self._extract_from_list_logic(
184
+ row, col_def['source'], col_def['filter_key'], col_def['filter_val'], col_def['target_key']
 
 
 
 
185
  )
 
186
  elif t_type == 'python':
187
+ val = eval(col_def['expression'], {}, eval_context)
 
 
188
  new_row[target_col] = val
 
189
  except Exception as e:
 
190
  raise ValueError(f"Column '{target_col}' failed: {str(e)}")
191
 
192
  return new_row
193
 
194
+ def _sanitize_for_json(self, obj):
195
+ """Helper to ensure objects are JSON serializable (fixes Preview crash)."""
196
+ if isinstance(obj, dict):
197
+ return {k: self._sanitize_for_json(v) for k, v in obj.items()}
198
+ elif isinstance(obj, list):
199
+ return [self._sanitize_for_json(v) for v in obj]
200
+ elif isinstance(obj, (str, int, float, bool, type(None))):
201
+ return obj
202
+ else:
203
+ return str(obj) # Convert Timestamps, Images, etc to string
204
+
205
  # ==========================================
206
+ # 3. PREVIEW & EXECUTE
207
  # ==========================================
208
 
209
+ def preview_transform(self, dataset_id, config, split, recipe):
210
+ conf = config if config != 'default' else None
211
+
212
+ try:
213
+ ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
214
+ processed = []
215
+
216
+ for i, row in enumerate(ds_stream):
217
+ if len(processed) >= 5: break
218
+
219
+ # Filter
220
+ passed = True
221
+ if recipe.get('filter_rule'):
222
+ try:
223
+ ctx = row.copy()
224
+ ctx['row'] = row
225
+ ctx['json'] = json
226
+ ctx['re'] = re
227
+ if not eval(recipe['filter_rule'], {}, ctx): passed = False
228
+ except: passed = False # Skip crashing rows in preview
229
+
230
+ if passed:
231
+ try:
232
+ projected = self._apply_projection(row, recipe)
233
+ # SANITIZE OUTPUT so Flask doesn't crash on Timestamps/Images
234
+ clean_projected = self._sanitize_for_json(projected)
235
+ processed.append(clean_projected)
236
+ except Exception as e:
237
+ processed.append({"_preview_error": f"Error: {str(e)}"})
238
+
239
+ return processed
240
+ except Exception as e:
241
+ raise e
242
+
243
  def _generate_card(self, source_id, target_id, recipe, license_name):
 
 
 
244
  card_data = DatasetCardData(
245
  language="en",
246
  license=license_name,
247
+ tags=["dataset-command-center", "etl"],
248
  base_model=source_id,
249
  )
 
250
  content = f"""
251
  # {target_id.split('/')[-1]}
 
252
  This dataset is a transformation of [{source_id}](https://huggingface.co/datasets/{source_id}).
253
+ ## Recipe
 
 
 
 
 
 
 
254
  """
255
  for col in recipe['columns']:
256
+ content += f"- **{col['name']}**: {col.get('type')} ({col.get('source') or 'expr'})\n"
257
+ content += f"\n**License:** {license_name}"
258
+ return DatasetCard.from_template(card_data, content=content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
261
+ logger.info(f"Job: {source_id} -> {target_id}")
262
  conf = config if config != 'default' else None
263
 
264
  def gen():
265
  ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
266
  count = 0
267
  for i, row in enumerate(ds_stream):
268
+ if max_rows and count >= int(max_rows): break
 
269
 
 
270
  if recipe.get('filter_rule'):
271
  try:
272
  ctx = row.copy()
273
  ctx['row'] = row
274
  ctx['json'] = json
275
  ctx['re'] = re
276
+ if not eval(recipe['filter_rule'], {}, ctx): continue
 
277
  except Exception as e:
278
+ raise ValueError(f"Filter error row {i}: {e}")
279
 
 
280
  try:
281
  yield self._apply_projection(row, recipe)
282
  count += 1
283
+ except ValueError as ve: raise ve
284
+ except Exception as e: raise ValueError(f"Error row {i}: {e}")
 
 
 
285
 
286
  try:
 
287
  new_dataset = datasets.Dataset.from_generator(gen)
288
  new_dataset.push_to_hub(target_id, token=self.token)
 
 
289
  try:
290
  card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
291
  card.push_to_hub(target_id, token=self.token)
292
+ except: pass
 
 
293
  return {"status": "success", "rows_processed": len(new_dataset)}
 
294
  except Exception as e:
295
  logger.error(f"Job Failed: {e}")
296
+ return {"status": "failed", "error": str(e)}