Spaces:

ysharma
/

OPF-Document-PII-Explorer

Running on Zero

App Files Files Community

ysharma HF Staff committed on 18 days ago

Commit

cd1fc53

verified ·

1 Parent(s): 4e5b336

Update app_v6.py

Browse files

Files changed (1) hide show

app_v6.py +28 -2

app_v6.py CHANGED Viewed

@@ -90,6 +90,32 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
 # APPLICATION  LAYER
 # =====================================================================
 def extract_text(file_path: str, suffix: str | None = None) -> str:
     suffix = (suffix or Path(file_path).suffix).lower()
     if suffix == ".pdf":
@@ -220,7 +246,7 @@ def analyze_document_api(file: FileData) -> dict:
     """
     path = file.get("path") or ""
     orig_name = file.get("orig_name") or Path(path).name
-    suffix = Path(orig_name).suffix.lower()
     if suffix not in (".pdf", ".doc", ".docx"):
         return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
@@ -258,7 +284,7 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
     download the file and also display timing."""
     path = file.get("path") or ""
     orig_name = file.get("orig_name") or Path(path).name
-    suffix = Path(orig_name).suffix.lower()
     if suffix != ".pdf":
         return {"error": "PDF redaction only accepts PDF input."}
     try:

 # APPLICATION  LAYER
 # =====================================================================
def _sniff_suffix(path: str) -> str:
    """Detect the file type from its content when the filename has no usable
    extension (Gradio's server-side temp path often drops the suffix).

    Args:
        path: Filesystem path of the uploaded file.

    Returns:
        ".pdf", ".docx" or ".doc" when the content is recognised, otherwise
        an empty string (also returned when the file cannot be read).
    """
    # Local import: only the ZIP branch needs it, and the file-level import
    # block is not guaranteed to provide it.
    import zipfile

    try:
        with open(path, "rb") as f:
            header = f.read(8)
    except OSError:
        return ""
    if header.startswith(b"%PDF-"):
        return ".pdf"
    if header.startswith(b"PK\x03\x04"):
        # The ZIP signature alone is shared by .xlsx, .pptx, .jar and plain
        # archives, so confirm the container actually holds Word parts
        # before reporting it as .docx.
        try:
            with zipfile.ZipFile(path) as archive:
                members = archive.namelist()
        except (zipfile.BadZipFile, OSError):
            return ""
        if any(member.startswith("word/") for member in members):
            return ".docx"
        return ""
    if header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        # OLE2 compound file, treated as legacy .doc.
        # NOTE(review): .xls / .ppt share this signature and are not told
        # apart here; doing so needs an OLE2 directory parser.
        return ".doc"
    return ""
def _resolve_suffix(path: str, orig_name: str) -> str:
    """Return the lower-cased file suffix from the first source that has one.

    Sources are tried in order: the extension of ``orig_name``, then the
    extension of ``path``, and finally content sniffing of the file at
    ``path`` via ``_sniff_suffix``.
    """
    # Lazy generator: the path's extension is only computed when the
    # original name did not yield one.
    name_suffixes = (
        Path(source or "").suffix.lower() for source in (orig_name, path)
    )
    from_name = next((ext for ext in name_suffixes if ext), "")
    if from_name:
        return from_name
    return _sniff_suffix(path)
 def extract_text(file_path: str, suffix: str | None = None) -> str:
     suffix = (suffix or Path(file_path).suffix).lower()
     if suffix == ".pdf":
     """
     path = file.get("path") or ""
     orig_name = file.get("orig_name") or Path(path).name
+    suffix = _resolve_suffix(path, orig_name)
     if suffix not in (".pdf", ".doc", ".docx"):
         return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
     download the file and also display timing."""
     path = file.get("path") or ""
     orig_name = file.get("orig_name") or Path(path).name
+    suffix = _resolve_suffix(path, orig_name)
     if suffix != ".pdf":
         return {"error": "PDF redaction only accepts PDF input."}
     try: