Spaces:

ysharma
/

OPF-Document-PII-Explorer

Running on Zero

App Files Files Community

ysharma HF Staff committed 15 days ago

Commit

4e5b336

verified ·

1 Parent(s): e0bfcea

Update app_v6.py

Browse files

Files changed (1) hide show

app_v6.py +7 -8

app_v6.py CHANGED Viewed

@@ -1,5 +1,4 @@
 """
-=======================================
 PII Reveal - Document Privacy Explorer
 =======================================
@@ -91,8 +90,8 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
 # APPLICATION  LAYER
 # =====================================================================
-def extract_text(file_path: str) -> str:
-    suffix = Path(file_path).suffix.lower()
     if suffix == ".pdf":
         import fitz
         doc = fitz.open(file_path)
@@ -220,13 +219,13 @@ def analyze_document_api(file: FileData) -> dict:
         client.predict("/analyze_document", file=handle_file(path))
     """
     path = file.get("path") or ""
-    suffix = Path(path).suffix.lower()
     orig_name = file.get("orig_name") or Path(path).name
     if suffix not in (".pdf", ".doc", ".docx"):
-        return {"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}
     try:
-        text = extract_text(path)
         if not text.strip():
             return {"error": "No text content found."}
         source_text, spans = run_pii_analysis(text)
@@ -258,7 +257,8 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
     Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
     download the file and also display timing."""
     path = file.get("path") or ""
-    suffix = Path(path).suffix.lower()
     if suffix != ".pdf":
         return {"error": "PDF redaction only accepts PDF input."}
     try:
@@ -281,7 +281,6 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
     except Exception as e:
         return {"error": str(e)}
-    orig_name = file.get("orig_name") or Path(path).name
     stem = Path(orig_name).stem or "document"
     out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
     out_path.write_bytes(pdf_bytes)

 """
 PII Reveal - Document Privacy Explorer
 =======================================
 # APPLICATION  LAYER
 # =====================================================================
+def extract_text(file_path: str, suffix: str | None = None) -> str:
+    suffix = (suffix or Path(file_path).suffix).lower()
     if suffix == ".pdf":
         import fitz
         doc = fitz.open(file_path)
         client.predict("/analyze_document", file=handle_file(path))
     """
     path = file.get("path") or ""
     orig_name = file.get("orig_name") or Path(path).name
+    suffix = Path(orig_name).suffix.lower()
     if suffix not in (".pdf", ".doc", ".docx"):
+        return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
     try:
+        text = extract_text(path, suffix=suffix)
         if not text.strip():
             return {"error": "No text content found."}
         source_text, spans = run_pii_analysis(text)
     Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
     download the file and also display timing."""
     path = file.get("path") or ""
+    orig_name = file.get("orig_name") or Path(path).name
+    suffix = Path(orig_name).suffix.lower()
     if suffix != ".pdf":
         return {"error": "PDF redaction only accepts PDF input."}
     try:
     except Exception as e:
         return {"error": str(e)}
     stem = Path(orig_name).stem or "document"
     out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
     out_path.write_bytes(pdf_bytes)