Spaces:
Running on Zero
Running on Zero
Update app_v6.py
Browse files
app_v6.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
"""
|
| 2 |
-
=======================================
|
| 3 |
PII Reveal - Document Privacy Explorer
|
| 4 |
=======================================
|
| 5 |
|
|
@@ -91,8 +90,8 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
|
|
| 91 |
# APPLICATION LAYER
|
| 92 |
# =====================================================================
|
| 93 |
|
| 94 |
-
def extract_text(file_path: str) -> str:
|
| 95 |
-
suffix = Path(file_path).suffix.lower()
|
| 96 |
if suffix == ".pdf":
|
| 97 |
import fitz
|
| 98 |
doc = fitz.open(file_path)
|
|
@@ -220,13 +219,13 @@ def analyze_document_api(file: FileData) -> dict:
|
|
| 220 |
client.predict("/analyze_document", file=handle_file(path))
|
| 221 |
"""
|
| 222 |
path = file.get("path") or ""
|
| 223 |
-
suffix = Path(path).suffix.lower()
|
| 224 |
orig_name = file.get("orig_name") or Path(path).name
|
|
|
|
| 225 |
if suffix not in (".pdf", ".doc", ".docx"):
|
| 226 |
-
return {"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}
|
| 227 |
|
| 228 |
try:
|
| 229 |
-
text = extract_text(path)
|
| 230 |
if not text.strip():
|
| 231 |
return {"error": "No text content found."}
|
| 232 |
source_text, spans = run_pii_analysis(text)
|
|
@@ -258,7 +257,8 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
|
|
| 258 |
Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
|
| 259 |
download the file and also display timing."""
|
| 260 |
path = file.get("path") or ""
|
| 261 |
-
|
|
|
|
| 262 |
if suffix != ".pdf":
|
| 263 |
return {"error": "PDF redaction only accepts PDF input."}
|
| 264 |
try:
|
|
@@ -281,7 +281,6 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
|
|
| 281 |
except Exception as e:
|
| 282 |
return {"error": str(e)}
|
| 283 |
|
| 284 |
-
orig_name = file.get("orig_name") or Path(path).name
|
| 285 |
stem = Path(orig_name).stem or "document"
|
| 286 |
out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
|
| 287 |
out_path.write_bytes(pdf_bytes)
|
|
|
|
| 1 |
"""
|
|
|
|
| 2 |
PII Reveal - Document Privacy Explorer
|
| 3 |
=======================================
|
| 4 |
|
|
|
|
| 90 |
# APPLICATION LAYER
|
| 91 |
# =====================================================================
|
| 92 |
|
| 93 |
+
def extract_text(file_path: str, suffix: str | None = None) -> str:
|
| 94 |
+
suffix = (suffix or Path(file_path).suffix).lower()
|
| 95 |
if suffix == ".pdf":
|
| 96 |
import fitz
|
| 97 |
doc = fitz.open(file_path)
|
|
|
|
| 219 |
client.predict("/analyze_document", file=handle_file(path))
|
| 220 |
"""
|
| 221 |
path = file.get("path") or ""
|
|
|
|
| 222 |
orig_name = file.get("orig_name") or Path(path).name
|
| 223 |
+
suffix = Path(orig_name).suffix.lower()
|
| 224 |
if suffix not in (".pdf", ".doc", ".docx"):
|
| 225 |
+
return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
|
| 226 |
|
| 227 |
try:
|
| 228 |
+
text = extract_text(path, suffix=suffix)
|
| 229 |
if not text.strip():
|
| 230 |
return {"error": "No text content found."}
|
| 231 |
source_text, spans = run_pii_analysis(text)
|
|
|
|
| 257 |
Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
|
| 258 |
download the file and also display timing."""
|
| 259 |
path = file.get("path") or ""
|
| 260 |
+
orig_name = file.get("orig_name") or Path(path).name
|
| 261 |
+
suffix = Path(orig_name).suffix.lower()
|
| 262 |
if suffix != ".pdf":
|
| 263 |
return {"error": "PDF redaction only accepts PDF input."}
|
| 264 |
try:
|
|
|
|
| 281 |
except Exception as e:
|
| 282 |
return {"error": str(e)}
|
| 283 |
|
|
|
|
| 284 |
stem = Path(orig_name).stem or "document"
|
| 285 |
out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
|
| 286 |
out_path.write_bytes(pdf_bytes)
|