Spaces:
Running on Zero
Running on Zero
Update app_v6.py
Browse files
app_v6.py
CHANGED
|
@@ -90,6 +90,32 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
|
|
| 90 |
# APPLICATION LAYER
|
| 91 |
# =====================================================================
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def extract_text(file_path: str, suffix: str | None = None) -> str:
|
| 94 |
suffix = (suffix or Path(file_path).suffix).lower()
|
| 95 |
if suffix == ".pdf":
|
|
@@ -220,7 +246,7 @@ def analyze_document_api(file: FileData) -> dict:
|
|
| 220 |
"""
|
| 221 |
path = file.get("path") or ""
|
| 222 |
orig_name = file.get("orig_name") or Path(path).name
|
| 223 |
-
suffix =
|
| 224 |
if suffix not in (".pdf", ".doc", ".docx"):
|
| 225 |
return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
|
| 226 |
|
|
@@ -258,7 +284,7 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
|
|
| 258 |
download the file and also display timing."""
|
| 259 |
path = file.get("path") or ""
|
| 260 |
orig_name = file.get("orig_name") or Path(path).name
|
| 261 |
-
suffix =
|
| 262 |
if suffix != ".pdf":
|
| 263 |
return {"error": "PDF redaction only accepts PDF input."}
|
| 264 |
try:
|
|
|
|
| 90 |
# APPLICATION LAYER
|
| 91 |
# =====================================================================
|
| 92 |
|
| 93 |
+
def _sniff_suffix(path: str) -> str:
|
| 94 |
+
"""Detect file type from magic bytes when the filename extension is
|
| 95 |
+
missing (Gradio's server-side temp path often drops the suffix)."""
|
| 96 |
+
try:
|
| 97 |
+
with open(path, "rb") as f:
|
| 98 |
+
header = f.read(8)
|
| 99 |
+
except OSError:
|
| 100 |
+
return ""
|
| 101 |
+
if header.startswith(b"%PDF-"):
|
| 102 |
+
return ".pdf"
|
| 103 |
+
if header.startswith(b"PK\x03\x04"): # zip container — .docx
|
| 104 |
+
return ".docx"
|
| 105 |
+
if header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"): # OLE2 — legacy .doc
|
| 106 |
+
return ".doc"
|
| 107 |
+
return ""
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _resolve_suffix(path: str, orig_name: str) -> str:
|
| 111 |
+
"""Pick the best available suffix: orig_name → path → magic bytes."""
|
| 112 |
+
for candidate in (orig_name, path):
|
| 113 |
+
s = Path(candidate or "").suffix.lower()
|
| 114 |
+
if s:
|
| 115 |
+
return s
|
| 116 |
+
return _sniff_suffix(path)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
def extract_text(file_path: str, suffix: str | None = None) -> str:
|
| 120 |
suffix = (suffix or Path(file_path).suffix).lower()
|
| 121 |
if suffix == ".pdf":
|
|
|
|
| 246 |
"""
|
| 247 |
path = file.get("path") or ""
|
| 248 |
orig_name = file.get("orig_name") or Path(path).name
|
| 249 |
+
suffix = _resolve_suffix(path, orig_name)
|
| 250 |
if suffix not in (".pdf", ".doc", ".docx"):
|
| 251 |
return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
|
| 252 |
|
|
|
|
| 284 |
download the file and also display timing."""
|
| 285 |
path = file.get("path") or ""
|
| 286 |
orig_name = file.get("orig_name") or Path(path).name
|
| 287 |
+
suffix = _resolve_suffix(path, orig_name)
|
| 288 |
if suffix != ".pdf":
|
| 289 |
return {"error": "PDF redaction only accepts PDF input."}
|
| 290 |
try:
|