ysharma HF Staff committed on
Commit
cd1fc53
·
verified ·
1 Parent(s): 4e5b336

Update app_v6.py

Browse files
Files changed (1) hide show
  1. app_v6.py +28 -2
app_v6.py CHANGED
@@ -90,6 +90,32 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
90
  # APPLICATION LAYER
91
  # =====================================================================
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def extract_text(file_path: str, suffix: str | None = None) -> str:
94
  suffix = (suffix or Path(file_path).suffix).lower()
95
  if suffix == ".pdf":
@@ -220,7 +246,7 @@ def analyze_document_api(file: FileData) -> dict:
220
  """
221
  path = file.get("path") or ""
222
  orig_name = file.get("orig_name") or Path(path).name
223
- suffix = Path(orig_name).suffix.lower()
224
  if suffix not in (".pdf", ".doc", ".docx"):
225
  return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
226
 
@@ -258,7 +284,7 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
258
  download the file and also display timing."""
259
  path = file.get("path") or ""
260
  orig_name = file.get("orig_name") or Path(path).name
261
- suffix = Path(orig_name).suffix.lower()
262
  if suffix != ".pdf":
263
  return {"error": "PDF redaction only accepts PDF input."}
264
  try:
 
90
  # APPLICATION LAYER
91
  # =====================================================================
92
 
93
def _sniff_suffix(path: str) -> str:
    """Detect a document's type from its leading magic bytes.

    Used when the filename extension is missing (Gradio's server-side temp
    path often drops the suffix).

    Args:
        path: Filesystem path of the file to inspect.

    Returns:
        ".pdf", ".docx" or ".doc" when the content is recognised; "" when the
        file cannot be read or does not look like any of those formats.
    """
    import zipfile  # local import so the helper does not depend on file-level imports

    try:
        with open(path, "rb") as f:
            header = f.read(8)
    except OSError:
        return ""
    if header.startswith(b"%PDF-"):
        return ".pdf"
    if header.startswith(b"PK\x03\x04"):
        # The ZIP signature alone is shared by .xlsx, .pptx, .jar and plain
        # archives, so confirm the container actually holds Word parts
        # before reporting it as .docx.
        try:
            with zipfile.ZipFile(path) as archive:
                members = archive.namelist()
        except (OSError, zipfile.BadZipFile):
            return ""
        if any(member.startswith("word/") for member in members):
            return ".docx"
        return ""
    if header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):  # OLE2 — legacy .doc
        return ".doc"
    return ""
108
+
109
+
110
def _resolve_suffix(path: str, orig_name: str) -> str:
    """Return the lowercased file suffix from the best available source.

    The suffix of ``orig_name`` is preferred, then the suffix of ``path``.
    When neither name carries a suffix, the result of
    ``_sniff_suffix(path)`` is returned instead.
    """
    name_suffixes = (
        Path(name or "").suffix.lower() for name in (orig_name, path)
    )
    from_name = next((ext for ext in name_suffixes if ext), "")
    if from_name:
        return from_name
    return _sniff_suffix(path)
117
+
118
+
119
  def extract_text(file_path: str, suffix: str | None = None) -> str:
120
  suffix = (suffix or Path(file_path).suffix).lower()
121
  if suffix == ".pdf":
 
246
  """
247
  path = file.get("path") or ""
248
  orig_name = file.get("orig_name") or Path(path).name
249
+ suffix = _resolve_suffix(path, orig_name)
250
  if suffix not in (".pdf", ".doc", ".docx"):
251
  return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
252
 
 
284
  download the file and also display timing."""
285
  path = file.get("path") or ""
286
  orig_name = file.get("orig_name") or Path(path).name
287
+ suffix = _resolve_suffix(path, orig_name)
288
  if suffix != ".pdf":
289
  return {"error": "PDF redaction only accepts PDF input."}
290
  try: