ysharma HF Staff committed on
Commit
4e5b336
·
verified ·
1 Parent(s): e0bfcea

Update app_v6.py

Browse files
Files changed (1) hide show
  1. app_v6.py +7 -8
app_v6.py CHANGED
@@ -1,5 +1,4 @@
1
  """
2
- =======================================
3
  PII Reveal - Document Privacy Explorer
4
  =======================================
5
 
@@ -91,8 +90,8 @@ def predict_text(text: str) -> tuple[str, list[dict]]:
91
  # APPLICATION LAYER
92
  # =====================================================================
93
 
94
- def extract_text(file_path: str) -> str:
95
- suffix = Path(file_path).suffix.lower()
96
  if suffix == ".pdf":
97
  import fitz
98
  doc = fitz.open(file_path)
@@ -220,13 +219,13 @@ def analyze_document_api(file: FileData) -> dict:
220
  client.predict("/analyze_document", file=handle_file(path))
221
  """
222
  path = file.get("path") or ""
223
- suffix = Path(path).suffix.lower()
224
  orig_name = file.get("orig_name") or Path(path).name
 
225
  if suffix not in (".pdf", ".doc", ".docx"):
226
- return {"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}
227
 
228
  try:
229
- text = extract_text(path)
230
  if not text.strip():
231
  return {"error": "No text content found."}
232
  source_text, spans = run_pii_analysis(text)
@@ -258,7 +257,8 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
258
  Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
259
  download the file and also display timing."""
260
  path = file.get("path") or ""
261
- suffix = Path(path).suffix.lower()
 
262
  if suffix != ".pdf":
263
  return {"error": "PDF redaction only accepts PDF input."}
264
  try:
@@ -281,7 +281,6 @@ def redact_pdf_api(file: FileData, spans: str, active: str) -> dict:
281
  except Exception as e:
282
  return {"error": str(e)}
283
 
284
- orig_name = file.get("orig_name") or Path(path).name
285
  stem = Path(orig_name).stem or "document"
286
  out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
287
  out_path.write_bytes(pdf_bytes)
 
1
  """
 
2
  PII Reveal - Document Privacy Explorer
3
  =======================================
4
 
 
90
  # APPLICATION LAYER
91
  # =====================================================================
92
 
93
+ def extract_text(file_path: str, suffix: str | None = None) -> str:
94
+ suffix = (suffix or Path(file_path).suffix).lower()
95
  if suffix == ".pdf":
96
  import fitz
97
  doc = fitz.open(file_path)
 
219
  client.predict("/analyze_document", file=handle_file(path))
220
  """
221
  path = file.get("path") or ""
 
222
  orig_name = file.get("orig_name") or Path(path).name
223
+ suffix = Path(orig_name).suffix.lower()
224
  if suffix not in (".pdf", ".doc", ".docx"):
225
+ return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."}
226
 
227
  try:
228
+ text = extract_text(path, suffix=suffix)
229
  if not text.strip():
230
  return {"error": "No text content found."}
231
  source_text, spans = run_pii_analysis(text)
 
257
  Returns {"pdf": FileData, "elapsed_ms": int} so the caller can
258
  download the file and also display timing."""
259
  path = file.get("path") or ""
260
+ orig_name = file.get("orig_name") or Path(path).name
261
+ suffix = Path(orig_name).suffix.lower()
262
  if suffix != ".pdf":
263
  return {"error": "PDF redaction only accepts PDF input."}
264
  try:
 
281
  except Exception as e:
282
  return {"error": str(e)}
283
 
 
284
  stem = Path(orig_name).stem or "document"
285
  out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf"
286
  out_path.write_bytes(pdf_bytes)