Spaces:

lablab-ai-amd-developer-hackathon
/

ClauseGuard-AI

Sleeping

ClauseGuard-AI / clauseguard /tools /file_tools.py

muhammadbinmurtza

Restructure: clauseguard as package subfolder, app_file: clauseguard/app.py

913a064 14 days ago

4.62 kB

	"""File parsing utilities for contract documents."""

	import io
	from typing import Union

	import chardet


	def read_pdf(file_bytes: bytes) -> str:
	    """Extract text from a PDF file using PyMuPDF.

	    Falls back to pdfplumber if PyMuPDF extraction returns empty.

	    Args:
	        file_bytes: Raw bytes of the PDF file.

	    Returns:
	        Extracted text content as a string, with surrounding whitespace
	        stripped and pages joined by newlines.

	    Raises:
	        ValueError: If the PDF cannot be opened, or if neither PyMuPDF nor
	            the pdfplumber fallback yields any text. Errors raised while
	            extracting text from an opened document propagate unchanged.
	    """
	    import fitz

	    try:
	        doc = fitz.open(stream=file_bytes, filetype="pdf")
	    except Exception as e:
	        raise ValueError(f"Unable to open PDF file: {e}") from e

	    # Always release the document, even when a page fails to extract;
	    # otherwise the handle leaks on the error path.
	    try:
	        text_parts: list[str] = [page.get_text() for page in doc]
	    finally:
	        doc.close()

	    result = "\n".join(text_parts).strip()
	    if result:
	        return result

	    # PyMuPDF found nothing; give the second extractor a chance before
	    # reporting failure.
	    result = _read_pdf_with_pdfplumber(file_bytes)
	    if result:
	        return result

	    raise ValueError("PDF file contains no extractable text — try pasting the text directly")


	def _read_pdf_with_pdfplumber(file_bytes: bytes) -> str:
	"""Fallback PDF extraction using pdfplumber."""
	try:
	import pdfplumber
	except ImportError:
	return ""

	try:
	with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	text_parts: list[str] = []
	for page in pdf.pages:
	page_text = page.extract_text()
	if page_text:
	text_parts.append(page_text)
	return "\n".join(text_parts).strip()
	except Exception:
	return ""


	def read_docx(file_bytes: bytes) -> str:
	    """Pull the paragraph text out of a DOCX file with python-docx.

	    Only the document's paragraphs are read; paragraphs that are empty or
	    whitespace-only are dropped and the rest are joined with newlines.

	    Args:
	        file_bytes: Raw bytes of the DOCX file.

	    Returns:
	        The joined paragraph text, stripped of surrounding whitespace.

	    Raises:
	        ValueError: If the DOCX cannot be opened or yields no text.
	    """
	    from docx import Document

	    try:
	        document = Document(io.BytesIO(file_bytes))
	    except Exception as e:
	        raise ValueError(f"Unable to open DOCX file: {e}") from e

	    kept = [para.text for para in document.paragraphs if para.text.strip()]
	    result = "\n".join(kept).strip()

	    if result:
	        return result

	    raise ValueError("DOCX file contains no extractable text")


	def detect_encoding(file_bytes: bytes) -> str:
	    """Guess the character encoding of raw bytes with chardet.

	    Args:
	        file_bytes: Raw bytes to inspect.

	    Returns:
	        The encoding name reported by chardet, or 'utf-8' when chardet
	        reports no encoding (missing key, None, or empty string).
	    """
	    guess = chardet.detect(file_bytes).get("encoding", "utf-8")
	    if not guess:
	        # chardet could not name an encoding; use the default.
	        return "utf-8"
	    return guess


	def read_txt(file_bytes: bytes) -> str:
	    """Decode a plain text file, detecting its encoding automatically.

	    The detected encoding is tried first. If it is unknown to Python or
	    fails to decode the bytes, the content is decoded as UTF-8 with
	    undecodable bytes replaced.

	    Args:
	        file_bytes: Raw bytes of the text file.

	    Returns:
	        Decoded text with surrounding whitespace stripped.

	    Raises:
	        ValueError: If the decoded text is empty after stripping.
	    """
	    codec = detect_encoding(file_bytes)

	    try:
	        decoded = file_bytes.decode(codec)
	    except (UnicodeDecodeError, LookupError):
	        # Wrong or unsupported guess: fall back to lossy UTF-8 so the
	        # caller still receives text.
	        decoded = file_bytes.decode("utf-8", errors="replace")

	    stripped = decoded.strip()
	    if stripped:
	        return stripped

	    raise ValueError("Text file is empty or contains no readable content")


	# Dispatch table: lowercase extension (with leading dot) -> reader function.
	# This is the single source of truth for which file types are handled.
	READER_MAP = {
	    ".pdf": read_pdf,
	    ".txt": read_txt,
	    ".docx": read_docx,
	}
	# Derived from READER_MAP so the advertised extensions can never drift
	# from the readers that actually exist.
	SUPPORTED_EXTENSIONS = frozenset(READER_MAP)


	def extract_text(file_bytes: bytes, filename: str) -> str:
	    """Dispatch a file to the reader registered for its extension.

	    Args:
	        file_bytes: Raw bytes of the file.
	        filename: Original filename; its extension selects the reader.

	    Returns:
	        The text returned by the selected reader.

	    Raises:
	        ValueError: If the filename is empty, has no usable extension, has
	            an extension with no registered reader, or the reader itself
	            rejects the file.
	    """
	    if not filename:
	        raise ValueError("Filename is required to determine file type")

	    ext = _get_extension(filename)

	    # One lookup serves as both the membership test and the fetch.
	    reader = READER_MAP.get(ext)
	    if reader is None:
	        raise ValueError(
	            f"Unsupported file type: {ext}. Supported types: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
	        )

	    return reader(file_bytes)


	def _get_extension(filename: str) -> str:
	"""Extract the lowercase file extension from a filename."""
	dot_index = filename.rfind(".")
	if dot_index == -1 or dot_index == len(filename) - 1:
	raise ValueError(f"Cannot determine file type from filename: {filename}")
	return filename[dot_index:].lower()