File size: 4,617 Bytes
3552405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""File parsing utilities for contract documents."""

import io
from typing import Union

import chardet


def read_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF file using PyMuPDF.

    Falls back to pdfplumber if PyMuPDF extraction returns empty.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        Extracted text content as a string: the per-page text joined with
        newlines, with leading and trailing whitespace stripped.

    Raises:
        ValueError: If the PDF cannot be read or contains no extractable text.
    """
    # Imported at call time, so PyMuPDF is only needed when a PDF is parsed.
    import fitz

    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
    except Exception as e:
        raise ValueError(f"Unable to open PDF file: {e}") from e

    # Always release the document, even when a page fails to extract;
    # otherwise the open handle would leak on error.
    try:
        text_parts: list[str] = [page.get_text() for page in doc]
    finally:
        doc.close()

    result = "\n".join(text_parts).strip()
    if result:
        return result

    # PyMuPDF produced nothing; give the second extractor a chance.
    result = _read_pdf_with_pdfplumber(file_bytes)
    if result:
        return result

    raise ValueError("PDF file contains no extractable text — try pasting the text directly")


def _read_pdf_with_pdfplumber(file_bytes: bytes) -> str:
    """Fallback PDF extraction using pdfplumber."""
    try:
        import pdfplumber
    except ImportError:
        return ""

    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            text_parts: list[str] = []
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
        return "\n".join(text_parts).strip()
    except Exception:
        return ""


def read_docx(file_bytes: bytes) -> str:
    """Pull the paragraph text out of a DOCX file with python-docx.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        The text of all non-blank paragraphs, joined with newlines and
        stripped of surrounding whitespace.

    Raises:
        ValueError: If the DOCX cannot be opened or yields no text.
    """
    from docx import Document

    try:
        document = Document(io.BytesIO(file_bytes))
    except Exception as e:
        raise ValueError(f"Unable to open DOCX file: {e}") from e

    # Whitespace-only paragraphs are skipped entirely.
    non_blank = [para.text for para in document.paragraphs if para.text.strip()]
    text = "\n".join(non_blank).strip()

    if text:
        return text
    raise ValueError("DOCX file contains no extractable text")


def detect_encoding(file_bytes: bytes) -> str:
    """Guess the character encoding of raw bytes with chardet.

    Args:
        file_bytes: Raw bytes to inspect.

    Returns:
        The encoding name reported by chardet, or 'utf-8' when chardet
        reports no encoding.
    """
    guess = chardet.detect(file_bytes).get("encoding", "utf-8")
    # A missing or empty/None "encoding" entry both fall back to UTF-8.
    return guess if guess else "utf-8"


def read_txt(file_bytes: bytes) -> str:
    """Decode a plain text file, detecting its encoding automatically.

    Args:
        file_bytes: Raw bytes of the text file.

    Returns:
        The decoded text with surrounding whitespace stripped.

    Raises:
        ValueError: If the decoded content is empty after stripping.
    """
    guessed = detect_encoding(file_bytes)

    try:
        decoded = file_bytes.decode(guessed)
    except (UnicodeDecodeError, LookupError):
        # Wrong or unknown codec name: decode as UTF-8, substituting
        # undecodable bytes instead of failing.
        decoded = file_bytes.decode("utf-8", errors="replace")

    stripped = decoded.strip()
    if stripped:
        return stripped
    raise ValueError("Text file is empty or contains no readable content")


# Dispatch table: lowercase extension (including the leading dot) -> reader.
READER_MAP = {
    ".pdf": read_pdf,
    ".txt": read_txt,
    ".docx": read_docx,
}
# Derived from the dispatch table so the two can never drift apart.
SUPPORTED_EXTENSIONS = frozenset(READER_MAP)


def extract_text(file_bytes: bytes, filename: str) -> str:
    """Dispatch a file to the reader registered for its extension.

    Args:
        file_bytes: Raw bytes of the file.
        filename: Original filename; its extension selects the reader.

    Returns:
        Whatever text the selected reader extracts.

    Raises:
        ValueError: If the filename is empty, its extension cannot be
            determined or is not supported, or the reader rejects the file.
    """
    if not filename:
        raise ValueError("Filename is required to determine file type")

    ext = _get_extension(filename)
    reader = READER_MAP.get(ext)

    if reader is None:
        raise ValueError(
            f"Unsupported file type: {ext}. Supported types: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    return reader(file_bytes)


def _get_extension(filename: str) -> str:
    """Extract the lowercase file extension from a filename."""
    dot_index = filename.rfind(".")
    if dot_index == -1 or dot_index == len(filename) - 1:
        raise ValueError(f"Cannot determine file type from filename: {filename}")
    return filename[dot_index:].lower()