File size: 1,093 Bytes
03faf26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import io


def _extract_pdf_text(file_content: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        A single string holding each page's extracted text in page order,
        separated by newline characters. A page whose extraction yields a
        falsy value contributes an empty string.
    """
    # Imported at call time, so importing this module does not itself
    # require pypdf.
    from pypdf import PdfReader

    pdf = PdfReader(io.BytesIO(file_content))
    return "\n".join(pg.extract_text() or "" for pg in pdf.pages)


def _extract_docx_text(file_content: bytes) -> str:
    """Return the non-blank paragraph text of a DOCX file, newline-joined.

    Args:
        file_content: Raw bytes of the DOCX document.

    Returns:
        The text of each entry in the document's ``paragraphs`` whose text is
        non-empty after stripping whitespace, in order, separated by newline
        characters. The paragraph text itself is not stripped here.
    """
    # NOTE(review): only ``paragraphs`` is read; whether text placed inside
    # tables is covered depends on python-docx -- confirm if resumes use them.
    # Imported at call time, so importing this module does not itself
    # require python-docx.
    from docx import Document

    document = Document(io.BytesIO(file_content))
    kept = []
    for para in document.paragraphs:
        body = para.text
        if body and body.strip():
            kept.append(body)
    return "\n".join(kept)


def extract_resume_text(filename: str, file_content: bytes) -> str:
    """Extract normalized plain text from a resume file.

    The extractor is chosen from the filename's extension, compared
    case-insensitively: ``.pdf`` and ``.docx`` are handed to their dedicated
    helpers, and anything else (including a missing extension) is decoded as
    UTF-8 text.

    Args:
        filename: Name of the file; ``None`` or an empty string is treated as
            having no extension.
        file_content: Raw bytes of the file.

    Returns:
        The extracted text with NUL characters replaced by spaces, every line
        stripped of surrounding whitespace, blank lines removed, and the
        remaining lines joined with newline characters.
    """
    # Everything after the last dot is the extension; no dot means none.
    _, dot, suffix = (filename or "").lower().rpartition(".")
    ext = f".{suffix}" if dot else ""

    if ext == ".pdf":
        text = _extract_pdf_text(file_content)
    elif ext == ".docx":
        text = _extract_docx_text(file_content)
    else:
        # Fallback path for txt/doc and unknown formats.
        # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
        # byte-order mark, which would otherwise survive as an invisible
        # U+FEFF at the start of the text (str.strip() does not remove it).
        text = file_content.decode("utf-8-sig", errors="ignore")

    # NUL characters become spaces so the per-line strip below can trim them
    # at line edges.
    without_nul = text.replace("\x00", " ")
    stripped_lines = (line.strip() for line in without_nul.splitlines())
    return "\n".join(line for line in stripped_lines if line)