| import asyncio |
| from urllib.parse import urlparse, quote |
| from aiolimiter import AsyncLimiter |
| from pathlib import Path |
| import traceback |
| from typing import Literal, Tuple |
| from fastapi.routing import APIRouter |
| import logging |
| import io |
| import zipfile |
| import os |
| from httpx import AsyncClient |
| from pydantic import BaseModel |
| import subprocess |
| import pandas as pd |
| import re |
| import tempfile |
| from lxml import etree |
| from bs4 import BeautifulSoup |
| from fastapi import Depends, File, HTTPException, UploadFile |
| import urllib |
| from dependencies import get_http_client, get_llm_router |
| from fastapi.responses import StreamingResponse |
| from litellm.router import Router |
| from kreuzberg import ExtractionConfig, extract_bytes |
| import requests |
| from io import BytesIO |
|
|
| from schemas import DocInfo, GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse |
|
|
| |
# Router grouping all document-extraction endpoints.
router = APIRouter(tags=["document extraction"])


# XML namespaces used when parsing WordprocessingML (DOCX) documents:
# 'w' is the main Word 2006 namespace, 'v' is the legacy VML namespace.
NSMAP = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'v': 'urn:schemas-microsoft-com:vml'
}


# Sentinel markers wrapped around tracked-change insertions (<w:ins>) during
# DOCX text extraction, so downstream consumers can spot suggested edits.
SUGGESTION_START = '[[SUGGESTION_START]]'
SUGGESTION_END = '[[SUGGESTION_END]]'
|
|
def extract_docx_text_with_suggestions(docx_stream: io.BytesIO) -> str:
    """Extract plain text from a DOCX stream, preserving tracked changes.

    Tracked deletions (<w:del>) are dropped entirely. Tracked insertions
    (<w:ins>) are kept; when the inserted text is non-blank it is wrapped
    in SUGGESTION_START / SUGGESTION_END markers. Paragraph ends and line
    breaks become '\n', tabs become '\t'. Carriage returns are stripped.

    Raises:
        FileNotFoundError: if the archive has no 'word/document.xml'.
    """
    try:
        with zipfile.ZipFile(docx_stream) as archive:
            document_xml = archive.read('word/document.xml')
    except KeyError:
        raise FileNotFoundError('word/document.xml not found in the DOCX archive.')

    root = etree.fromstring(document_xml)
    body = root.find('w:body', NSMAP)

    pieces: list[str] = []

    def visit(node, sink: list[str]) -> None:
        local = etree.QName(node).localname
        if local == 'del':
            # Tracked deletion: omit the whole subtree.
            return
        if local == 'ins':
            # Tracked insertion: gather its text separately so we can decide
            # whether to surround it with suggestion markers.
            inner: list[str] = []
            for sub in node:
                visit(sub, inner)
            text = ''.join(inner)
            if text.strip():
                sink.extend((SUGGESTION_START, text, SUGGESTION_END))
            else:
                sink.append(text)
            return
        if local == 'p':
            for sub in node:
                visit(sub, sink)
            sink.append('\n')
            return
        if local == 't':
            sink.append(node.text or '')
            return
        if local == 'tab':
            sink.append('\t')
            return
        if local == 'br':
            sink.append('\n')
            return
        # Any other element: recurse into its children.
        for sub in node:
            visit(sub, sink)

    if body is not None:
        visit(body, pieces)
    return ''.join(pieces).replace('\r', '')
|
|
| |
|
|
# Kreuzberg extraction configuration with OCR disabled — presumably the
# handled formats are expected to carry an extractable text layer
# (TODO confirm for scanned PDFs).
KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
    ocr=None)


# All LibreOffice headless conversions are serialized through this lock;
# NOTE(review): presumably because concurrent LibreOffice instances conflict
# over the shared user profile — confirm before relaxing.
LO_CONVERSION_MUTEX = asyncio.Lock()


# File extensions this module can extract, mapped to their MIME types.
FORMAT_MIME_TYPES = {
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pdf": "application/pdf",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".doc": "application/msword",
    ".ppt": "application/vnd.ms-powerpoint"
}
|
|
|
|
async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str | None = None) -> io.BytesIO:
    """
    Converts the given file bytes using Libreoffice headless to the specified file type.

    Args:
        contents: File contents
        filename: File base name WITHOUT THE EXTENSION
        input_ext: Input extension (WITHOUT THE DOT)
        output_ext: Output extension (WITHOUT THE DOT)
        filter: The conversion filter to use.

    Returns:
        The converted file contents, positioned at offset 0.

    Raises:
        subprocess.CalledProcessError: if LibreOffice exits non-zero AND
            produced no output file.
    """
    # Conversions are serialized: LibreOffice headless does not tolerate
    # concurrent instances sharing one user profile.
    async with LO_CONVERSION_MUTEX:
        with tempfile.TemporaryDirectory() as tmpdir:
            dir_path = Path(tmpdir)
            # BUG FIX: the temp file names previously used a hard-coded
            # "(unknown)" placeholder instead of the provided base name.
            # Path(...).name guards against path separators in `filename`.
            base_name = Path(filename).name or "document"
            input_file_path = dir_path / f"{base_name}.{input_ext}"
            # LibreOffice names its output after the input file's stem.
            output_file_path = dir_path / f"{base_name}.{output_ext}"

            input_file_path.write_bytes(contents.read())

            command = [
                "libreoffice",
                "--headless",
                "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
                "--outdir", tmpdir,
                str(input_file_path)
            ]

            process = await asyncio.create_subprocess_exec(
                *command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            # communicate() already waits for process exit; the previous
            # extra `await process.wait()` was redundant.
            stdout, stderr = await process.communicate()
            exit_code = process.returncode

            # LibreOffice sometimes exits non-zero yet still writes a usable
            # file, so only fail when no output file exists at all.
            if exit_code != 0 and not output_file_path.exists():
                raise subprocess.CalledProcessError(
                    exit_code,
                    command,
                    output=stdout,
                    stderr=stderr
                )

            out_bytes = io.BytesIO(output_file_path.read_bytes())
            out_bytes.seek(0)
            return out_bytes
|
|
|
|
async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
    """
    Convert given file represented as a (filename, ext, bytes) to a list of lines.
    File types which require conversion for handling are converted to the appropriate format before being converted to text.

    Args:
        filename: Base name of the file, without extension (used for logging
            and for the temporary file during conversion).
        ext: Lower-case file extension including the dot (e.g. ".docx").
        bytes: Raw file contents. (Name kept for interface compatibility
            even though it shadows the builtin.)

    Returns:
        Non-empty, stripped text lines, preceded by one empty line.

    Raises:
        Exception: if the extension is not in FORMAT_MIME_TYPES.
    """
    final_text: str | None = None
    if ext == ".doc":
        # Legacy .doc is converted first so the tracked-changes-aware DOCX
        # extractor can be used.
        # BUG FIX: log messages previously contained a literal "(unknown)"
        # placeholder instead of the actual filename.
        logging.debug(f"Converting {filename} .doc --> .docx")
        docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
        logging.debug(f"Extracting content with suggestion markers for filename: {filename}, ext: {ext} (converted)")
        docx_bytes.seek(0)
        final_text = extract_docx_text_with_suggestions(docx_bytes)
        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
    elif ext == ".docx":
        logging.debug(f"Extracting .docx with suggestion markers for {filename}.")
        bytes.seek(0)
        final_text = extract_docx_text_with_suggestions(bytes)
        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
    elif ext == ".ppt":
        logging.debug(f"Converting {filename} .ppt --> .pptx")
        pptx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
        logging.debug(
            f"Extracting content for filename: {filename}, ext: {ext} with converted ppt")
        extracted_data = await extract_bytes(pptx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
        final_text = extracted_data.content
        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
    elif ext in FORMAT_MIME_TYPES:
        logging.debug(
            f"Extracting content for filename: {filename}, ext: {ext}")
        extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
        final_text = extracted_data.content
    else:
        raise Exception(
            f"Unsupported file type: {ext}, filename: {filename}")

    # Leading empty line kept for backward compatibility with consumers that
    # join with '\n' and expect the text to start on the second line.
    txt_data = [""] + [line.strip()
                       for line in final_text.splitlines() if line.strip()]

    return txt_data
|
|
| |
# Throttle archive downloads to at most 60 requests per 60-second window.
FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)

# At most 4 archive downloads may be in flight at once.
FTP_MAX_PARALLEL_WORKERS = asyncio.Semaphore(4)
|
|
|
|
async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
    """
    Fetch the zipped document at `url` and return a (name, extension, contents) tuple.

    The first archive entry whose extension is convertible to text (i.e. is
    present in FORMAT_MIME_TYPES) is selected. Downloads are rate-limited
    and bounded in concurrency by the module-level limiter/semaphore.

    Raises:
        ValueError: if the URL does not point to a .zip file, or the archive
            contains no file with a supported extension.
    """
    async with FTP_DOWNLOAD_RATE_LIMITER:
        async with FTP_MAX_PARALLEL_WORKERS:
            # BUG FIX: previously checked endswith("zip"), which also
            # accepted URLs like ".../notazip".
            if not url.endswith(".zip"):
                raise ValueError("URL doit pointer vers un fichier ZIP")

            # Browser-like UA: the 3GPP server rejects bare clients.
            resp = await client.get(url, headers={
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            })

            resp.raise_for_status()

            with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
                for entry in zf.infolist():
                    if entry.is_dir():
                        continue

                    file_name = entry.filename
                    root, ext = os.path.splitext(file_name)
                    ext = ext.lower()

                    if ext not in FORMAT_MIME_TYPES:
                        logging.debug(
                            f"Skipping unsupported filetype found in archive: {ext}")
                        continue

                    doc_bytes = zf.read(file_name)
                    return (root, ext, io.BytesIO(doc_bytes))

            # BUG FIX: the original message interpolated `ext`, which is
            # unbound (NameError) when the archive contains no files at all.
            raise ValueError(
                f"No file with a supported extension type was found in the archive file: {url}")
|
|
|
|
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
    """
    Apply the tracked revisions of a .docx before returning the contents.

    Tracked deletions (<w:del>) are removed, tracked insertions (<w:ins>)
    are accepted by promoting their children into the parent, and comment
    anchors/references are stripped.

    Args:
        docx_zip: The Word document opened as a zip archive.

    Returns:
        A new DOCX archive with revisions applied, positioned at offset 0.

    Raises:
        FileNotFoundError: if the archive has no 'word/document.xml'.
    """
    try:
        xml_bytes = docx_zip.read('word/document.xml')
        logging.debug("Read the document XML")
    except KeyError:
        raise FileNotFoundError(
            "word/document.xml not found in the DOCX archive.")

    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_bytes, parser=parser)

    # Reject tracked deletions: drop the whole <w:del> subtree.
    for del_elem in root.xpath('//w:del', namespaces=NSMAP):
        parent = del_elem.getparent()
        if parent is not None:
            parent.remove(del_elem)

    # Accept tracked insertions: hoist each child of <w:ins> into its place.
    for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
        parent = ins_elem.getparent()
        if parent is not None:
            index = parent.index(ins_elem)
            # BUG FIX: iterate over a snapshot. In lxml, parent.insert()
            # *moves* the child out of ins_elem, and mutating the element
            # while consuming iterchildren() skipped every other child.
            for child in list(ins_elem):
                parent.insert(index, child)
                index += 1
            parent.remove(ins_elem)

    # Strip comment range markers and references.
    for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
        for elem in root.xpath(f'//{tag}', namespaces=NSMAP):
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)

    output = io.BytesIO()

    with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
        # Copy every other archive member verbatim.
        for file_info in docx_zip.infolist():
            if file_info.filename != 'word/document.xml':
                new_zip.writestr(file_info, docx_zip.read(file_info.filename))

        # Serialize the rewritten document part.
        xml_str = etree.tostring(
            root,
            xml_declaration=True,
            encoding='UTF-8',
            pretty_print=True
        )
        new_zip.writestr('word/document.xml', xml_str)

    output.seek(0)
    logging.debug("Exporting new docx revision OK")
    return output
|
|
| |
|
|
|
|
@router.post("/get_meetings", response_model=GetMeetingsResponse)
async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
    """
    Retrieves the list of meetings for the given working group.

    Raises:
        HTTPException: 400 if the working group contains no number,
            404 if no matching working-group folder exists on the FTP site.
    """
    working_group = req.working_group
    # Split e.g. "CT1" into TSG letters ("CT") and the WG number ("1").
    tsg = re.sub(r"\d+", "", working_group)
    wg_match = re.search(r"\d", working_group)
    # BUG FIX: previously an AttributeError (500) when no digit was present.
    if wg_match is None:
        raise HTTPException(
            status_code=400, detail="Working group must contain a number")
    wg_number = wg_match.group(0)

    logging.debug(f"FTP internal working group ID is {tsg}{wg_number}")
    url = "https://www.3gpp.org/ftp/tsg_" + tsg
    logging.debug(url)

    ftp_request = await http_client.get(url)
    soup = BeautifulSoup(ftp_request.text, "html.parser")

    wg_folders = [item.get_text() for item in soup.select("tr td a")]
    selected_folder = None
    for folder in wg_folders:
        if "wg" + str(wg_number) in folder.lower():
            selected_folder = folder
            break

    # BUG FIX: the URL was previously extended with selected_folder BEFORE
    # checking it was found, crashing with a TypeError on None.
    if selected_folder is None:
        raise HTTPException(
            status_code=404, detail="No matching working group folder found")

    url += "/" + selected_folder
    logging.debug(url)

    resp = await http_client.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    # Meeting folders are either "TSG..." or hyphenated "CT..." entries.
    meeting_folders = [item.get_text() for item in soup.select("tr td a") if item.get_text(
    ).startswith("TSG") or (item.get_text().startswith("CT") and "-" in item.get_text())]
    # Build human-readable labels like "CT1#120" from the folder names.
    all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace(
        "-", " ") if meeting.startswith('TSG') else meeting.replace("-", "#") for meeting in meeting_folders]

    return GetMeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
|
|
| |
|
|
|
|
@router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
    """
    Downloads the document list dataframe for a given meeting OR alternatively returns the document list dataframe from the given FTP URL.
    If `custom_url` field is set in the request schema, the other fields are ignored.

    Raises:
        HTTPException: 401 for custom URLs outside 3gpp.org, 404 when no
            working-group folder or document-list Excel file is found.
    """
    if req.custom_url:
        logging.info(f"Fetching TDocs at custom URL {req.custom_url}")

        # Only 3GPP-hosted listings are allowed.
        if '3gpp.org' not in req.custom_url:
            raise HTTPException(status_code=401)

        response = await http_client.get(req.custom_url)
        soup = BeautifulSoup(response.text, "html.parser")

        file_links = [l.get('href')
                      for l in soup.select('table > tbody > tr a.file')]

        # Decoded file stems serve as the TDoc identifiers.
        file_names = [
            urllib.parse.unquote(Path(urllib.parse.urlparse(url).path).stem)
            for url in file_links
        ]

        # Seed the frame (and its index) with the known columns first ...
        df = pd.DataFrame()
        df["TDoc"] = file_names
        df["URL"] = file_links

        # ... then fill every expected column with "Unknown" placeholders
        # and restore the values a bare listing actually provides.
        DF_COL_TYPES = ["TDoc", "Title", "Type", "For",
                        "TDoc Status", "Agenda item description", "URL"]
        for tp in DF_COL_TYPES:
            df[tp] = "Unknown"

        df["TDoc"] = file_names
        df["URL"] = file_links
        df["Type"] = "TDoc / xxxxCR"

        return GetMeetingDocsResponse(data=df.to_dict(orient="records"))

    # Regular path: locate the meeting folder on the 3GPP FTP site and read
    # its document-list Excel file.
    working_group = req.working_group
    tsg = re.sub(r"\d+", "", working_group)
    wg_number = re.search(r"\d", working_group).group(0)
    url = "https://www.3gpp.org/ftp/tsg_" + tsg

    logging.info(
        f"Fetching TDocs dataframe for {working_group}:{req.meeting}")

    resp = await http_client.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    wg_folders = [item.get_text() for item in soup.select("tr td a")]
    selected_folder = None
    for folder in wg_folders:
        if "wg" + str(wg_number) in folder.lower():
            selected_folder = folder
            break

    # BUG FIX: guard against a missing folder (previously a TypeError/500).
    if selected_folder is None:
        raise HTTPException(
            status_code=404, detail="No matching working group folder found")

    url += "/" + selected_folder + "/" + req.meeting + "/docs"
    resp = await http_client.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    files = [item.get_text() for item in soup.select("tr td a")
             if item.get_text().endswith(".xlsx")]

    if not files:
        raise HTTPException(
            status_code=404, detail="No Excel file has been found")

    file_url = quote(f"{url}/{files[0]}", safe=":/")

    # BUG FIX: use the shared async client instead of the blocking
    # requests.get(), which stalled the event loop inside this coroutine.
    resp = await http_client.get(
        file_url, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()

    df = pd.read_excel(BytesIO(resp.content))
    # Keep only documents that were actually uploaded.
    filtered_df = df[~(
        df["Uploaded"].isna())][["TDoc", "Title", "CR category", "For", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
    filtered_df["URL"] = filtered_df["TDoc"].apply(
        lambda tdoc: f"{url}/{tdoc}.zip")

    df = filtered_df.fillna("")
    return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "For", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
|
|
| |
|
|
|
|
@router.post("/download_docs")
async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> StreamingResponse:
    """Download the specified TDocs and zips them in a single archive.

    Each document is fetched, converted to text, and written to the zip.
    A document whose extraction fails is still included, as a
    'failed_<id>.txt' entry holding the error message, so one bad document
    never aborts the whole batch.
    """
    document_ids = [doc.document for doc in req.documents]
    logging.info(f"Downloading TDocs: {document_ids}")

    def _normalize_for_path(text: str) -> str:
        """Turn an agenda-item label into a filesystem/zip-safe directory name."""
        if not text:
            return "_unspecified_agenda_item"
        text = re.sub(r'\s+', '_', text)
        text = re.sub(r'[^\w\s-]', '', text).strip()
        return text if text else "_unspecified_agenda_item"

    async def _process_single_document(item: DocInfo):
        """Attempts to convert a document to text and returns success status and content."""
        try:
            # Renamed local from `bytes` to avoid shadowing the builtin.
            filename, ext, data = await get_doc_archive(item.url, http_client)
            text_lines = await extract_text_contents(filename, ext, data)
            content_bytes = "\n".join(text_lines).encode("utf-8")
            return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
        except Exception as e:
            logging.warning(
                f"Failed to process document '{item.document}' from URL '{item.url}': {e}")
            error_message = f"Document '{item.document}' text extraction failed: {e}".encode(
                "utf-8")
            return {"doc_id": item.document, "content": error_message, "failed": True, "agenda_item": item.agenda_item}

    results = await asyncio.gather(*[_process_single_document(doc) for doc in req.documents], return_exceptions=False)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        for result in results:
            directory_name = _normalize_for_path(result.get("agenda_item") or "")

            failed = "failed" in result
            doc_id = result["doc_id"]
            base_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"

            # Optionally group entries into per-agenda-item directories.
            full_file_path = f"{directory_name}/{base_filename}" if req.sort_by_agenda_item else base_filename
            zip_file.writestr(full_file_path, result["content"])

    zip_buffer.seek(0)

    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=tdocs.zip"}
    )
|
|
| |
|
|
|
|
@router.post("/download_user_docs")
async def download_user_docs(files: list[UploadFile] = File(...)):
    """Freeform convert the user files into text and downloads them as a single zip file."""
    # Snapshot each upload as (stem, extension, in-memory contents).
    file_infos = []
    for upload in files:
        stem, extension = os.path.splitext(upload.filename)
        file_infos.append({
            "filename": stem,
            "extension": extension,
            "content": io.BytesIO(await upload.read())
        })

    filenames = [info["filename"] for info in file_infos]
    logging.info(f"Got {len(file_infos)} user files to convert.")
    logging.debug(f"Filenames: {filenames}")

    async def _process_single_document(item: dict):
        """Convert one uploaded file to text; on failure, return the error text instead."""
        doc = item["filename"]
        try:
            text_lines = await extract_text_contents(doc, item["extension"], item["content"])
            return {"doc_id": doc, "content": "\n".join(text_lines).encode("utf-8")}
        except Exception as e:
            logging.warning(
                f"Failed to process document '{doc}': {e}")
            error_message = f"Document '{doc}' text extraction failed: {e}".encode(
                "utf-8")
            return {"doc_id": doc, "content": error_message, "failed": True}

    results = await asyncio.gather(*(_process_single_document(info) for info in file_infos), return_exceptions=False)

    # Pack every (possibly failed) conversion into one zip archive.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for result in results:
            doc_id = result["doc_id"]
            if "failed" in result:
                entry_name = f"failed_{doc_id}.txt"
            else:
                entry_name = f"{doc_id}.txt"
            archive.writestr(entry_name, result["content"])

    zip_buffer.seek(0)

    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=user_files.zip"}
    )
|
|
| |
|
|
|
|
class ProgressUpdate(BaseModel):
    """Defines the structure of a single SSE message."""
    # "progress" while documents are still being processed; "complete" for
    # the final event carrying the aggregated payload in `data`.
    status: Literal["progress", "complete"]
    # Empty for progress events; the final response dump for "complete".
    data: dict
    # Total number of documents in the request.
    total_docs: int
    # Number of documents processed so far.
    processed_docs: int
|
|
|
|
@router.post("/extract_requirements/sse")
async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_router: Router = Depends(get_llm_router), http_client: AsyncClient = Depends(get_http_client)):
    """Extract requirements from the specified xxxxCR docs using a LLM and returns SSE events about the progress of ongoing operations.

    The response is a text/event-stream of ProgressUpdate frames: one
    "progress" event per processed document, then a final "complete" event
    carrying the aggregated requirements. Failures are mapped to sentinel
    DocRequirements entries so the stream always completes.
    """
    documents = req.documents
    n_docs = len(documents)

    logging.info(
        "Generating requirements for documents: {}".format(req.documents))

    # Bound the number of concurrent LLM calls.
    concurrency_sema = asyncio.Semaphore(4)

    def prompt(doc_id, full):
        return f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found. Remove the errors"

    async def _process_document(doc) -> list[DocRequirements]:
        """Download, extract and run the LLM on one document; never raises."""
        doc_id = doc.document
        url = doc.url

        try:
            # Renamed local from `bytes` to avoid shadowing the builtin.
            filename, ext, data = await get_doc_archive(url, http_client)
            txt_data = await extract_text_contents(filename, ext, data)
            full = "\n".join(txt_data)
        except Exception as e:
            fmt = "".join(traceback.format_exception(e))
            logging.error(f"Failed to process doc {doc_id} : {fmt}")
            return [DocRequirements(document=doc_id, context="Failed to process document", requirements=[])]

        # BUG FIX: `async with` releases the semaphore exactly once, even on
        # cancellation; the old acquire-inside-try/release-in-finally could
        # release a semaphore it never acquired.
        async with concurrency_sema:
            try:
                model_used = "gemini-v2"
                resp_ai = await llm_router.acompletion(
                    model=model_used,
                    messages=[
                        {"role": "user", "content": prompt(doc_id, full)}],
                    response_format=ExtractRequirementsResponse
                )
                return ExtractRequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
            except Exception:
                # BUG FIX: this failure was previously swallowed silently.
                logging.exception(
                    f"LLM requirement extraction failed for doc {doc_id}")
                return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]

    process_futures = [_process_document(doc) for doc in documents]

    def progress_update(x: ProgressUpdate) -> str:
        """Serialize a ProgressUpdate as a single SSE `data:` frame."""
        return f"data: {x.model_dump_json()}\n\n"

    async def _stream_generator(docs):
        items = []
        n_processed = 0

        # Initial frame so the client can render the progress bar at 0.
        yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=0))

        for doc in asyncio.as_completed(docs):
            items.extend(await doc)
            n_processed += 1
            yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=n_processed))

        final_response = ExtractRequirementsResponse(requirements=items)

        yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))

    return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
|
|
| |
|
|
class URLRequest(BaseModel):
    """Request body for /extract_text_from_url."""
    # URL of the zipped document to extract — downstream get_doc_archive
    # requires it to point to a ZIP file.
    url: str
|
|
@router.post("/extract_text_from_url")
async def extract_text_from_url(data: URLRequest, http_client: AsyncClient = Depends(get_http_client)) -> dict:
    """Extract text from a given document URL and return the text content.

    Returns:
        A dict with the document base name under "document" and the joined
        text lines under "content".

    Raises:
        HTTPException: 500 if the download or extraction fails.
    """
    url = data.url
    logging.info(f"Extracting text from URL: {url}")

    try:
        # Renamed local from `bytes` to avoid shadowing the builtin.
        filename, ext, contents = await get_doc_archive(url, http_client)
        text_lines = await extract_text_contents(filename, ext, contents)
        content = "\n".join(text_lines)
        return {"document": filename, "content": content}
    except Exception as e:
        logging.error(f"Failed to extract text from URL '{url}': {e}")
        raise HTTPException(status_code=500, detail=f"Text extraction failed: {e}")