File size: 9,236 Bytes
ebd182e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""
VoiceVerse AI β€” Content Ingestion Module.

Handles all input sources beyond file upload:
  - YouTube links      β†’ transcript via youtube-transcript-api
  - Article / website  β†’ readable text via trafilatura + BeautifulSoup fallback
  - Pasted raw text    β†’ light cleaning and validation

Returns plain text string that feeds into RAGStore.add_document().
rag.py is completely unchanged.
"""

import re
import urllib.parse
from utils import logger


# ══════════════════════════════════════════════════════════════════════════════
# URL type detection
# ══════════════════════════════════════════════════════════════════════════════

def _is_youtube(url: str) -> bool:
    parsed = urllib.parse.urlparse(url.strip())
    host = parsed.netloc.lower().replace("www.", "")
    return host in ("youtube.com", "youtu.be")


def _extract_youtube_id(url: str) -> str | None:
    patterns = [
        r"(?:v=)([a-zA-Z0-9_-]{11})",
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
        r"embed/([a-zA-Z0-9_-]{11})",
        r"shorts/([a-zA-Z0-9_-]{11})",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# ══════════════════════════════════════════════════════════════════════════════
# YouTube transcript
# ══════════════════════════════════════════════════════════════════════════════

def extract_youtube(url: str) -> str:
    """Fetch and clean the caption transcript of a YouTube video.

    Preference order: manually-created English captions, then
    auto-generated English, then the first transcript in any language.
    Caption artifacts like "[Music]" and runs of whitespace are stripped.

    Args:
        url: Any YouTube URL shape supported by _extract_youtube_id.

    Returns:
        The transcript as a single plain-text string.

    Raises:
        ImportError: youtube-transcript-api is not installed.
        ValueError: no video ID could be parsed, no transcript is
            available, or the transcript is too short to be useful.
    """
    try:
        from youtube_transcript_api import (
            YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
        )
    except ImportError as e:
        # Chain the cause so the original import failure stays visible.
        raise ImportError(
            "youtube-transcript-api is not installed. "
            "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
        ) from e

    video_id = _extract_youtube_id(url)
    if not video_id:
        raise ValueError(f"Could not extract a YouTube video ID from: {url}")

    logger.info("Fetching YouTube transcript: video_id=%s", video_id)

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Prefer English manual captions, then English auto, then anything available
        try:
            transcript = transcript_list.find_manually_created_transcript(
                ["en", "en-US", "en-GB"]
            )
        except NoTranscriptFound:
            try:
                transcript = transcript_list.find_generated_transcript(
                    ["en", "en-US", "en-GB"]
                )
            except NoTranscriptFound:
                try:
                    transcript = next(iter(transcript_list))
                except StopIteration as e:
                    # An empty listing would otherwise surface as a raw
                    # StopIteration; translate it to the user-facing error.
                    raise ValueError(
                        "No transcript available for this video. "
                        "The video may have captions disabled or be private.\n\n"
                        "Tip: Copy the article/video text manually and use the Paste Text tab instead."
                    ) from e
                logger.info("No English transcript β€” using: %s", transcript.language)

        entries = transcript.fetch()
        # NOTE(review): assumes each entry is dict-like with a "text" key;
        # newer youtube-transcript-api versions return snippet objects β€” verify.
        text = " ".join(entry["text"] for entry in entries)

        # Clean YouTube caption artifacts
        text = re.sub(r"\[.*?\]", "", text)        # [Music], [Applause] etc.
        text = re.sub(r"\s{2,}", " ", text).strip()

        if len(text) < 50:
            raise ValueError("YouTube transcript is too short to process.")

        logger.info("YouTube transcript: %d chars", len(text))
        return text

    except (NoTranscriptFound, TranscriptsDisabled) as e:
        # Chain the library exception; the f-prefixes were dropped because
        # the message has no placeholders.
        raise ValueError(
            "No transcript available for this video. "
            "The video may have captions disabled or be private.\n\n"
            "Tip: Copy the article/video text manually and use the Paste Text tab instead."
        ) from e


# ══════════════════════════════════════════════════════════════════════════════
# Article / website URL
# ══════════════════════════════════════════════════════════════════════════════

def extract_url(url: str) -> str:
    """
    Fetch a webpage and extract readable text.
    Tries trafilatura first (best article extractor), falls back to BeautifulSoup.
    """
    url = url.strip()
    logger.info("Fetching URL: %s", url)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # ── Attempt 1: trafilatura ────────────────────────────────────────────────
    try:
        import trafilatura
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            if text and len(text.strip()) > 100:
                logger.info("trafilatura extracted %d chars", len(text))
                return text.strip()
    except Exception as e:
        logger.warning("trafilatura failed (%s) β€” trying BeautifulSoup", e)

    # ── Attempt 2: requests + BeautifulSoup ──────────────────────────────────
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer", "header",
                          "aside", "form", "noscript", "iframe"]):
            tag.decompose()

        article = soup.find("article") or soup.find("main") or soup.find("body")
        text = (
            article.get_text(separator=" ", strip=True)
            if article
            else soup.get_text(separator=" ", strip=True)
        )
        text = re.sub(r"\s{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text).strip()

        if len(text) < 100:
            raise ValueError("Could not extract enough text from this page.")

        logger.info("BeautifulSoup extracted %d chars", len(text))
        return text

    except Exception as e:
        raise ValueError(
            f"Could not fetch content from: {url}\n\n"
            f"Reason: {e}\n\n"
            "The page may require a login or block bots. "
            "Try copying the article text and pasting it in the Paste Text tab."
        )


# ══════════════════════════════════════════════════════════════════════════════
# Pasted raw text
# ══════════════════════════════════════════════════════════════════════════════

def extract_pasted_text(text: str) -> str:
    """Validate and lightly normalize user-pasted text.

    Normalizes CRLF/CR line endings to LF, caps runs of blank lines,
    collapses repeated spaces, and rejects empty or too-short input.

    Raises:
        ValueError: nothing was pasted, or the result is under 50 chars.
    """
    if not text or not text.strip():
        raise ValueError("No text was pasted. Please paste some content.")

    cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = re.sub(r"\n{4,}", "\n\n\n", cleaned)
    cleaned = re.sub(r" {2,}", " ", cleaned).strip()

    if len(cleaned) < 50:
        raise ValueError(
            "Pasted text is too short. Please paste at least a paragraph of content."
        )

    logger.info("Pasted text ingested: %d chars", len(cleaned))
    return cleaned


# ══════════════════════════════════════════════════════════════════════════════
# Unified entry point
# ══════════════════════════════════════════════════════════════════════════════

def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
    """
    Auto-detect whether input is a YouTube URL, article URL, or plain text.

    Anything starting with http:// or https:// is routed to the URL
    extractors; everything else is treated as pasted text.

    Returns:
        (extracted_text, source_label)

    Raises:
        ValueError: the input is empty (or an extractor rejects it).
    """
    candidate = raw_input.strip()
    if not candidate:
        raise ValueError("Please enter a URL or paste some text.")

    # Guard clause: non-URLs go straight to the paste-text path.
    if not re.match(r"https?://", candidate, re.IGNORECASE):
        return extract_pasted_text(candidate), "Pasted Text"

    if _is_youtube(candidate):
        return extract_youtube(candidate), "YouTube"
    return extract_url(candidate), "Article / Website"