| """ |
| VoiceVerse AI β Content Ingestion Module. |
| |
| Handles all input sources beyond file upload: |
| - YouTube links β transcript via youtube-transcript-api |
| - Article / website β readable text via trafilatura + BeautifulSoup fallback |
| - Pasted raw text β light cleaning and validation |
| |
| Returns plain text string that feeds into RAGStore.add_document(). |
| rag.py is completely unchanged. |
| """ |
|
|
| import re |
| import urllib.parse |
| from utils import logger |
|
|
|
|
| |
| |
| |
|
|
| def _is_youtube(url: str) -> bool: |
| parsed = urllib.parse.urlparse(url.strip()) |
| host = parsed.netloc.lower().replace("www.", "") |
| return host in ("youtube.com", "youtu.be") |
|
|
|
|
| def _extract_youtube_id(url: str) -> str | None: |
| patterns = [ |
| r"(?:v=)([a-zA-Z0-9_-]{11})", |
| r"youtu\.be/([a-zA-Z0-9_-]{11})", |
| r"embed/([a-zA-Z0-9_-]{11})", |
| r"shorts/([a-zA-Z0-9_-]{11})", |
| ] |
| for pattern in patterns: |
| match = re.search(pattern, url) |
| if match: |
| return match.group(1) |
| return None |
|
|
|
|
| |
| |
| |
|
|
def extract_youtube(url: str) -> str:
    """
    Fetch and clean the transcript of a YouTube video.

    Preference order: manually created English transcript, then an
    auto-generated English one, then the first transcript in any language.

    Args:
        url: Any YouTube URL form (watch, youtu.be, embed, shorts).

    Returns:
        The transcript as one cleaned plain-text string.

    Raises:
        ImportError: youtube-transcript-api is not installed.
        ValueError: no video ID in the URL, no transcript available,
            or the transcript is too short to process.
    """
    try:
        from youtube_transcript_api import (
            YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
        )
    except ImportError as exc:
        raise ImportError(
            "youtube-transcript-api is not installed. "
            "Add 'youtube-transcript-api' to requirements.txt and restart the Space."
        ) from exc

    video_id = _extract_youtube_id(url)
    if not video_id:
        raise ValueError(f"Could not extract a YouTube video ID from: {url}")

    logger.info("Fetching YouTube transcript: video_id=%s", video_id)

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Preference: manual English > auto-generated English > anything.
        try:
            transcript = transcript_list.find_manually_created_transcript(
                ["en", "en-US", "en-GB"]
            )
        except NoTranscriptFound:
            try:
                transcript = transcript_list.find_generated_transcript(
                    ["en", "en-US", "en-GB"]
                )
            except NoTranscriptFound:
                transcript = next(iter(transcript_list))
                logger.info("No English transcript -> using: %s", transcript.language)

        entries = transcript.fetch()
        # youtube-transcript-api >= 1.0 yields snippet objects with a `.text`
        # attribute; older versions yield dicts — support both.
        text = " ".join(
            entry.text if hasattr(entry, "text") else entry["text"]
            for entry in entries
        )

        # Drop bracketed cues like [Music]/[Applause], collapse whitespace.
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"\s{2,}", " ", text).strip()

        if len(text) < 50:
            raise ValueError("YouTube transcript is too short to process.")

        logger.info("YouTube transcript: %d chars", len(text))
        return text

    except (NoTranscriptFound, TranscriptsDisabled) as exc:
        # User-facing message; chain the library error so logs keep the cause.
        raise ValueError(
            f"No transcript available for this video. "
            f"The video may have captions disabled or be private.\n\n"
            f"Tip: Copy the article/video text manually and use the Paste Text tab instead."
        ) from exc
|
|
|
|
| |
| |
| |
|
|
def extract_url(url: str) -> str:
    """
    Fetch a webpage and extract readable text.

    Tries trafilatura first (best article extractor); if it fails or yields
    too little text, falls back to requests + BeautifulSoup with boilerplate
    elements stripped.

    Args:
        url: The page URL (http/https).

    Returns:
        The extracted readable text.

    Raises:
        ValueError: the page could not be fetched or held too little text.
    """
    url = url.strip()
    logger.info("Fetching URL: %s", url)

    # Browser-like UA: many sites block the default python-requests agent.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # --- Attempt 1: trafilatura ------------------------------------------
    try:
        import trafilatura
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            # Only trust the result when it has a meaningful amount of text.
            if text and len(text.strip()) > 100:
                logger.info("trafilatura extracted %d chars", len(text))
                return text.strip()
    except Exception as e:
        logger.warning("trafilatura failed (%s) -> trying BeautifulSoup", e)

    # --- Attempt 2: requests + BeautifulSoup -----------------------------
    try:
        import requests
        from bs4 import BeautifulSoup

        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        # Remove non-content elements before pulling the visible text.
        for tag in soup(["script", "style", "nav", "footer", "header",
                         "aside", "form", "noscript", "iframe"]):
            tag.decompose()

        # Prefer semantic containers when the page provides them.
        article = soup.find("article") or soup.find("main") or soup.find("body")
        text = (
            article.get_text(separator=" ", strip=True)
            if article
            else soup.get_text(separator=" ", strip=True)
        )
        text = re.sub(r"\s{3,}", "\n\n", text)
        text = re.sub(r" {2,}", " ", text).strip()

        if len(text) < 100:
            raise ValueError("Could not extract enough text from this page.")

        logger.info("BeautifulSoup extracted %d chars", len(text))
        return text

    except Exception as e:
        # Chain the underlying error so tracebacks keep the real cause.
        raise ValueError(
            f"Could not fetch content from: {url}\n\n"
            f"Reason: {e}\n\n"
            "The page may require a login or block bots. "
            "Try copying the article text and pasting it in the Paste Text tab."
        ) from e
|
|
|
|
| |
| |
| |
|
|
def extract_pasted_text(text: str) -> str:
    """
    Validate and lightly normalize user-pasted text.

    Normalizes line endings to \\n, caps consecutive blank lines, collapses
    runs of spaces, and rejects empty or too-short input.

    Raises:
        ValueError: nothing was pasted, or the content is under 50 chars.
    """
    if not text or not text.strip():
        raise ValueError("No text was pasted. Please paste some content.")

    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    normalized = re.sub(r"\n{4,}", "\n\n\n", normalized)
    cleaned = re.sub(r" {2,}", " ", normalized).strip()

    if len(cleaned) < 50:
        raise ValueError(
            "Pasted text is too short. Please paste at least a paragraph of content."
        )

    logger.info("Pasted text ingested: %d chars", len(cleaned))
    return cleaned
|
|
|
|
| |
| |
| |
|
|
def ingest_from_url_or_text(raw_input: str) -> tuple[str, str]:
    """
    Auto-detect whether the input is a YouTube URL, an article URL, or
    plain pasted text, and dispatch to the matching extractor.

    Returns:
        (extracted_text, source_label)

    Raises:
        ValueError: the input is empty or extraction fails downstream.
    """
    content = raw_input.strip()
    if not content:
        raise ValueError("Please enter a URL or paste some text.")

    # Anything not starting with an http(s) scheme is treated as pasted text.
    if not content.lower().startswith(("http://", "https://")):
        return extract_pasted_text(content), "Pasted Text"

    if _is_youtube(content):
        return extract_youtube(content), "YouTube"
    return extract_url(content), "Article / Website"
|
|