moonlantern1 committed on
Commit
e073547
·
verified ·
1 Parent(s): 72f119a

Harden YouTube downloads on Hugging Face

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -3
  2. README.md +4 -0
  3. src/humeo/ingest.py +105 -26
Dockerfile CHANGED
@@ -6,9 +6,9 @@ ENV PYTHONUNBUFFERED=1 \
6
 
7
  WORKDIR /app
8
 
9
- RUN apt-get update && \
10
- apt-get install -y ffmpeg && \
11
- rm -rf /var/lib/apt/lists/*
12
 
13
  COPY . /app
14
 
 
6
 
7
  WORKDIR /app
8
 
9
+ RUN apt-get update && \
10
+ apt-get install -y ffmpeg nodejs ca-certificates && \
11
+ rm -rf /var/lib/apt/lists/*
12
 
13
  COPY . /app
14
 
README.md CHANGED
@@ -31,9 +31,13 @@ Required Space secrets:
31
 
32
  - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
33
  - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
 
34
 
35
  If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when
36
  `ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper.
 
 
 
37
 
38
  ## Repo layout
39
 
 
31
 
32
  - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
33
  - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
34
+ - Optional for YouTube links on cloud IPs: `YTDLP_COOKIES_B64`
35
 
36
  If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when
37
  `ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper.
38
+ If YouTube blocks Hugging Face with a bot/sign-in challenge, export a Netscape
39
+ `cookies.txt` from a logged-in browser, base64 encode it, and set it as the
40
+ `YTDLP_COOKIES_B64` Space secret. Local file upload does not need this.
41
 
42
  ## Repo layout
43
 
src/humeo/ingest.py CHANGED
@@ -7,13 +7,14 @@ Responsibilities:
7
  - Generate word-level timestamped transcript.
8
  """
9
 
10
- import json
11
- import logging
12
- import os
13
- import shutil
14
- import subprocess
15
- from math import ceil
16
- from pathlib import Path
 
17
 
18
  import httpx
19
 
@@ -27,9 +28,63 @@ OPENAI_MIN_CHUNK_SEC = 300.0
27
  ELEVENLABS_TRANSCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
28
  TRANSCRIPT_META_FILENAME = "transcript.meta.json"
29
  ELEVENLABS_SCRIBE_MODEL = "scribe_v2"
30
- _ELEVENLABS_SEGMENT_MAX_GAP_SEC = 0.65
31
- _ELEVENLABS_SEGMENT_MAX_DURATION_SEC = 6.0
32
- _ELEVENLABS_SEGMENT_MAX_WORDS = 18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  def stage_local_video(source: str | Path, output_dir: Path) -> Path:
@@ -72,22 +127,46 @@ def download_video(youtube_url: str, output_dir: Path) -> Path:
72
 
73
  Returns the path to the downloaded MP4 file.
74
  """
75
- output_template = str(output_dir / "source.%(ext)s")
76
- cmd = [
77
- "yt-dlp",
78
- "--format", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
79
- "--merge-output-format", "mp4",
80
- "--output", output_template,
81
- "--no-playlist",
82
- "--write-info-json",
83
- "--quiet",
84
- youtube_url,
85
- ]
86
-
87
- logger.info("Downloading video: %s", youtube_url)
88
- result = subprocess.run(cmd, check=True, capture_output=True, text=True)
89
- if result.stderr:
90
- logger.warning(result.stderr.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  # yt-dlp should produce source.mp4
93
  video_path = output_dir / "source.mp4"
 
7
  - Generate word-level timestamped transcript.
8
  """
9
 
10
+ import json
11
+ import logging
12
+ import os
13
+ import shutil
14
+ import subprocess
15
+ import base64
16
+ from math import ceil
17
+ from pathlib import Path
18
 
19
  import httpx
20
 
 
28
  ELEVENLABS_TRANSCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
29
  TRANSCRIPT_META_FILENAME = "transcript.meta.json"
30
  ELEVENLABS_SCRIBE_MODEL = "scribe_v2"
31
+ _ELEVENLABS_SEGMENT_MAX_GAP_SEC = 0.65
32
+ _ELEVENLABS_SEGMENT_MAX_DURATION_SEC = 6.0
33
+ _ELEVENLABS_SEGMENT_MAX_WORDS = 18
34
+ YTDLP_BROWSER_USER_AGENT = (
35
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
36
+ "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
37
+ )
38
+
39
+
40
def _decode_cookie_secret(raw: str) -> str:
    """Normalize a cookies.txt secret that may have been flattened to one line.

    Secret stores and copy/paste sometimes turn a multi-line value into a
    single line containing literal backslash-n sequences. When the stripped
    value holds such escapes and no real newline, the escapes are expanded.

    Args:
        raw: Cookie text as read from the environment.

    Returns:
        The stripped text, with escaped line breaks expanded to real newlines
        when (and only when) the value had no real newline to begin with.
    """
    text = raw.strip()
    if "\\n" in text and "\n" not in text:
        # Expand escaped CRLF first; otherwise a literal "\r" would be left
        # glued to the last field of every cookie line.
        text = text.replace("\\r\\n", "\n").replace("\\n", "\n")
    return text
45
+
46
+
47
+ def _yt_dlp_cookie_file(output_dir: Path) -> Path | None:
48
+ raw = (
49
+ os.environ.get("YTDLP_COOKIES")
50
+ or os.environ.get("YOUTUBE_COOKIES")
51
+ or ""
52
+ ).strip()
53
+ encoded = (
54
+ os.environ.get("YTDLP_COOKIES_B64")
55
+ or os.environ.get("YOUTUBE_COOKIES_B64")
56
+ or ""
57
+ ).strip()
58
+ if not raw and encoded:
59
+ try:
60
+ raw = base64.b64decode(encoded).decode("utf-8")
61
+ except Exception as exc:
62
+ raise RuntimeError("Could not decode YTDLP_COOKIES_B64.") from exc
63
+ if not raw:
64
+ return None
65
+
66
+ cookie_path = output_dir / "yt-dlp-cookies.txt"
67
+ cookie_path.write_text(_decode_cookie_secret(raw).rstrip() + "\n", encoding="utf-8")
68
+ try:
69
+ cookie_path.chmod(0o600)
70
+ except OSError:
71
+ pass
72
+ return cookie_path
73
+
74
+
75
def _yt_dlp_error(exc: subprocess.CalledProcessError) -> RuntimeError:
    """Build a readable RuntimeError from a failed yt-dlp invocation.

    The message carries yt-dlp's stderr (falling back to stdout, then to the
    exception text). When the output mentions a sign-in, bot check or cookies,
    a hint about the ``YTDLP_COOKIES_B64`` secret is appended.

    Args:
        exc: The error raised by ``subprocess.run(..., check=True)``.

    Returns:
        A ``RuntimeError`` for the caller to raise; this function does not
        raise it itself.
    """

    def _as_text(stream: str | bytes | None) -> str:
        # Captured output is bytes when the process was not run in text mode;
        # decode it so the message does not contain a b'...' repr.
        if isinstance(stream, bytes):
            stream = stream.decode("utf-8", errors="replace")
        return (stream or "").strip()

    stdout = _as_text(exc.stdout)
    stderr = _as_text(exc.stderr)
    details = stderr or stdout or str(exc)
    lowered = details.lower()
    hint = ""
    # "not a bot" also matches the longer "confirm you're not a bot" phrase.
    if any(token in lowered for token in ("sign in", "not a bot", "cookies")):
        hint = (
            "\n\nYouTube blocked the Hugging Face downloader. Add a Space secret named "
            "YTDLP_COOKIES_B64 containing a base64 encoded Netscape cookies.txt export "
            "from a logged-in browser, or upload the MP4 directly."
        )
    return RuntimeError(f"yt-dlp failed to download the YouTube video:\n{details}{hint}")
88
 
89
 
90
  def stage_local_video(source: str | Path, output_dir: Path) -> Path:
 
127
 
128
  Returns the path to the downloaded MP4 file.
129
  """
130
+ output_template = str(output_dir / "source.%(ext)s")
131
+ cmd = [
132
+ "yt-dlp",
133
+ "--format",
134
+ "bv*[ext=mp4]+ba[ext=m4a]/bv*+ba/best[ext=mp4]/best",
135
+ "--merge-output-format",
136
+ "mp4",
137
+ "--output",
138
+ output_template,
139
+ "--no-playlist",
140
+ "--write-info-json",
141
+ "--retries",
142
+ "5",
143
+ "--fragment-retries",
144
+ "5",
145
+ "--extractor-retries",
146
+ "3",
147
+ "--socket-timeout",
148
+ "30",
149
+ "--force-ipv4",
150
+ "--user-agent",
151
+ YTDLP_BROWSER_USER_AGENT,
152
+ "--extractor-args",
153
+ (os.environ.get("YTDLP_EXTRACTOR_ARGS") or "youtube:player_client=default,web_creator"),
154
+ "--quiet",
155
+ ]
156
+ if shutil.which("node"):
157
+ cmd.extend(["--js-runtimes", "node", "--remote-components", "ejs:github"])
158
+ cookie_path = _yt_dlp_cookie_file(output_dir)
159
+ if cookie_path is not None:
160
+ cmd.extend(["--cookies", str(cookie_path)])
161
+ cmd.append(youtube_url)
162
+
163
+ logger.info("Downloading video: %s", youtube_url)
164
+ try:
165
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
166
+ except subprocess.CalledProcessError as exc:
167
+ raise _yt_dlp_error(exc) from exc
168
+ if result.stderr:
169
+ logger.warning(result.stderr.strip())
170
 
171
  # yt-dlp should produce source.mp4
172
  video_path = output_dir / "source.mp4"