|
|
| """
|
| YouTube Transcript Fetcher
|
| Fetches transcripts directly from YouTube videos using the YouTube Transcript API.
|
| No HTML parsing or scraping involved.
|
| """
|
|
|
| import argparse
|
| import json
|
| import sys
|
| import re
|
| from typing import Optional
|
|
|
| from youtube_transcript_api import YouTubeTranscriptApi
|
| from youtube_transcript_api.formatters import (
|
| TextFormatter,
|
| JSONFormatter,
|
| SRTFormatter,
|
| WebVTTFormatter,
|
| )
|
| from youtube_transcript_api._errors import (
|
| TranscriptsDisabled,
|
| NoTranscriptFound,
|
| VideoUnavailable,
|
| CouldNotRetrieveTranscript,
|
| )
|
|
|
|
|
def extract_video_id(url_or_id: str) -> str:
    """
    Extract the video ID from a YouTube URL or return it directly if already an ID.

    Leading and trailing whitespace is ignored, so a URL or ID pasted with a
    stray space or newline is still accepted.

    Supports formats:
        - https://www.youtube.com/watch?v=VIDEO_ID
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
        - https://www.youtube.com/live/VIDEO_ID
        - VIDEO_ID (raw)

    Args:
        url_or_id: A YouTube URL in one of the formats above, or a raw
            11-character video ID.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be recognised in the input.
    """
    candidate = url_or_id.strip()
    id_group = r"([a-zA-Z0-9_-]{11})"
    patterns = (
        # Require "v" to start a query parameter (directly after "?" or "&").
        # A plain ".*v=" would also accept parameters whose name merely ends
        # in "v" and, being greedy, would prefer the last one in the URL.
        r"youtube\.com/watch\?(?:.*?&)?v=" + id_group,
        r"youtu\.be/" + id_group,
        r"youtube\.com/shorts/" + id_group,
        r"youtube\.com/embed/" + id_group,
        r"youtube\.com/live/" + id_group,
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    # Not a recognised URL: accept the input only if it is exactly an ID.
    if re.fullmatch(r"[a-zA-Z0-9_-]{11}", candidate):
        return candidate

    raise ValueError(
        f"Could not extract a valid YouTube video ID from: {url_or_id}\n"
        "Accepted formats: full URL, youtu.be short link, or raw 11-character video ID."
    )
|
|
|
|
|
def list_available_transcripts(video_id: str) -> None:
    """
    Print every transcript language available for a video.

    Transcripts are printed in two groups, manually created and
    auto-generated, each entry shown as ``[language_code] language``.
    Errors raised by the API call are not handled here and propagate to
    the caller.

    Args:
        video_id: YouTube video ID.
    """
    transcript_list = YouTubeTranscriptApi().list(video_id)

    print(f"\nAvailable transcripts for video: {video_id}\n")

    # Group via iteration and the public ``is_generated`` flag instead of
    # reaching into the transcript list's private
    # ``_manually_created_transcripts`` / ``_generated_transcripts`` dicts.
    manually_created = []
    auto_generated = []
    for transcript in transcript_list:
        if transcript.is_generated:
            auto_generated.append(transcript)
        else:
            manually_created.append(transcript)

    if manually_created:
        print("Manually created:")
        for t in manually_created:
            print(f" [{t.language_code}] {t.language}")

    if auto_generated:
        print("Auto-generated:")
        for t in auto_generated:
            print(f" [{t.language_code}] {t.language} (auto)")

    if not manually_created and not auto_generated:
        print(" No transcripts found.")
|
|
|
|
|
def _snippet_field(snippet, name):
    """Read one field of a transcript snippet, given as a dict or as an object."""
    if isinstance(snippet, dict):
        return snippet[name]
    return getattr(snippet, name)


def _format_timestamp(start) -> str:
    """Render a start offset in seconds as ``MM:SS.cc``."""
    # Round once to whole centiseconds before splitting into minutes and
    # seconds, so 59.999 becomes 01:00.00 rather than the impossible 00:60.00.
    minutes, centiseconds = divmod(round(start * 100), 6000)
    return f"{minutes:02d}:{centiseconds // 100:02d}.{centiseconds % 100:02d}"


def _render_transcript(transcript_data, output_format: str, preserve_timestamps: bool) -> str:
    """Format fetched transcript data as JSON, SRT, WebVTT or plain text."""
    if output_format == "json":
        return JSONFormatter().format_transcript(transcript_data, indent=2)
    if output_format == "srt":
        return SRTFormatter().format_transcript(transcript_data)
    if output_format == "vtt":
        return WebVTTFormatter().format_transcript(transcript_data)

    # Any other format value is treated as plain text.
    if preserve_timestamps:
        return "\n".join(
            f"[{_format_timestamp(_snippet_field(entry, 'start'))}] "
            f"{_snippet_field(entry, 'text')}"
            for entry in transcript_data
        )
    return TextFormatter().format_transcript(transcript_data)


def fetch_transcript(
    video_id: str,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_file: Optional[str] = None,
) -> str:
    """
    Fetch transcript for a given video ID.

    Any failure is reported on stderr and ends the process via
    ``sys.exit(1)``; callers that want to continue must catch ``SystemExit``.

    Args:
        video_id: YouTube video ID.
        languages: Ordered list of language codes to try (e.g. ['en', 'es']).
            Defaults to ['en'] when None. If none of them is available, the
            first transcript the video offers is used and a warning is
            printed to stderr.
        output_format: One of 'text', 'json', 'srt', 'vtt'. Any other value
            is treated as 'text'.
        preserve_timestamps: Include timestamps in plain-text output.
        output_file: If provided, write transcript to this file path (UTF-8)
            and print a confirmation line to stdout.

    Returns:
        The transcript as a formatted string.
    """
    if languages is None:
        languages = ["en"]

    try:
        transcript_list = YouTubeTranscriptApi().list(video_id)

        try:
            transcript = transcript_list.find_transcript(languages)
        except NoTranscriptFound:
            # None of the preferred languages exist: fall back to whatever
            # transcript the video offers first.
            all_transcripts = list(transcript_list)
            if not all_transcripts:
                print(f"Error: No transcript is available for video '{video_id}'.", file=sys.stderr)
                sys.exit(1)

            transcript = all_transcripts[0]
            print(
                f"Warning: None of the requested languages found. "
                f"Using [{transcript.language_code}] {transcript.language} instead.",
                file=sys.stderr,
            )

        # Snippets in the fetched data are read by key when they are dicts and
        # by attribute otherwise (see _snippet_field), so the timestamped text
        # output works with either representation.
        result = _render_transcript(transcript.fetch(), output_format, preserve_timestamps)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Transcript saved to: {output_file}")

        return result

    except TranscriptsDisabled:
        print(f"Error: Transcripts are disabled for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)
    except VideoUnavailable:
        print(f"Error: Video '{video_id}' is unavailable or does not exist.", file=sys.stderr)
        sys.exit(1)
    except CouldNotRetrieveTranscript as e:
        print(f"Error for video '{video_id}': {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary: also covers file-write errors from above.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
def fetch_multiple(
    video_ids: list,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_dir: Optional[str] = None,
) -> dict:
    """
    Fetch transcripts for multiple video IDs.

    A video whose fetch fails does not stop the batch: the ``SystemExit``
    raised by ``fetch_transcript`` is caught and recorded as an error entry.

    Args:
        video_ids: List of YouTube video IDs.
        languages: Language preference list.
        output_format: Output format string.
        preserve_timestamps: Include timestamps.
        output_dir: Directory to save individual transcript files. It is
            created if missing; each file is named ``<video_id>.<ext>``.

    Returns:
        Dictionary mapping video_id -> ``{"status": "ok", "transcript": str}``
        on success or ``{"status": "error", "transcript": None}`` on failure.
    """
    import os

    # Both the file extension and the output directory depend only on the
    # arguments, so resolve them once instead of on every iteration.
    ext_map = {"text": "txt", "json": "json", "srt": "srt", "vtt": "vtt"}
    ext = ext_map.get(output_format, "txt")
    if output_dir and video_ids:
        os.makedirs(output_dir, exist_ok=True)

    results = {}
    for vid in video_ids:
        print(f"Fetching: {vid}", file=sys.stderr)
        out_file = os.path.join(output_dir, f"{vid}.{ext}") if output_dir else None
        try:
            transcript = fetch_transcript(
                video_id=vid,
                languages=languages,
                output_format=output_format,
                preserve_timestamps=preserve_timestamps,
                output_file=out_file,
            )
        except SystemExit:
            # fetch_transcript has already printed the reason to stderr.
            results[vid] = {"status": "error", "transcript": None}
        else:
            results[vid] = {"status": "ok", "transcript": transcript}

    return results
|
|
|
|
|
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the transcript fetcher.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv[1:]``, so existing callers are
            unaffected; passing a list allows programmatic use.

    Returns:
        Namespace with the attributes ``video``, ``languages``, ``format``,
        ``timestamps``, ``output`` and ``list``.
    """
    parser = argparse.ArgumentParser(
        description="Fetch YouTube video transcripts directly — no scraping required.",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "video",
        nargs="+",
        help="YouTube video URL(s) or video ID(s).",
    )

    parser.add_argument(
        "-l", "--languages",
        nargs="+",
        default=["en"],
        metavar="LANG",
        help="Language codes in order of preference (default: en).\nExample: --languages en es fr",
    )

    parser.add_argument(
        "-f", "--format",
        choices=["text", "json", "srt", "vtt"],
        default="text",
        help="Output format (default: text).",
    )

    parser.add_argument(
        "-t", "--timestamps",
        action="store_true",
        help="Include timestamps in plain-text output.",
    )

    parser.add_argument(
        "-o", "--output",
        metavar="PATH",
        help="Output file path (single video) or directory (multiple videos).",
    )

    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available transcript languages for the video(s) and exit.",
    )

    return parser.parse_args(argv)
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Resolves every positional argument to a video ID, then either lists the
    available transcripts (``--list``) or fetches them. A single video is
    printed to stdout or written to the ``--output`` file; several videos are
    printed one after another or written into the ``--output`` directory.

    Exits with status 1 when an argument is not a recognisable video, when a
    listing fails, or when at least one transcript could not be fetched.
    """
    args = parse_args()

    try:
        video_ids = [extract_video_id(v) for v in args.video]
    except ValueError as exc:
        # Bad input is a user error: report it without a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.list:
        failed = False
        for vid in video_ids:
            try:
                list_available_transcripts(vid)
            except CouldNotRetrieveTranscript as exc:
                # Keep listing the remaining videos; report this one like
                # fetch_transcript reports its API errors.
                print(f"Error for video '{vid}': {exc}", file=sys.stderr)
                failed = True
        if failed:
            sys.exit(1)
        return

    if len(video_ids) == 1:
        transcript = fetch_transcript(
            video_id=video_ids[0],
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_file=args.output,
        )
        if not args.output:
            print(transcript)
        return

    results = fetch_multiple(
        video_ids=video_ids,
        languages=args.languages,
        output_format=args.format,
        preserve_timestamps=args.timestamps,
        output_dir=args.output,
    )
    if not args.output:
        for vid, data in results.items():
            print(f"\n{'='*60}")
            print(f"Video ID: {vid}")
            print(f"{'='*60}")
            if data["status"] == "ok":
                print(data["transcript"])
            else:
                print("Failed to retrieve transcript.")

    # Let scripts detect a partially failed batch through the exit status.
    if any(data["status"] != "ok" for data in results.values()):
        sys.exit(1)


if __name__ == "__main__":
    main()