|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +from youtool import YouTube |
| 4 | + |
| 5 | +from .base import Command |
| 6 | + |
| 7 | + |
class VideoTranscription(Command):
    """Download video transcriptions from YouTube videos based on IDs or URLs (or CSV filename with URLs/IDs inside), and save them to files."""

    name = "video-transcription"
    # The four input options share one mutually exclusive group, so the CLI
    # accepts at most one input source per invocation.
    arguments = [
        {"name": "--ids", "type": str, "help": "Video IDs", "nargs": "*", "mutually_exclusive_group": "input_source"},
        {"name": "--urls", "type": str, "help": "Video URLs", "nargs": "*", "mutually_exclusive_group": "input_source"},
        {
            "name": "--urls-file-path",
            "type": Path,
            "help": "Video URLs CSV file path",
            "mutually_exclusive_group": "input_source",
        },
        {
            "name": "--ids-file-path",
            "type": Path,
            "help": "Video IDs CSV file path",
            "mutually_exclusive_group": "input_source",
        },
        {"name": "--output-dir", "type": Path, "help": "Output directory to save transcriptions", "required": True},
        {"name": "--language-code", "type": str, "help": "Language code for transcription", "required": True},
        {"name": "--url_column_name", "type": str, "help": "URL column name on CSV input files"},
        {"name": "--id_column_name", "type": str, "help": "ID column name on CSV input files"},
    ]

    # Default CSV column names, used when the caller does not override them.
    ID_COLUMN_NAME: str = "video_id"
    URL_COLUMN_NAME: str = "video_url"

    @classmethod
    def execute(cls, **kwargs) -> str:
        """Download transcriptions for the requested videos into `output_dir`.

        The videos can be supplied as:

        - a list of YouTube video IDs (`--ids`), or
        - a list of YouTube video URLs (`--urls`), or
        - a CSV file containing those URLs (`--urls-file-path`) or IDs (`--ids-file-path`).

        Args:
            ids (list[str], optional): List of YouTube video IDs.
            urls (list[str], optional): List of YouTube video URLs.
            urls_file_path (Path, optional): Path to a CSV file containing YouTube video URLs.
            ids_file_path (Path, optional): Path to a CSV file containing YouTube video IDs.
            output_dir (Path): Directory where the transcription files are saved.
            language_code (str): Language code for the transcription language.
            api_key (str): API key handed to the `YouTube` client.
            url_column_name (str, optional): Column name for URLs in the CSV input file. Defaults to "video_url".
            id_column_name (str, optional): Column name for IDs in the CSV input file. Defaults to "video_id".

        Returns:
            str: Newline-separated paths of the `<video_id>.<language_code>.vtt` files
            that exist in `output_dir` after the download, in the order the videos
            were requested. Videos with no such file are omitted.

        Raises:
            Exception: If no video ID or URL was provided by any input source.
        """
        # Copy the incoming lists: they are extended below and the caller's
        # objects must not be mutated as a side effect.
        ids = list(kwargs.get("ids") or [])
        urls = list(kwargs.get("urls") or [])
        ids_file_path = kwargs.get("ids_file_path")
        urls_file_path = kwargs.get("urls_file_path")
        output_dir = kwargs.get("output_dir")
        language_code = kwargs.get("language_code")
        api_key = kwargs.get("api_key")

        url_column_name = kwargs.get("url_column_name") or cls.URL_COLUMN_NAME
        id_column_name = kwargs.get("id_column_name") or cls.ID_COLUMN_NAME

        youtube = YouTube([api_key], disable_ipv6=True)

        if ids_file_path:
            ids.extend(cls.data_from_csv(ids_file_path, id_column_name))
        if urls_file_path:
            urls.extend(cls.data_from_csv(urls_file_path, url_column_name))

        if not ids and not urls:
            raise Exception("Either 'ids' or 'urls' must be provided for the video-transcription command")

        for url in urls:
            extracted = cls.video_id_from_url(url)
            if not extracted:
                # No ID could be extracted from this URL; nothing to download.
                continue
            # NOTE(review): the helper is defined outside this class; accept both
            # a single ID string and a list of IDs -- confirm its return type.
            if isinstance(extracted, str):
                ids.append(extracted)
            else:
                ids.extend(extracted)

        # Remove duplicates while keeping first-seen order, so the download
        # order and the returned listing are deterministic.
        ids = list(dict.fromkeys(ids))
        youtube.videos_transcriptions(ids, language_code, output_dir)

        output_dir_path = Path(output_dir)
        expected_files = (output_dir_path / f"{video_id}.{language_code}.vtt" for video_id in ids)
        # Report only the transcriptions that were actually written to disk.
        return "\n".join(str(file_path) for file_path in expected_files if file_path.is_file())
0 commit comments