# whisper-youtube-2-hf_dataset/preprocessing/youtubevideopreprocessor.py
# Repository page header (juancopi81): duplicate from
# Whispering-GPT/whisper-youtube-2-hf_dataset, revision 7288748.
from itertools import islice
from pathlib import Path
from typing import List, Generator, Tuple
from urllib.parse import parse_qs, urlparse

from pytube import Playlist
import scrapetube
from youtubesearchpython import ChannelsSearch

from loading.serialization import Serializer
from utils import accepts_types
class YoutubeVideoPreprocessor:
    """Create one JSON file per YouTube video of a channel or a playlist.

    Each JSON file holds the information expected to build a YoutubeVideo
    object:
        - channel_name: The name of the YouTube channel (the playlist title
          when running in ``"playlist"`` mode)
        - url: The url of the video

    Args:
        mode (`str`, defaults to `"channel_name"`):
            How the `name` passed to `preprocess` is interpreted.
            `"channel_name"` searches YouTube for a channel with that name;
            `"playlist"` treats `name` as a playlist URL. Any other value
            makes `preprocess` return placeholder test paths.
        serializer:
            Object exposing `dump(obj=..., save_path=...)`; it is used to
            write each video dictionary to its JSON file.

    TODO: Change it to accept also URL of video list, name of video list, etc.
    """

    def __init__(self,
                 mode: str = "channel_name",
                 serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer

    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write JSON files for up to `num_videos` videos not yet in the dataset.

        Args:
            name (`str`):
                The channel name (mode `"channel_name"`) or the playlist URL
                (mode `"playlist"`).
            num_videos (`int`):
                Maximum number of JSON files to create.
            videos_in_ds (`List[str]`):
                Video ids that must be skipped.

        Returns:
            `Tuple[List[Path], Path]`: The paths of the JSON files that were
            written and the folder that contains them.

        Raises:
            IndexError: In `"channel_name"` mode, when the channel search
                returns no result.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            channels_search = ChannelsSearch(name, limit=1)
            results = channels_search.result()['result']
            if not results:
                # Same exception type as the bare `[0]` lookup would raise,
                # but with a message that says what actually went wrong.
                raise IndexError(f"No YouTube channel found for name {name!r}")
            videos = scrapetube.get_channel(channel_id=results[0]['id'])
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # From here on `name` is the playlist title: it names the dataset
            # folder and fills the "channel_name" field of every JSON file.
            name = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            test_files_folder = self._youtube_folder()/"test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _youtube_folder() -> Path:
        """Return the base folder under which all dataset folders are created."""
        return Path.home()/"whisper_gpt_pipeline/youtube_transcriber"

    @staticmethod
    def _extract_playlist_id(playlist_url: str) -> str:
        """Return the playlist id found in `playlist_url`.

        The id is read from the `list` query parameter, so URLs carrying
        additional parameters (e.g. `...?list=PL123&index=2`) are handled.
        When no `list` parameter is present the text after the last `=` is
        returned, which is what this class has always done.
        """
        list_values = parse_qs(urlparse(playlist_url).query).get("list")
        if list_values:
            return list_values[0]
        return playlist_url.split("=")[-1]

    def _convert_videos_to_json_files(self,
                                      name: str,
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Dump up to `num_videos` new videos as numbered JSON files.

        Files are named `0.json`, `1.json`, ... inside a folder called `name`
        under the base YouTube folder. Videos whose id is in `videos_in_ds`
        are skipped and do not count towards `num_videos`. Iteration stops
        early when `videos` is exhausted.

        Returns:
            `Tuple[List[Path], Path]`: The written file paths, in creation
            order, and the dataset folder.
        """
        # NOTE(review): `name` is used verbatim as a folder name; a channel or
        # playlist title containing a path separator would change the target
        # location -- confirm whether it should be sanitised.
        dataset_folder = self._youtube_folder()/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        load_paths: List[Path] = []
        if num_videos <= 0:
            # Nothing requested: do not pull anything from the generator.
            return load_paths, dataset_folder

        # Build the lookup once so each membership test is O(1).
        known_ids = set(videos_in_ds)
        for video in videos:
            video_id = video["videoId"]
            if video_id in known_ids:
                continue
            save_path = dataset_folder/f"{len(load_paths)}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url": f"https://www.youtube.com/watch?v={video_id}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)
            if len(load_paths) >= num_videos:
                # Stop right after the last write so no extra video is fetched.
                break
        return load_paths, dataset_folder