whisper-youtube-2-hf_dataset

Runtime error

App Files Files Community

whisper-youtube-2-hf_dataset / preprocessing /youtubevideopreprocessor.py

juancopi81

Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset

7288748 almost 2 years ago

raw

history blame contribute delete

4.02 kB

	from typing import List, Generator, Tuple
	from pathlib import Path
	from itertools import islice

	import scrapetube
	from youtubesearchpython import ChannelsSearch
	from pytube import Playlist

	from utils import accepts_types
	from loading.serialization import Serializer

	class YoutubeVideoPreprocessor:
	    """Create one small `.json` descriptor per video of a YouTube source.

	    Each descriptor is a dict handed to the serializer with two keys:
	    - channel_name: The name of the YouTube channel (in "playlist" mode,
	      the playlist title is stored here instead)
	    - url: The url of the video

	    Args:
	        mode (`str`):
	            "channel_name" to look a channel up by its name, or "playlist"
	            to read the videos of a playlist URL. Any other value makes
	            `preprocess` return hard-coded placeholder paths.
	        serializer:
	            Object exposing `dump(obj=..., save_path=...)`; it is used to
	            write every descriptor. Defaults to `Serializer`.

	    TODO: Change it to accept also URL of video list, name of video list, etc.
	    """

	    def __init__(self,
	                 mode: str = "channel_name",
	                 serializer = Serializer) -> None:
	        self.mode = mode
	        self.serializer = serializer

	    def preprocess(self,
	                   name: str,
	                   num_videos: int,
	                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
	        """Resolve `name` to a stream of videos and write their descriptors.

	        Args:
	            name (`str`):
	                Channel name in "channel_name" mode, playlist URL in
	                "playlist" mode.
	            num_videos (`int`):
	                Maximum number of descriptor files to write.
	            videos_in_ds (`List[str]`):
	                Video ids that must be skipped.

	        Returns:
	            load_paths (`List[Path]`):
	                The paths of the descriptor files that were written.
	            dataset_folder (`Path`):
	                The folder that contains those files.

	        Raises:
	            IndexError: In "channel_name" mode, when the channel search
	                returns no result.
	        """
	        if self.mode == "channel_name":
	            # TODO: Add credits
	            channels_search = ChannelsSearch(name, limit=1)
	            matches = channels_search.result()['result']
	            if not matches:
	                # Same exception type as the bare `[0]` lookup would raise,
	                # but with a message that names the failing channel.
	                raise IndexError(f"No YouTube channel found for {name!r}")
	            channel_id = matches[0]['id']
	            videos = scrapetube.get_channel(channel_id=channel_id)
	        elif self.mode == "playlist":
	            playlist_id = self._extract_playlist_id(name)
	            playlist = Playlist(name)
	            # From here on the playlist title plays the role of the name.
	            name = playlist.title
	            videos = scrapetube.get_playlist(playlist_id)
	        else:
	            # TODO: implement this part
	            youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
	            test_files_folder = youtube_folder/"test/files"
	            return [Path("test.json"), Path("test1.json")], test_files_folder

	        return self._convert_videos_to_json_files(name,
	                                                  videos,
	                                                  num_videos,
	                                                  videos_in_ds)

	    @staticmethod
	    def _extract_playlist_id(playlist_ref: str) -> str:
	        """Return the playlist id contained in a playlist URL.

	        The `list` query parameter is preferred, so URLs that carry other
	        parameters after it (e.g. `...&list=PL123&index=3`) still resolve
	        to the playlist id. When there is no `list` parameter, the text
	        after the last "=" is returned, which leaves a bare id unchanged.
	        """
	        from urllib.parse import parse_qs, urlparse

	        try:
	            list_values = parse_qs(urlparse(playlist_ref).query).get("list")
	        except ValueError:
	            # Malformed URL: fall through to the plain split below.
	            list_values = None
	        if list_values:
	            return list_values[0]
	        return playlist_ref.split("=")[-1]

	    def _convert_videos_to_json_files(self,
	                                      name: str,
	                                      videos: Generator,
	                                      num_videos: int,
	                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
	        """Write up to `num_videos` descriptors for videos not yet known.

	        Args:
	            name (`str`):
	                Stored as "channel_name" in every descriptor and used as
	                the name of the output folder.
	            videos (`Generator`):
	                Yields dicts that expose the video id under "videoId".
	                Any iterable works; it is consumed lazily.
	            num_videos (`int`):
	                Maximum number of files to write.
	            videos_in_ds (`List[str]`):
	                Video ids to skip; skipped videos do not count towards
	                `num_videos`.

	        Returns:
	            load_paths (`List[Path]`):
	                The written files, named "0.json", "1.json", ... in order.
	            dataset_folder (`Path`):
	                The folder that contains them.
	        """
	        youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
	        dataset_folder = youtube_folder/name
	        dataset_folder.mkdir(parents=True, exist_ok=True)

	        # Build the lookup once: set membership is O(1) per video.
	        known_ids = set(videos_in_ds or ())
	        new_videos = (video for video in videos
	                      if video["videoId"] not in known_ids)

	        load_paths = []
	        # islice stops pulling from the source as soon as the quota is
	        # reached; max() keeps a negative quota equivalent to "nothing".
	        for index, video in enumerate(islice(new_videos, max(num_videos, 0))):
	            save_path = Path(dataset_folder, f"{index}.json")
	            save_path.touch(exist_ok=True)
	            video_dict = {"channel_name": name,
	                          "url": f"https://www.youtube.com/watch?v={video['videoId']}"}
	            self.serializer.dump(obj=video_dict, save_path=save_path)
	            load_paths.append(save_path)
	        return load_paths, dataset_folder