Spaces:
Runtime error
Runtime error
File size: 6,206 Bytes
093a866 d030b89 ba9fae4 d030b89 ba9fae4 093a866 d030b89 ba9fae4 d030b89 c997914 d030b89 c997914 d030b89 0d59eb5 d030b89 49a4a29 ba9fae4 093a866 d8804e5 d030b89 d8804e5 ba9fae4 a0c5866 3fca3bd ba9fae4 8a3c63b 3e21f52 8a3c63b 093a866 d8804e5 8a3c63b d8804e5 c997914 3965709 c997914 3965709 0d59eb5 c997914 0d59eb5 d030b89 3e21f52 c997914 0d59eb5 c997914 0d59eb5 6e687fe 860b5e0 c997914 3965709 093a866 d030b89 d8804e5 ba9fae4 a0c5866 3fca3bd ba9fae4 8a3c63b 3e21f52 ba9fae4 d8804e5 8a3c63b d8804e5 c997914 3965709 c997914 3965709 d8804e5 c997914 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import json
import math
import time
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (the YouTube URL without the extra "&fmt=..." argument)
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle URL out of a yt-dlp subtitle mapping.

    :param item: mapping of language code -> list of subtitle entries, where
        each entry is a dict carrying "url" and "ext" keys.
    :param lang: preferred language code. Falls back to 'en', then to the
        first language present in ``item``.
    :param subType: wanted subtitle format, matched against each entry's
        "ext". The special value "xml" returns the first usable entry's URL
        with its "&fmt=<ext>" argument removed (never applied to the
        "live_chat" track).
    :returns: the selected URL, or None when nothing suitable exists.
    """
    if not item:
        return None

    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))

    for subtitle in item[chosen] or []:
        url = subtitle.get("url")
        if not url:
            # An entry without a URL is unusable; keep looking instead of
            # crashing or returning None early.
            continue
        if chosen != "live_chat" and subType == "xml":
            ext = subtitle.get("ext")
            # Without a known ext there is no "&fmt=" argument to strip.
            return url.replace("&fmt=" + ext, "") if ext else url
        if subtitle.get("ext") == subType:
            return url
    return None
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Fetch subtitles for ``url`` by delegating to ``fetchSubtitlebyType``.

    All four arguments are forwarded unchanged and the delegate's result
    dict is returned as-is.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Extract video metadata with yt-dlp and download one subtitle track.

    :param url: video page URL handed to ``YoutubeDL.extract_info``.
    :param lang: preferred subtitle language.
    :param subType: wanted subtitle format. For the "youtube" extractor,
        "srt" is produced by fetching the xml track and converting it with
        ``xml_caption_to_srt``.
    :param proxy: optional proxy URL; when given, the socket timeout is
        raised from 10 to 20.
    :returns: on success a dict with "title", "duration", "subtitle" and
        "chapters"; ``{"error": <message>}`` if any exception is raised;
        otherwise title/duration plus ``"error": "No subtitles"``.

    NOTE: ``extract_info`` and ``urlopen`` are invoked synchronously here
    (nothing in this coroutine is awaited).
    """
    youtube_args = {
        "player_skip": ["configs", "initial"],  # "webpage",
        "player_client": ["android"],
        "skip": ["hls", "dash", "translated_subs"],
    }
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": False,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        "extractor_args": {"youtube": youtube_args},
    }
    if proxy:
        ydl_opts["proxy"] = proxy
        ydl_opts["socket_timeout"] = 20

    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            title = info.get("title", "unknow")
            length = info.get("duration")
            duration = str(length) if length else ""

            # YouTube has no srt track: fetch the xml variant and convert it.
            as_srt = info.get("extractor") == "youtube" and subType == "srt"
            if as_srt:
                subType = "xml"

            # Manually uploaded subtitles take priority over automatic captions.
            for source in ("subtitles", "automatic_captions"):
                tracks = info.get(source)
                if not tracks:
                    continue
                subtitle_url = getUrlFromSubtitles(tracks, lang, subType)
                if not subtitle_url:
                    continue
                with ydl.urlopen(subtitle_url) as response:
                    body = response.read().decode()
                subtitle = xml_caption_to_srt(body) if as_srt else body
                print("url{}, title:{}, duration:{} len(subtitle): {}".format(url, title, duration, len(subtitle)))
                return {
                    "title": title,
                    "duration": duration,
                    "subtitle": subtitle,
                    "chapters": info.get("chapters", None),
                }
    except Exception as e:
        return {"error": str(e)}
    return {"title": title, "duration": duration, "error": "No subtitles"}
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an SRT timestamp.

    The value is rounded to whole milliseconds first, so a fractional part
    that rounds up carries into the seconds field instead of corrupting the
    output. Hours are not wrapped at 24, and negative input is clamped to
    zero.

    :param d: duration in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each child of the XML root becomes one numbered SRT cue. The cue text is
    the child's text with newlines and whitespace runs collapsed to single
    spaces, then HTML-unescaped. Cue timing comes from the child's "start"
    attribute and its optional "dur" attribute (treated as 0 when absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: the SRT document as a single string.
    :raises xml.etree.ElementTree.ParseError: if the input is not valid XML.
    :raises KeyError: if a child element has no "start" attribute.
    """
    root = ElementTree.fromstring(xml_captions)
    segments = []
    # SRT sequence numbers are 1-based.
    for sequence_number, child in enumerate(root, start=1):
        # Collapse newlines and repeated whitespace before unescaping.
        caption = unescape(" ".join((child.text or "").split()))
        start = float(child.attrib["start"])
        end = start + float(child.attrib.get("dur", 0.0))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """List the subtitle tracks yt-dlp reports for a video, without downloading them.

    :param url: video page URL handed to ``YoutubeDL.extract_info``.
    :param proxy: optional proxy URL; when given, the socket timeout is
        raised from 10 to 20.
    :returns: a dict with "title", "duration" (stringified, or "" when
        missing), "subtitles" and "automatic_captions" as reported by
        yt-dlp; or ``{"error": <message>}`` if extraction raises.

    NOTE: ``extract_info`` is invoked synchronously here (nothing in this
    coroutine is awaited).
    """
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": False,
        "allsubtitles": True,
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        # Disabled:
        # "extractor_args": {
        #     "youtube": {
        #         "player_skip": ["webpage", "configs", "initial"],
        #         "player_client": ["android"],
        #         "skip": ["hls", "dash", "translated_subs"],
        #     }
        # },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            seconds = info_dict.get("duration")
            return {
                "title": info_dict.get("title", "unknow"),
                "duration": str(seconds) if seconds else "",
                "subtitles": info_dict.get("subtitles"),
                "automatic_captions": info_dict.get("automatic_captions"),
            }
    except Exception as e:
        # Boundary handler: surface any extraction failure to the caller as data.
        return {"error": str(e)}
|