Spaces:
Runtime error
Runtime error
File size: 9,444 Bytes
73a9bbe 093a866 d030b89 73a9bbe ba9fae4 d030b89 ba9fae4 093a866 73a9bbe d030b89 73a9bbe d030b89 73a9bbe d030b89 73a9bbe 49a4a29 73a9bbe b4671e5 73a9bbe ba9fae4 093a866 73a9bbe d8804e5 d030b89 73a9bbe ba9fae4 a0c5866 73a9bbe 0ffddf9 ba9fae4 8a3c63b 0806569 8a3c63b a232a02 0806569 73a9bbe 8a3c63b 093a866 d8804e5 8a3c63b 73a9bbe c997914 3965709 c997914 73a9bbe c997914 3965709 0d59eb5 c997914 0d59eb5 73a9bbe d030b89 73a9bbe c997914 73a9bbe c997914 3965709 093a866 73a9bbe d030b89 73a9bbe d030b89 73a9bbe d030b89 73a9bbe d8804e5 ba9fae4 a0c5866 0806569 73a9bbe 0806569 8a3c63b 3e21f52 0806569 73a9bbe 0806569 73a9bbe ba9fae4 d8804e5 8a3c63b 73a9bbe c997914 3965709 c997914 3965709 d8804e5 73a9bbe d8804e5 c997914 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
import os
import json
import math
import time
import traceback
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# Verbose diagnostic printing is enabled when the DEBUG environment variable
# is set to any non-empty value; helpers below test this flag with `if debug:`.
debug = os.getenv("DEBUG")
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt,
# and xml (the YouTube URL without the ext argument).
# Sample "subtitles" entry for a live chat replay:
# "subtitles": {
#     "live_chat": [
#         {
#             "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
#             "ext": "json",
#             "video_id": "ANtM2bHRz04",
#             "protocol": "youtube_live_chat_replay"
#         }
#     ]
# }
def getUrlFromSubtitleItem(item, lang="en", subType="vtt"):
    """Pick a subtitle URL of the wanted type from one language's track list.

    :param item: mapping of language code -> list of track dicts; each track
        is read here through its "url" and "ext" keys.
    :param lang: language key to look up in ``item``.
    :param subType: wanted subtitle format. "xml" is special-cased: the first
        usable track's URL is returned with its "&fmt=<ext>" argument removed.
    :returns: the URL string, or None when no usable track exists.
    """
    # A missing language (or a None track list) just means "nothing found"
    # rather than a KeyError/TypeError for the caller.
    tracks = (item or {}).get(lang) or []
    for subtitle in tracks:
        url = subtitle.get("url")
        if not url:
            # A track without a URL can never be fetched; try the next one.
            continue
        ext = subtitle.get("ext")
        if lang != "live_chat" and subType == "xml":
            if debug:
                print("subtitle source lang:{} url: {}".format(lang, url))
            # The xml variant is the same URL without the format argument.
            # Only strip it when the track actually declares an extension.
            return url.replace("&fmt=" + ext, "") if ext else url
        if ext == subType:
            if debug:
                print("subtitle lang:{} url: {}".format(lang, url))
            return url
    return None
def getRequestedSubtitlesUrl(info_dict, lang, subType):
    """Find a subtitle URL among the tracks listed in "requested_subtitles".

    Every language key that starts with ``lang`` is tried in turn until one
    yields a URL of the wanted type.

    :param info_dict: extraction result; only its "requested_subtitles" entry
        is read here.
    :param lang: language prefix to match (e.g. "en" also matches "en-US").
    :param subType: wanted subtitle format, forwarded to
        ``getUrlFromSubtitleItem``.
    :returns: the URL string, or None when nothing matches.
    """
    requested = info_dict.get("requested_subtitles")
    if not requested:
        return None
    for code, entry in requested.items():
        if not code.startswith(lang):
            continue
        # The helper expects language -> list of tracks, so a single track
        # dict is wrapped in a list. This is built as a fresh local mapping:
        # rebinding the mapping being iterated made the next matching
        # language fail with KeyError.
        tracks = [entry] if isinstance(entry, dict) else entry
        url = getUrlFromSubtitleItem({code: tracks}, code, subType)
        if url:
            if debug:
                print("getRequestedSubtitlesUrl lang:{} url:{}".format(code, url))
            return url
    return None
def getSubtitleLangUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Find a subtitle URL for ``lang``, preferring an exact language match.

    Pass 1 looks for the exact language code in each section named by
    ``subTitleKeys``; pass 2 accepts any language code starting with ``lang``.

    :param info_dict: extraction result holding the caption sections.
    :param lang: wanted language code or prefix.
    :param subType: wanted subtitle format, forwarded to
        ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search, in
        priority order.
    :returns: the URL string, or None when nothing matches.
    """
    # Pass 1: exact language code.
    for section in subTitleKeys:
        # A section may be absent or None; treat both as empty.
        tracks_by_lang = info_dict.get(section) or {}
        if lang in tracks_by_lang:
            url = getUrlFromSubtitleItem(tracks_by_lang, lang, subType)
            if url:
                if debug:
                    print("getSubtitleLangUrl lang:{}".format(lang))
                return url
    # Pass 2: any regional/variant code sharing the prefix.
    for section in subTitleKeys:
        tracks_by_lang = info_dict.get(section) or {}
        for code in tracks_by_lang:
            if code.startswith(lang):
                url = getUrlFromSubtitleItem(tracks_by_lang, code, subType)
                if url:
                    if debug:
                        print("getSubtitleLangUrl lang:{} url:{}".format(code, url))
                    return url
    return None
def getSubtitleOtherUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Last-resort lookup: return a subtitle URL in whatever language exists.

    For each section named by ``subTitleKeys`` the language is chosen as
    ``lang`` if present, else "en" if present, else the first listed one.

    :param info_dict: extraction result holding the caption sections.
    :param lang: preferred language code.
    :param subType: wanted subtitle format, forwarded to
        ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search, in
        priority order.
    :returns: the URL string, or None when no section yields one.
    """
    for section in subTitleKeys:
        # A section may be absent or None; both mean "no tracks here".
        tracks_by_lang = info_dict.get(section) or {}
        if not tracks_by_lang:
            continue
        if lang in tracks_by_lang:
            code = lang
        elif "en" in tracks_by_lang:
            code = "en"
        else:
            code = next(iter(tracks_by_lang))
        url = getUrlFromSubtitleItem(tracks_by_lang, code, subType)
        if url:
            if debug:
                print("getSubtitleOtherUrl lang:{} url:{}".format(code, url))
            return url
    return None
async def fetchSubtitle(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Fetch subtitles for ``url``; delegates to ``fetchSubtitlebyType``.

    All four arguments are forwarded unchanged and the delegate's result is
    returned as-is.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
async def fetchSubtitlebyType(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Extract video metadata with yt-dlp and download one subtitle track.

    :param url: video page URL handed to yt-dlp.
    :param lang: language code ("en", "en-US") or pattern ("en.*"); None
        falls back to "en".
    :param subType: wanted subtitle format; None falls back to "vtt". For the
        "youtube" extractor, "srt" is produced by fetching the xml captions
        and converting them locally.
    :param proxy: optional proxy URL; when set the socket timeout is raised.
    :returns: on success a dict with "title", "duration", "subtitle" and
        "chapters"; ``{"error": ...}`` when extraction or download raises;
        a dict with "title", "duration" and "error": "No subtitles" when no
        matching track is found.
    """
    # The signature allows None, so fall back to the defaults instead of
    # crashing on lang.split / string concatenation.
    lang = lang or "en"
    subType = subType or "vtt"
    # lang-code or lang.* (".*" is a regex suffix understood by yt-dlp)
    reqLang = lang if "-" in lang or lang.endswith(".*") else lang + ".*"
    # The lookup helpers match with startswith/in, so they need the bare
    # prefix; otherwise "fr.*" never matches and English is returned instead.
    matchLang = lang[:-2] if lang.endswith(".*") else lang
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        # "subtitlesformat" is deliberately not set: YouTube offers no srt or
        # xml format by default.
        "subtitleslangs": [reqLang],
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        "extractor_args": {
            "youtube": {
                # Skipping "webpage" as well makes video l2P5PgL1LfI miss
                # some languages, so only these two are skipped.
                "player_skip": ["configs", "initial"],
                "player_client": ["ios"],
                "skip": ["hls", "dash"],  # don't skip "translated_subs"
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            title = info_dict.get("title", "unknow")
            seconds = info_dict.get("duration")
            duration = str(seconds) if seconds else ""
            isSrt = False
            if info_dict.get("extractor") == "youtube" and subType == "srt":
                # Fetch xml captions and convert them to srt after download.
                subType = "xml"
                isSrt = True
            if debug:
                # Either section can be missing or None; never let the
                # diagnostic print abort the request.
                print(
                    "subtitles.keys(): {} automatic_captions: {}".format(
                        (info_dict.get("subtitles") or {}).keys(),
                        (info_dict.get("automatic_captions") or {}).keys(),
                    )
                )
            # Lookup order: tracks yt-dlp selected, then exact/prefix
            # language match, then any available language.
            subtitle_url = getRequestedSubtitlesUrl(info_dict, matchLang, subType)
            if not subtitle_url:
                subtitle_url = getSubtitleLangUrl(info_dict, matchLang, subType)
            if not subtitle_url:
                subtitle_url = getSubtitleOtherUrl(info_dict, matchLang, subType)
            if subtitle_url:
                with ydl.urlopen(subtitle_url) as response:
                    raw = response.read().decode()
                subtitle = xml_caption_to_srt(raw) if isSrt else raw
                print(
                    "url:{}, title:{}, duration:{} len(subtitle): {}".format(
                        url, title, duration, len(subtitle)
                    )
                )
                return {
                    "title": title,
                    "duration": duration,
                    "subtitle": subtitle,
                    "chapters": info_dict.get("chapters", None),
                }
    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        print(e)
        traceback.print_exc()
        return {"error": str(e)}
    return {"title": title, "duration": duration, "error": "No subtitles"}
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an SRT timestamp.

    The value is rounded to whole milliseconds first, so a fraction such as
    0.9996 carries into the seconds field instead of producing a malformed
    millisecond part. Hours are not wrapped at 24, and negative inputs are
    clamped to zero.

    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each direct child of the root element becomes one cue, timed by its
    "start" and "dur" attributes. A child without a usable "start" is
    skipped; a missing or invalid "dur" gives a zero-length cue. Cues are
    numbered from 1 in output order.

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: the srt text, or "" when there are no cues.
    :raises xml.etree.ElementTree.ParseError: if the input is not valid XML.
    """
    # Written as a product so the two-space literal survives whitespace
    # normalisation of this file.
    double_space = " " * 2
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for child in root:
        try:
            start = float(child.attrib["start"])
        except (KeyError, ValueError):
            # Without a start time the cue cannot be placed; drop it rather
            # than failing the whole conversion.
            continue
        try:
            duration = float(child.attrib["dur"])
        except (KeyError, ValueError):
            duration = 0.0
        text = child.text or ""
        caption = unescape(text.replace("\n", " ").replace(double_space, " "))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=len(segments) + 1,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(start + duration),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """List the subtitle tracks yt-dlp reports for ``url`` without downloading.

    :param url: video page URL handed to yt-dlp.
    :param proxy: optional proxy URL; when set the socket timeout is raised.
    :returns: on success a dict with "title", "duration", "subtitles" and
        "automatic_captions" (the latter two taken straight from the
        extraction result); ``{"error": ...}`` when extraction raises.
    """
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        "extractor_args": {
            "youtube": {
                "player_skip": ["configs", "initial"],
                "player_client": ["ios"],
                "skip": ["hls", "dash"],  # "translated_subs" is kept
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            seconds = info_dict.get("duration")
            return {
                "title": info_dict.get("title", "unknow"),
                "duration": str(seconds) if seconds else "",
                "subtitles": info_dict.get("subtitles"),
                "automatic_captions": info_dict.get("automatic_captions"),
            }
    except Exception as e:
        # Top-level boundary: log like fetchSubtitlebyType does, then report
        # the failure to the caller as data.
        print(e)
        traceback.print_exc()
        return {"error": str(e)}
|