Spaces:
Sleeping
Sleeping
1. Fix videos whose automatic_captions is empty (e.g. VSFg5LZYsyc).
Browse files
3. Support subtitle language format like lang-code.
2. Add requested_subtitles and parse it first.
- fetchYoutubeSubtitle.py +166 -46
fetchYoutubeSubtitle.py
CHANGED
@@ -1,13 +1,17 @@
|
|
|
|
1 |
import json
|
2 |
import math
|
3 |
import time
|
|
|
4 |
from typing import Optional
|
5 |
import xml.etree.ElementTree as ElementTree
|
6 |
from html import unescape
|
7 |
import yt_dlp
|
8 |
|
|
|
9 |
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (YouTube URL without the "&fmt=<ext>" argument)
|
10 |
|
|
|
11 |
# "subtitles": {
|
12 |
# "live_chat": [
|
13 |
# {
|
@@ -18,53 +22,134 @@ import yt_dlp
|
|
18 |
# }
|
19 |
# ]
|
20 |
# }
|
21 |
-
def
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
for subtitle in item[l]:
|
31 |
-
# print("getUrlFromSubtitles subtitle: %s" % subtitle)
|
32 |
-
if l != "live_chat" and subType =="xml":
|
33 |
-
# print("subtitle source url: {}".format(subtitle.get("url")))
|
34 |
-
return subtitle.get("url").replace("&fmt="+subtitle.get("ext"),"")
|
35 |
if subtitle.get("ext") == subType:
|
|
|
|
|
36 |
return subtitle.get("url")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
return None
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
return await fetchSubtitlebyType(url, lang, subType, proxy)
|
41 |
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
ydl_opts = {
|
44 |
"noplaylist": True,
|
45 |
-
"writesubtitles":
|
46 |
-
"
|
47 |
-
"
|
|
|
|
|
48 |
"skip_download": True,
|
49 |
"socket_timeout": 10,
|
50 |
"extractor_retries": 0,
|
51 |
"extractor_args": {
|
52 |
"youtube": {
|
53 |
"player_skip": ["configs", "initial"], # "webpage",
|
54 |
-
"player_client": ["
|
55 |
-
"skip": ["hls", "dash", "translated_subs"
|
56 |
}
|
57 |
},
|
58 |
}
|
59 |
|
60 |
if proxy:
|
61 |
ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
|
62 |
-
|
63 |
title = "unknow"
|
64 |
duration = ""
|
65 |
try:
|
66 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
67 |
info_dict = ydl.extract_info(url, download=False)
|
|
|
68 |
title = info_dict.get("title", "unknow")
|
69 |
seconds = info_dict.get("duration")
|
70 |
duration = str(seconds) if seconds else ""
|
@@ -72,21 +157,46 @@ async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Opt
|
|
72 |
if info_dict.get("extractor") == "youtube" and subType == "srt":
|
73 |
subType = "xml"
|
74 |
isSrt = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
except Exception as e:
|
|
|
|
|
87 |
return {"error": str(e)}
|
88 |
return {"title": title, "duration": duration, "error": "No subtitles"}
|
89 |
|
|
|
90 |
def float_to_srt_time_format(d: float) -> str:
|
91 |
"""Convert decimal durations into proper srt format.
|
92 |
:rtype: str
|
@@ -99,7 +209,8 @@ def float_to_srt_time_format(d: float) -> str:
|
|
99 |
ms = f"{fraction:.3f}".replace("0.", "")
|
100 |
return time_fmt + ms
|
101 |
|
102 |
-
|
|
|
103 |
"""Convert xml caption tracks to "SubRip Subtitle (srt)".
|
104 |
:param str xml_captions:
|
105 |
XML formatted caption tracks.
|
@@ -108,7 +219,9 @@ def xml_caption_to_srt( xml_captions: str) -> str:
|
|
108 |
root = ElementTree.fromstring(xml_captions)
|
109 |
for i, child in enumerate(list(root)):
|
110 |
text = child.text or ""
|
111 |
-
caption = unescape(
|
|
|
|
|
112 |
try:
|
113 |
duration = float(child.attrib["dur"])
|
114 |
except KeyError:
|
@@ -125,25 +238,27 @@ def xml_caption_to_srt( xml_captions: str) -> str:
|
|
125 |
segments.append(line)
|
126 |
return "\n".join(segments).strip()
|
127 |
|
|
|
128 |
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
|
129 |
ydl_opts = {
|
130 |
"noplaylist": True,
|
131 |
-
"writesubtitles": False,
|
132 |
-
"allsubtitles":
|
133 |
-
"
|
|
|
134 |
"socket_timeout": 10,
|
135 |
"extractor_retries": 0,
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
}
|
144 |
if proxy:
|
145 |
ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
|
146 |
-
|
147 |
title = "unknow"
|
148 |
duration = ""
|
149 |
try:
|
@@ -153,7 +268,12 @@ async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
|
|
153 |
seconds = info_dict.get("duration")
|
154 |
duration = str(seconds) if seconds else ""
|
155 |
|
156 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
except Exception as e:
|
159 |
return {"error": str(e)}
|
|
|
1 |
+
import os
|
2 |
import json
|
3 |
import math
|
4 |
import time
|
5 |
+
import traceback
|
6 |
from typing import Optional
|
7 |
import xml.etree.ElementTree as ElementTree
|
8 |
from html import unescape
|
9 |
import yt_dlp
|
10 |
|
11 |
+
debug = os.getenv("DEBUG")
|
12 |
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (YouTube URL without the "&fmt=<ext>" argument)
|
13 |
|
14 |
+
|
15 |
# "subtitles": {
|
16 |
# "live_chat": [
|
17 |
# {
|
|
|
22 |
# }
|
23 |
# ]
|
24 |
# }
|
25 |
+
def getUrlFromSubtitleItem(item, lang="en", subType="vtt"):
    """Return the URL of the ``subType`` subtitle track for ``lang``.

    :param item: mapping of language code to a list of subtitle-format
        dicts; each dict is read through ``.get("url")`` and ``.get("ext")``.
    :param lang: language key to look up in ``item``.
    :param subType: wanted subtitle extension. ``"xml"`` is special-cased:
        for any language other than ``"live_chat"`` the first usable entry's
        URL is returned with its ``"&fmt=<ext>"`` argument stripped.
    :returns: the matching URL, or ``None`` when ``lang`` is absent from
        ``item`` or no entry matches.
    """
    # A missing/None mapping or language is treated as "no tracks" instead of
    # raising KeyError/TypeError, so callers can fall through to other sources.
    for subtitle in (item or {}).get(lang) or []:
        url = subtitle.get("url")
        if not url:
            # An entry without a URL cannot be fetched; try the next one
            # rather than crashing on ``None.replace``.
            continue
        ext = subtitle.get("ext")
        if lang != "live_chat" and subType == "xml":
            if debug:
                print("subtitle source lang:{} url: {}".format(lang, url))
            # Dropping the fmt argument yields the raw xml variant of the URL.
            return url.replace("&fmt=" + ext, "") if ext else url
        if ext == subType:
            if debug:
                print("subtitle lang:{} url: {}".format(lang, url))
            return url

    return None
|
39 |
+
|
40 |
+
|
41 |
+
def getRequestedSubtitlesUrl(info_dict, lang, subType):
    """Look up a subtitle URL in ``info_dict["requested_subtitles"]``.

    Every language key that starts with ``lang`` is tried in mapping order
    and the first URL found by ``getUrlFromSubtitleItem`` is returned.

    :param info_dict: extractor info mapping; ``"requested_subtitles"`` may
        be missing or ``None``, in which case nothing is found.
    :param lang: language prefix to match (e.g. ``"en"`` matches ``"en-US"``).
    :param subType: subtitle extension passed on to ``getUrlFromSubtitleItem``.
    :returns: the URL, or ``None`` when no requested subtitle matches.
    """
    # ``or {}`` guards against a missing/None entry, which previously raised
    # AttributeError and aborted the whole lookup chain.
    requested = info_dict.get("requested_subtitles") or {}
    for code, entry in requested.items():
        if not code.startswith(lang):
            continue
        # An entry may be a single format dict; wrap it so the helper sees the
        # list-of-formats shape it iterates over. A fresh per-language mapping
        # is built each time so other languages stay available for later
        # iterations (the original rebound the source mapping and lost them).
        formats = [entry] if isinstance(entry, dict) else entry
        url = getUrlFromSubtitleItem({code: formats}, code, subType)
        if url:
            if debug:
                print("getRequestedSubtitlesUrl lang:{} url:{}".format(code, url))
            return url
    return None
|
53 |
+
|
54 |
+
|
55 |
+
def getSubtitleLangUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Find a subtitle URL for ``lang`` among the ``subTitleKeys`` sections.

    Two passes are made over the sections, in the order given: first an exact
    match on ``lang``, then any language code that starts with ``lang``.

    :param info_dict: extractor info mapping; each section named in
        ``subTitleKeys`` may be missing or ``None``.
    :param lang: language code (exact pass) or prefix (second pass).
    :param subType: subtitle extension passed on to ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search.
    :returns: the first URL found, or ``None``.
    """
    # Normalise missing/None sections to empty dicts; the original called
    # ``.keys()`` on the raw value and crashed when a section was absent.
    sections = [info_dict.get(key) or {} for key in subTitleKeys]

    # Pass 1: exact language match takes priority across every section.
    for tracks in sections:
        if lang in tracks:
            url = getUrlFromSubtitleItem(tracks, lang, subType)
            if url:
                if debug:
                    print("getSubtitleLangUrl lang:{}".format(lang))
                return url

    # Pass 2: fall back to regional variants such as "en-US" for "en".
    for tracks in sections:
        for code in tracks:
            if code.startswith(lang):
                url = getUrlFromSubtitleItem(tracks, code, subType)
                if url:
                    if debug:
                        print("getSubtitleLangUrl lang:{} url:{}".format(code, url))
                    return url

    return None
|
81 |
+
|
82 |
+
|
83 |
+
def getSubtitleOtherUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Last-resort lookup: return a subtitle URL in whatever language exists.

    For each non-empty section in ``subTitleKeys`` one language is chosen:
    ``lang`` if present, otherwise ``"en"``, otherwise the section's first
    language. The first URL found by ``getUrlFromSubtitleItem`` is returned.

    :param info_dict: extractor info mapping; each section named in
        ``subTitleKeys`` may be missing or ``None``.
    :param lang: preferred language code.
    :param subType: subtitle extension passed on to ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search.
    :returns: the first URL found, or ``None``.
    """
    for key in subTitleKeys:
        # Missing/None sections are skipped like empty ones; the original
        # called ``.keys()`` on the raw value and crashed on None.
        tracks = info_dict.get(key) or {}
        if not tracks:
            continue

        if lang in tracks:
            code = lang
        elif "en" in tracks:
            code = "en"
        else:
            # No preferred language available: take the first one listed.
            code = next(iter(tracks))

        url = getUrlFromSubtitleItem(tracks, code, subType)
        if url:
            if debug:
                print("getSubtitleOtherUrl lang:{} url:{}".format(code, url))
            return url

    return None
|
105 |
|
106 |
+
|
107 |
+
async def fetchSubtitle(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Fetch a subtitle for ``url`` by delegating to ``fetchSubtitlebyType``.

    :param url: video URL to extract subtitles from.
    :param lang: requested subtitle language.
    :param subType: requested subtitle format.
    :param proxy: optional proxy, forwarded unchanged.
    :returns: whatever ``fetchSubtitlebyType`` returns for the same arguments.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
|
114 |
|
115 |
+
|
116 |
+
async def fetchSubtitlebyType(
|
117 |
+
url: str,
|
118 |
+
lang: Optional[str] = "en",
|
119 |
+
subType: Optional[str] = "vtt",
|
120 |
+
proxy: Optional[str] = None,
|
121 |
+
) -> dict:
|
122 |
+
# lang-code or lang.* .* is regex
|
123 |
+
reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
|
124 |
+
|
125 |
ydl_opts = {
|
126 |
"noplaylist": True,
|
127 |
+
"writesubtitles": True,
|
128 |
+
"writeautomaticsub": True,
|
129 |
+
# "listsubtitles": True,
|
130 |
+
# "subtitlesformat": subType, # mark due to default youtube no srt and xml format
|
131 |
+
"subtitleslangs": [reqLang],
|
132 |
"skip_download": True,
|
133 |
"socket_timeout": 10,
|
134 |
"extractor_retries": 0,
|
135 |
"extractor_args": {
|
136 |
"youtube": {
|
137 |
"player_skip": ["configs", "initial"], # "webpage",
|
138 |
+
"player_client": ["web"],
|
139 |
+
"skip": ["hls", "dash"], # don't skip "translated_subs"
|
140 |
}
|
141 |
},
|
142 |
}
|
143 |
|
144 |
if proxy:
|
145 |
ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
|
146 |
+
# print(ydl_opts)
|
147 |
title = "unknow"
|
148 |
duration = ""
|
149 |
try:
|
150 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
151 |
info_dict = ydl.extract_info(url, download=False)
|
152 |
+
# print(json.dumps(info_dict))
|
153 |
title = info_dict.get("title", "unknow")
|
154 |
seconds = info_dict.get("duration")
|
155 |
duration = str(seconds) if seconds else ""
|
|
|
157 |
if info_dict.get("extractor") == "youtube" and subType == "srt":
|
158 |
subType = "xml"
|
159 |
isSrt = True
|
160 |
+
if debug:
|
161 |
+
print(
|
162 |
+
"subtitles.keys(): {} automatic_captions: {}".format(
|
163 |
+
info_dict.get("subtitles").keys(),
|
164 |
+
info_dict.get("automatic_captions").keys(),
|
165 |
+
)
|
166 |
+
)
|
167 |
+
|
168 |
+
subtitle_url = getRequestedSubtitlesUrl(info_dict, lang, subType)
|
169 |
+
if not subtitle_url:
|
170 |
+
subtitle_url = getSubtitleLangUrl(info_dict, lang, subType)
|
171 |
+
if not subtitle_url:
|
172 |
+
subtitle_url = getSubtitleOtherUrl(info_dict, lang, subType)
|
173 |
|
174 |
+
if subtitle_url:
|
175 |
+
# print("subtitle_url: {}".format(subtitle_url))
|
176 |
+
with ydl.urlopen(subtitle_url) as response:
|
177 |
+
subtitle = (
|
178 |
+
xml_caption_to_srt(response.read().decode())
|
179 |
+
if isSrt
|
180 |
+
else response.read().decode()
|
181 |
+
)
|
182 |
+
print(
|
183 |
+
"url:{}, title:{}, duration:{} len(subtitle): {}".format(
|
184 |
+
url, title, duration, len(subtitle)
|
185 |
+
)
|
186 |
+
)
|
187 |
+
return {
|
188 |
+
"title": title,
|
189 |
+
"duration": duration,
|
190 |
+
"subtitle": subtitle,
|
191 |
+
"chapters": info_dict.get("chapters", None),
|
192 |
+
}
|
193 |
except Exception as e:
|
194 |
+
print(e)
|
195 |
+
traceback.print_exc()
|
196 |
return {"error": str(e)}
|
197 |
return {"title": title, "duration": duration, "error": "No subtitles"}
|
198 |
|
199 |
+
|
200 |
def float_to_srt_time_format(d: float) -> str:
|
201 |
"""Convert decimal durations into proper srt format.
|
202 |
:rtype: str
|
|
|
209 |
ms = f"{fraction:.3f}".replace("0.", "")
|
210 |
return time_fmt + ms
|
211 |
|
212 |
+
|
213 |
+
def xml_caption_to_srt(xml_captions: str) -> str:
|
214 |
"""Convert xml caption tracks to "SubRip Subtitle (srt)".
|
215 |
:param str xml_captions:
|
216 |
XML formatted caption tracks.
|
|
|
219 |
root = ElementTree.fromstring(xml_captions)
|
220 |
for i, child in enumerate(list(root)):
|
221 |
text = child.text or ""
|
222 |
+
caption = unescape(
|
223 |
+
text.replace("\n", " ").replace(" ", " "),
|
224 |
+
)
|
225 |
try:
|
226 |
duration = float(child.attrib["dur"])
|
227 |
except KeyError:
|
|
|
238 |
segments.append(line)
|
239 |
return "\n".join(segments).strip()
|
240 |
|
241 |
+
|
242 |
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
|
243 |
ydl_opts = {
|
244 |
"noplaylist": True,
|
245 |
+
# "writesubtitles": False,
|
246 |
+
# "allsubtitles": False,
|
247 |
+
"listsubtitles": True,
|
248 |
+
# "skip_download": True,
|
249 |
"socket_timeout": 10,
|
250 |
"extractor_retries": 0,
|
251 |
+
"extractor_args": {
|
252 |
+
"youtube": {
|
253 |
+
"player_skip": ["configs", "initial"], # "webpage",
|
254 |
+
"player_client": ["web"],
|
255 |
+
"skip": ["hls", "dash"], # , "translated_subs"
|
256 |
+
}
|
257 |
+
},
|
258 |
}
|
259 |
if proxy:
|
260 |
ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
|
261 |
+
|
262 |
title = "unknow"
|
263 |
duration = ""
|
264 |
try:
|
|
|
268 |
seconds = info_dict.get("duration")
|
269 |
duration = str(seconds) if seconds else ""
|
270 |
|
271 |
+
return {
|
272 |
+
"title": title,
|
273 |
+
"duration": duration,
|
274 |
+
"subtitles": info_dict.get("subtitles"),
|
275 |
+
"automatic_captions": info_dict.get("automatic_captions"),
|
276 |
+
}
|
277 |
|
278 |
except Exception as e:
|
279 |
return {"error": str(e)}
|