lanbogao committed on
Commit
eb92911
1 Parent(s): 5c48e61

1. Add retries when extracting video info, and use the proxy on retry if one is configured.

Browse files

2. Enable the proxy by default for the subtitle-urls and subtitle-dl URLs; disable the proxy by default for the subtitle URL.

Files changed (3) hide show
  1. fetchYoutubeSubtitle.py +168 -118
  2. main.py +5 -0
  3. requirements.txt +2 -1
fetchYoutubeSubtitle.py CHANGED
@@ -7,12 +7,27 @@ import traceback
7
  from typing import Optional
8
  import xml.etree.ElementTree as ElementTree
9
  from html import unescape
10
- import yt_dlp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  debug = os.getenv("DEBUG")
13
  # yt-dlp subtitle types: json3,srv1,srv2,srv3,ttml,vtt, xml(youtube url with out extargs)
14
 
15
-
16
  # "subtitles": {
17
  # "live_chat": [
18
  # {
@@ -63,7 +78,7 @@ def getSubtitleOptions(
63
  if proxy:
64
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
65
 
66
- # print(ydl_opts)
67
  return ydl_opts
68
 
69
 
@@ -179,72 +194,68 @@ async def fetchAnySubtitle(
179
  url: str,
180
  lang: Optional[str] = "en",
181
  subType: Optional[str] = "vtt",
182
- skipEmpty: bool = True,
183
  proxy: Optional[str] = None,
184
  ) -> dict:
185
  # lang-code or lang.* .* is regex
186
  # reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
187
 
188
- ydl_opts = getSubtitleOptions(lang, proxy)
189
-
190
  title = "unknow"
191
  duration = ""
192
  try:
193
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
194
- info_dict = ydl.extract_info(url, download=False)
195
- # print(json.dumps(info_dict))
196
- title = info_dict.get("title", "unknow")
197
- seconds = info_dict.get("duration")
198
- duration = str(seconds) if seconds else ""
199
- thumbnail = info_dict.get("thumbnail")
200
- if ".webp" in thumbnail:
201
- thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
202
- info_dict.get("id")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  )
204
-
205
- reqType = subType
206
- if info_dict.get("extractor") == "youtube" and subType in ["srt", "txt"]:
207
- reqType = "xml"
208
- if debug:
 
 
 
 
 
 
 
209
  print(
210
- "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
211
- info_dict.get("subtitles").keys(),
212
- info_dict.get("automatic_captions").keys(),
213
- (
214
- info_dict.get("requested_subtitles").keys()
215
- if info_dict.get("requested_subtitles")
216
- else {}
217
- ),
218
  )
219
  )
220
-
221
- subtitle_funcs = [
222
- getRequestedSubtitlesUrl,
223
- getSubtitleLangUrl,
224
- getSubtitleOtherUrl,
225
- ]
226
- for index in range(len(subtitle_funcs)):
227
- subtitle_url = subtitle_funcs[index](info_dict, lang, reqType)
228
- if subtitle_url:
229
- # print("subtitle_url: {}".format(subtitle_url))
230
- subtitle = fetchSubtitlebydlUrl(ydl, subType, subtitle_url)
231
- print(
232
- "function index:{}, url:{}, title:{}, duration:{} len(subtitle): {}".format(
233
- index, url, title, duration, len(subtitle or "")
234
- )
235
- )
236
- if subtitle is not None:
237
- return {
238
- "id": info_dict.get("id"),
239
- "url": url,
240
- "title": title,
241
- "thumbnail": thumbnail,
242
- "duration": duration,
243
- "subtitle": subtitle,
244
- "chapters": info_dict.get("chapters", None),
245
- }
246
  except Exception as e:
247
- print(e)
248
  traceback.print_exc()
249
  return {"error": str(e)}
250
  return {"title": title, "duration": duration, "error": "No subtitles"}
@@ -330,40 +341,49 @@ def xml_caption_to_txt(xml_captions: str, skip_empty: bool = True) -> str:
330
 
331
 
332
  async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
333
- ydl_opts = getSubtitleOptions(proxy)
334
-
335
- title = "unknow"
336
- duration = ""
337
  try:
338
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
339
- info_dict = ydl.extract_info(url, download=False)
340
- title = info_dict.get("title", "unknow")
341
- seconds = info_dict.get("duration")
342
- duration = str(seconds) if seconds else ""
343
- thumbnail = info_dict.get("thumbnail")
344
- if ".webp" in thumbnail:
345
- thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
346
- info_dict.get("id")
347
- )
348
- return {
349
- "id": info_dict.get("id"),
350
- "url": url,
351
- "title": title,
352
- "thumbnail": thumbnail,
353
- "duration": duration,
354
- "subtitles": info_dict.get("subtitles"),
355
- "automatic_captions": info_dict.get("automatic_captions"),
356
- }
357
 
358
  except Exception as e:
 
 
359
  return {"error": str(e)}
360
 
361
 
362
- def fetchSubtitlebydlUrl(ydl, subType, dlUrl, skipEmpty=True):
363
- dlUrl = dlUrl if subType not in ["srt", "txt"] else re.sub(r"&fmt=[\w]+", "", dlUrl)
 
 
 
 
 
 
364
 
 
 
 
 
365
  try:
366
- with ydl.urlopen(dlUrl) as resp:
 
367
  if subType == "srt":
368
  return xml_caption_to_srt(resp.read().decode(), skipEmpty)
369
  elif subType == "txt":
@@ -389,49 +409,79 @@ def getSubtitleUrlByLang(info_dict, lang, subType, isLangKey):
389
  return subtitle_url
390
 
391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  async def fetchSubtitleByInfo(
393
  url: str, subType: str, dlInfo, proxy: Optional[str] = None
394
  ):
395
  try:
396
  reqType = "xml" if subType in ["srt", "txt"] else subType
397
- ydl_opts = getSubtitleOptions(dlInfo.get("lang", None), proxy)
398
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
399
- subtitle = None
400
- if "dlUrl" in dlInfo:
401
- subtitle = fetchSubtitlebydlUrl(
402
- ydl, subType, dlInfo.get("dlUrl"), False
403
- )
404
- if subtitle is not None:
405
- return subtitle
406
 
407
- info_dict = ydl.extract_info(url, download=False)
408
- if debug:
409
- print(
410
- "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
411
- info_dict.get("subtitles").keys(),
412
- info_dict.get("automatic_captions").keys(),
413
- (
414
- info_dict.get("requested_subtitles").keys()
415
- if info_dict.get("requested_subtitles")
416
- else {}
417
- ),
418
- )
419
- )
420
-
421
- subtitleUrl = None
422
- if "langKey" in dlInfo:
423
- subtitleUrl = getSubtitleUrlByLang(
424
- info_dict, dlInfo.get("langKey"), reqType, True
425
- )
426
- if subtitleUrl is None:
427
- subtitleUrl = getSubtitleUrlByLang(
428
- info_dict, dlInfo.get("lang"), reqType, False
429
  )
430
-
431
- print("subtitleUrl: {}".format(subtitleUrl))
432
- subtitle = fetchSubtitlebydlUrl(ydl, subType, subtitleUrl, False)
433
- return subtitle
 
 
 
 
 
 
 
 
 
 
 
434
  except Exception as e:
435
- print(e)
436
  traceback.print_exc()
437
  return {"error": str(e)}
 
7
  from typing import Optional
8
  import xml.etree.ElementTree as ElementTree
9
  from html import unescape
10
+ from yt_dlp import YoutubeDL, DownloadError
11
+ from yt_dlp.networking import Request
12
+ from yt_dlp.utils import sanitize_filename, random_user_agent
13
+
14
+ NO_RETRY_STR = [
15
+ "Sorry about that",
16
+ "unavailable",
17
+ "not available",
18
+ ]
19
+
20
+ RETRY_STR = [
21
+ "URLError",
22
+ "429",
23
+ "IncompleteRead",
24
+ "Remote end closed connection",
25
+ # "No video formats found",
26
+ ]
27
 
28
  debug = os.getenv("DEBUG")
29
  # yt-dlp subtitle types: json3,srv1,srv2,srv3,ttml,vtt, xml(youtube url with out extargs)
30
 
 
31
  # "subtitles": {
32
  # "live_chat": [
33
  # {
 
78
  if proxy:
79
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
80
 
81
+ print(ydl_opts)
82
  return ydl_opts
83
 
84
 
 
194
  url: str,
195
  lang: Optional[str] = "en",
196
  subType: Optional[str] = "vtt",
 
197
  proxy: Optional[str] = None,
198
  ) -> dict:
199
  # lang-code or lang.* .* is regex
200
  # reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
201
 
 
 
202
  title = "unknow"
203
  duration = ""
204
  try:
205
+ ydl, info_dict = extractInfo(url, lang, proxy, False)
206
+ # print(json.dumps(info_dict))
207
+ title = sanitize_filename(info_dict.get("title", "unknow"))
208
+ seconds = info_dict.get("duration")
209
+ duration = str(seconds) if seconds else ""
210
+ thumbnail = info_dict.get("thumbnail")
211
+ if ".webp" in thumbnail:
212
+ thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
213
+ info_dict.get("id")
214
+ )
215
+
216
+ reqType = subType
217
+ if info_dict.get("extractor") == "youtube" and subType in ["srt", "txt"]:
218
+ reqType = "xml"
219
+ if debug:
220
+ print(
221
+ "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
222
+ info_dict.get("subtitles").keys(),
223
+ info_dict.get("automatic_captions").keys(),
224
+ (
225
+ info_dict.get("requested_subtitles").keys()
226
+ if info_dict.get("requested_subtitles")
227
+ else {}
228
+ ),
229
  )
230
+ )
231
+
232
+ subtitle_funcs = [
233
+ getRequestedSubtitlesUrl,
234
+ getSubtitleLangUrl,
235
+ getSubtitleOtherUrl,
236
+ ]
237
+ for index in range(len(subtitle_funcs)):
238
+ subtitle_url = subtitle_funcs[index](info_dict, lang, reqType)
239
+ if subtitle_url:
240
+ # print("subtitle_url: {}".format(subtitle_url))
241
+ subtitle = fetchSubtitleBydlUrl(subType, subtitle_url, ydl=ydl)
242
  print(
243
+ "function index:{}, url:{}, title:{}, duration:{} len(subtitle): {}".format(
244
+ index, url, title, duration, len(subtitle or "")
 
 
 
 
 
 
245
  )
246
  )
247
+ if subtitle is not None:
248
+ return {
249
+ "id": info_dict.get("id"),
250
+ "url": url,
251
+ "title": title,
252
+ "thumbnail": thumbnail,
253
+ "duration": duration,
254
+ "subtitle": subtitle,
255
+ "chapters": info_dict.get("chapters", None),
256
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  except Exception as e:
258
+ print("{}, {}".format(e, url))
259
  traceback.print_exc()
260
  return {"error": str(e)}
261
  return {"title": title, "duration": duration, "error": "No subtitles"}
 
341
 
342
 
343
  async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
 
 
 
 
344
  try:
345
+ _, info_dict = extractInfo(url, None, proxy, True)
346
+
347
+ title = sanitize_filename(info_dict.get("title", "unknow"))
348
+ seconds = info_dict.get("duration")
349
+ duration = str(seconds) if seconds else ""
350
+ thumbnail = info_dict.get("thumbnail")
351
+ if ".webp" in thumbnail:
352
+ thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(
353
+ info_dict.get("id")
354
+ )
355
+ return {
356
+ "id": info_dict.get("id"),
357
+ "url": url,
358
+ "title": title,
359
+ "thumbnail": thumbnail,
360
+ "duration": duration,
361
+ "subtitles": info_dict.get("subtitles"),
362
+ "automatic_captions": info_dict.get("automatic_captions"),
363
+ }
364
 
365
  except Exception as e:
366
+ print("{}, {}".format(e, url))
367
+ traceback.print_exc()
368
  return {"error": str(e)}
369
 
370
 
371
def createHeaders():
    """Return browser-like HTTP request headers for subtitle downloads.

    The fixed Accept* values are combined with a User-Agent obtained from
    yt-dlp's random_user_agent(), so each call may yield a different agent.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-us,en;q=0.5",
    }
    # Added last so the key order matches a single-literal construction.
    headers["User-Agent"] = random_user_agent()
    return headers
379
 
380
+
381
+ def fetchSubtitleBydlUrl(subType, dlUrl, skipEmpty=True, ydl=None):
382
+ dlUrl = dlUrl if subType not in ["srt", "txt"] else re.sub(r"&fmt=[\w]+", "", dlUrl)
383
+ # If the download failed, the info may contain headers and cookies that we could use here.
384
  try:
385
+ ydl = ydl if ydl else YoutubeDL(getSubtitleOptions())
386
+ with ydl.urlopen(Request(dlUrl, headers=createHeaders())) as resp:
387
  if subType == "srt":
388
  return xml_caption_to_srt(resp.read().decode(), skipEmpty)
389
  elif subType == "txt":
 
409
  return subtitle_url
410
 
411
 
412
def extractInfo(url, lang, proxy, forceProxy=False):
    """Extract video info with yt-dlp, retrying transient download errors.

    Args:
        url: Address of the video page to extract.
        lang: Subtitle language passed through to getSubtitleOptions.
        proxy: Optional proxy URL. Used from the first attempt only when
            forceProxy is true; otherwise it is switched on after a
            DownloadError whose message contains "429".
        forceProxy: Use the proxy immediately instead of only after a 429.

    Returns:
        A (ydl, info_dict) tuple: the YoutubeDL instance that performed the
        extraction (callers reuse it to download the subtitle) and the
        dict returned by extract_info(url, download=False).

    Raises:
        Exception: carrying the message of the last DownloadError once the
            attempts are used up or the error is not retryable; the original
            DownloadError is attached as __cause__.
        Exception: any non-DownloadError failure is logged and re-raised
            unchanged.
    """
    max_retry = 2
    http_proxy = proxy if forceProxy else None
    last_error = None

    for _ in range(max_retry):
        try:
            ydl = YoutubeDL(getSubtitleOptions(lang, http_proxy))
            return ydl, ydl.extract_info(url, download=False)
        except DownloadError as e:
            last_error = e
            errMsg = str(e)

            # NOTE(review): the proxy is only switched on for 429 responses;
            # other retryable errors are retried with the same proxy setting.
            # Confirm this is the intended policy.
            if "429" in errMsg:
                http_proxy = proxy

            # Permanent failures win even if a retryable marker also matches.
            if any(s in errMsg for s in NO_RETRY_STR):
                break

            # Only errors explicitly listed as transient are retried.
            if not any(s in errMsg for s in RETRY_STR):
                break
        except Exception as e:
            print(e)
            traceback.print_exc()
            raise  # keep the original traceback intact

    # Same message as before, but chained so the DownloadError is not lost.
    raise Exception(str(last_error)) from last_error
442
+
443
+
444
  async def fetchSubtitleByInfo(
445
  url: str, subType: str, dlInfo, proxy: Optional[str] = None
446
  ):
447
  try:
448
  reqType = "xml" if subType in ["srt", "txt"] else subType
 
 
 
 
 
 
 
 
 
449
 
450
+ subtitle = None
451
+ if "dlUrl" in dlInfo:
452
+ subtitle = fetchSubtitleBydlUrl(subType, dlInfo.get("dlUrl"), False)
453
+ if subtitle is not None:
454
+ return subtitle
455
+
456
+ ydl, info_dict = extractInfo(url, dlInfo.get("lang", None), proxy, False)
457
+
458
+ if debug:
459
+ print(
460
+ "subtitles.keys(): {} automatic_captions: {} requested_subtitles: {}".format(
461
+ info_dict.get("subtitles").keys(),
462
+ info_dict.get("automatic_captions").keys(),
463
+ (
464
+ info_dict.get("requested_subtitles").keys()
465
+ if info_dict.get("requested_subtitles")
466
+ else {}
467
+ ),
 
 
 
 
468
  )
469
+ )
470
+
471
+ subtitleUrl = None
472
+ if "langKey" in dlInfo:
473
+ subtitleUrl = getSubtitleUrlByLang(
474
+ info_dict, dlInfo.get("langKey"), reqType, True
475
+ )
476
+ if subtitleUrl is None:
477
+ subtitleUrl = getSubtitleUrlByLang(
478
+ info_dict, dlInfo.get("lang"), reqType, False
479
+ )
480
+
481
+ print("subtitleUrl: {}".format(subtitleUrl))
482
+ subtitle = fetchSubtitleBydlUrl(subType, subtitleUrl, False, ydl)
483
+ return subtitle
484
  except Exception as e:
485
+ print("{}, {}".format(e, url))
486
  traceback.print_exc()
487
  return {"error": str(e)}
main.py CHANGED
@@ -68,8 +68,13 @@ async def download(
68
  if token != x_token:
69
  raise HTTPException(status_code=401, detail="Invalid token")
70
 
 
71
  try:
72
  dlInfo = json.loads(info)
 
 
 
 
73
  # print(
74
  # "url: {}, fileName: {}, fileType: {}, dlInfo: {}".format(
75
  # url, fileName, fileType, dlInfo
 
68
  if token != x_token:
69
  raise HTTPException(status_code=401, detail="Invalid token")
70
 
71
+ dlInfo = None
72
  try:
73
  dlInfo = json.loads(info)
74
+ except Exception:
75
+ raise HTTPException(status_code=400, detail="Invalid params")
76
+
77
+ try:
78
  # print(
79
  # "url: {}, fileName: {}, fileType: {}, dlInfo: {}".format(
80
  # url, fileName, fileType, dlInfo
requirements.txt CHANGED
@@ -4,4 +4,5 @@ fastapi==0.95.*
4
  # torch==1.11.*
5
  # transformers==4.*
6
  uvicorn[standard]==0.17.*
7
- yt-dlp==2023.06.22
 
 
4
  # torch==1.11.*
5
  # transformers==4.*
6
  uvicorn[standard]==0.17.*
7
+ # yt-dlp==2023.07.06
8
+ yt-dlp @ git+https://github.com/yt-dlp/yt-dlp.git@6014355c6142f68e20c8374e3787e5b5820f19e2 # jul 30