From 3a0002e7d87fb0628b6a7db5d88af562e50579d9 Mon Sep 17 00:00:00 2001
From: girst
Date: Fri, 17 Jul 2020 23:22:10 +0200
Subject: [PATCH] proxy and fixup subtitles

proxying is necessary, because youtube doesn't send CORS headers. we
don't either, since we expect the consumer of the timedtext-proxy to
only be us ourselves. if someone wants to use it externally (e.g. as an
api), they'd have to add cors headers.

thanks go out to perflyst[m] on #invidious for checking that youtube's
timedtext endpoint is "safe" for us to use.
---
 app/common/common.py                |  8 +++++++-
 app/youtube/__init__.py             | 19 +++++++++++++++++++
 app/youtube/templates/watch.html.j2 |  5 ++---
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/app/common/common.py b/app/common/common.py
index db9ca24..6ddae90 100644
--- a/app/common/common.py
+++ b/app/common/common.py
@@ -244,11 +244,17 @@ def prepare_metadata(metadata):
     some_img = meta2['thumbnail']['thumbnails'][0]
     aspect_ratio = some_img['width'] / some_img['height']
 
+    # Note: we could get subtitles in multiple formats directly by querying
+    # https://video.google.com/timedtext?hl=en&type=list&v= followed by
+    # https://www.youtube.com/api/timedtext?lang=&v=&fmt={srv1|srv2|srv3|ttml|vtt},
+    # but that won't give us autogenerated subtitles (and is an extra request).
+    # we can still add &fmt= to the extracted URLs below (first one takes precedence).
subtitles = sorted([
         {'url':cc['baseUrl'],
          'code':cc['languageCode'],
          'autogenerated':cc.get('kind')=="asr",
-         'name':cc['name']['simpleText']}
+         'name':cc['name']['simpleText'],
+         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
         for cc in metadata.get('captions',{})
             .get('playerCaptionsTracklistRenderer',{})
             .get('captionTracks',[])
diff --git a/app/youtube/__init__.py b/app/youtube/__init__.py
index 461163b..c968e53 100644
--- a/app/youtube/__init__.py
+++ b/app/youtube/__init__.py
@@ -117,6 +117,25 @@ def playlist():
     title, author, videos = parse_xml(xmlfeed)
     return render_template('xmlfeed.html.j2', title=f"{title} by {author}", rows=videos)
 
+@frontend.route('/api/timedtext')
+def timedtext():
+    """Proxy youtube's timedtext api (it sends no CORS headers); clean up autogenerated vtt captions."""
+    r = requests.get("https://www.youtube.com/api/timedtext", request.args.to_dict())
+    # Note: in srv1 format, xmlentities are double-encoded m( a smart quote is
+    # even worse: it's '&39;' wtf!? (at least vtt seems ok)
+    if not r.ok:
+        return f"error: {r.text}", 400 # TODO: better
+    retval = r.text
+    if request.args.get('fmt') == 'vtt' and request.args.get('kind') == 'asr':
+        # autocaptions are extremely confusing, and stuck in the lower-left corner. fix it up a bit
+        retval = re.sub(r"<.+?>", "", retval) # remove inline html-like markup that times each word/adds styles
+        retval = retval.replace("align:start position:0%", "") # let browser position the text itself
+        # each subtitle-line is repeated twice (first on the lower line, then on the next "frame" on the upper
+        # line), which is confusing without animations: drop the line right after each " --> " timing line.
+        lines = retval.split('\n')
+        retval = '\n'.join([line for line, prev in zip(lines, ['']+lines) if " --> " not in prev])
+    return retval, {'Content-Type': r.headers.get("Content-Type")}
+
 @frontend.route('/manage/subscriptions')
 # disabled for guest user: @login_required
 def subscription_manager():
diff --git a/app/youtube/templates/watch.html.j2 b/app/youtube/templates/watch.html.j2
index a958786..293c88e 100644
--- a/app/youtube/templates/watch.html.j2
+++ b/app/youtube/templates/watch.html.j2
@@ -14,10 +14,9 @@
-- 2.39.3