From 3a0002e7d87fb0628b6a7db5d88af562e50579d9 Mon Sep 17 00:00:00 2001
From: girst <tobi@isticktoit.net>
Date: Fri, 17 Jul 2020 23:22:10 +0200
Subject: [PATCH] proxy and fixup subtitles

proxying is necessary, because youtube doesn't send CORS headers. we
don't either, since we expect the consumer of the timedtext-proxy to
only be us ourselves. if someone wants to use it externally (e.g. as an
api), they'd have to add cors headers.

thanks go out to perflyst[m] on #invidious for checking that
youtube's timedtext endpoint is "safe" for us to use.
---
 app/common/common.py                |  8 +++++++-
 app/youtube/__init__.py             | 19 +++++++++++++++++++
 app/youtube/templates/watch.html.j2 |  5 ++---
 3 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/app/common/common.py b/app/common/common.py
index db9ca24..6ddae90 100644
--- a/app/common/common.py
+++ b/app/common/common.py
@@ -244,11 +244,17 @@ def prepare_metadata(metadata):
         some_img = meta2['thumbnail']['thumbnails'][0]
         aspect_ratio = some_img['width'] / some_img['height']
 
+    # Note: we could get subtitles in multiple formats directly by querying
+    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
+    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
+    # but that won't give us autogenerated subtitles (and is an extra request).
+    # we can still add &fmt= to the extracted URLs below (first one takes precedence).
     subtitles = sorted([
         {'url':cc['baseUrl'],
          'code':cc['languageCode'],
          'autogenerated':cc.get('kind')=="asr", 
-         'name':cc['name']['simpleText']}
+         'name':cc['name']['simpleText'],
+         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
         for cc in metadata.get('captions',{})
             .get('playerCaptionsTracklistRenderer',{})
             .get('captionTracks',[])
diff --git a/app/youtube/__init__.py b/app/youtube/__init__.py
index 461163b..c968e53 100644
--- a/app/youtube/__init__.py
+++ b/app/youtube/__init__.py
@@ -117,6 +117,25 @@ def playlist():
     title, author, videos = parse_xml(xmlfeed)
     return render_template('xmlfeed.html.j2', title=f"{title} by {author}", rows=videos)
 
+@frontend.route('/api/timedtext')
+def timedtext():
+    """Proxy youtube's timedtext endpoint (query string is passed through); tidy up vtt autocaptions."""
+    r = requests.get("https://www.youtube.com/api/timedtext", request.args.to_dict())
+    # Note: in srv1 format, xmlentities are double-encoded m( a smart quote is
+    # even worse: it's '&amp;39;<smartquote>' wtf!? (at least vtt seems ok)
+    if not r.ok:
+        return f"error: {r.text}", 400, {'Content-Type': 'text/plain'} # TODO: better; text/plain so upstream text is never rendered as html
+    retval = r.text
+    if request.args.get('fmt') == 'vtt' and request.args.get('kind') == 'asr':
+        # autocaptions are extremely confusing, and stuck in the lower-left corner. fix it up a bit
+        retval = re.sub(r"<.+?>", "", retval) # remove inline html-like markup that times each word/adds styles
+        retval = retval.replace("align:start position:0%", "") # let browser position the text itself
+        # each subtitle-line is repeated twice (first on the lower line, then on the next "frame" on the upper
+        # line). drop the line right after each timing line; the repetition is confusing without word and line animations:
+        lines = retval.split('\n')
+        retval = '\n'.join([line for line, prev in zip(lines, ['']+lines) if " --> " not in prev])
+    return retval, {'Content-Type': r.headers.get("Content-Type")}
+
 @frontend.route('/manage/subscriptions')
 # disabled for guest user: @login_required
 def subscription_manager():
diff --git a/app/youtube/templates/watch.html.j2 b/app/youtube/templates/watch.html.j2
index a958786..293c88e 100644
--- a/app/youtube/templates/watch.html.j2
+++ b/app/youtube/templates/watch.html.j2
@@ -14,10 +14,9 @@
 <div class="aspect-ratio main-video" style="--aspect-ratio:{{ aspectr }}">
 <video controls poster="{{ poster }}">
 	<source src="{{ video_url }}">
+	{% set cc_default = False %}
 	{% for cc in subtitles %}
-<!-- TODO: CORS error
-	<track label="{{ cc.name }}" kind="subtitles" srclang="{{ cc.code }}" src="{{ cc.url }}" {{ 'default' if not loop.counter }}>
--->
+	<track label="{{ cc.name }}" kind="subtitles" srclang="{{ cc.code }}" src="{{ url_for('youtube.timedtext') }}?{{ cc.query }}" {{ 'default' if cc_default and not loop.index0 }}>
 	{% endfor %}
 </video>
 
-- 
2.39.3