app/frontend.py

   1 import re
   2 import json
   3 import time
   4 import hmac
   5 import hashlib
   6 import sqlite3
   7 import secrets
   8 import requests
   9 import requests_cache
  10 from urllib.parse import parse_qs
  11 from flask import Flask, render_template, request, redirect, flash, url_for, jsonify, g
  12
  13 from common import *
  14
  15 app = Flask(__name__)
  16 app.secret_key = secrets.token_bytes(16) # XXX: generate and hard-code, or cookies and csrf-validation will fail!
  17 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start.
  18 requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
  19
  20 # Note: this should only be required for the 'memory' backed cache.
  21 from threading import Timer
  22 def purge_cache(sec):
  23     requests_cache.remove_expired_responses()
  24     t = Timer(sec, purge_cache, args=(sec,))
  25     t.setDaemon(True)
  26     t.start()
  27 purge_cache(10*60)
  28
  29 @app.route('/')
  30 def index():
  31     return redirect(url_for('feed'), code=302)
  32
  33 @app.route('/feed/subscriptions')
  34 def feed():
  35     token = request.args.get('token', 'guest')
  36     page = int(request.args.get('page', 0))
  37     with sqlite3.connect(cf['global']['database']) as conn:
  38         c = conn.cursor()
  39         c.execute("""
  40            SELECT videos.id, channel_id, name, title, published, flags.display
  41              FROM videos
  42              JOIN channels ON videos.channel_id = channels.id
  43         LEFT JOIN flags ON (videos.id = flags.video_id) AND (flags.user = ?)
  44             WHERE channel_id IN
  45                   (SELECT channel_id FROM subscriptions WHERE user = ?)
  46                   AND flags.display IS NOT 'hidden'
  47          ORDER BY (display = 'pinned') DESC, crawled DESC
  48             LIMIT 36
  49            OFFSET 36*?""", (token, token, page))
  50         rows = [{
  51             'video_id': video_id,
  52             'channel_id': channel_id,
  53             'author': author,
  54             'title': title,
  55             'published': published,
  56             'pinned': display == 'pinned',
  57         } for (video_id, channel_id, author, title, published, display) in c.fetchall()]
  58     return render_template('index.html.j2', rows=rows, page=page)
  59
  60 @app.route('/watch')
  61 def watch():
  62     if not 'v' in request.args:
  63         return "missing video id", 400
  64
  65     plaintextheader = {'content-type': 'text/plain',"Link": "<data:text/css,body%7Bcolor:%23eee;background:%23333%7D>; rel=stylesheet;"}
  66
  67     video_id = request.args.get('v')
  68     (video_url, metadata, error_type, error) = get_video_info(video_id)
  69     if error_type in ['initial', 'player']:
  70         return error, 400, plaintextheader
  71
  72     show = request.args.get("show")
  73     if show == "raw":
  74         if error:
  75             extra = {'geolocked':'local=1', 'livestream':'raw=0'}.get(error,'')
  76             # if error==exhausted, metadata.playabilityStatus.reason may contain additional information.
  77             return f"{error.upper()}: Redirecting to Invidious.", 502, {'Refresh': f'2; URL=https://invidio.us/watch?v={video_id}&{extra}&raw=1', **plaintextheader}
  78         return redirect(video_url, code=307)
  79     elif show == "json":
  80         return jsonify(metadata)
  81     else: # todo: handle geolocked, livesteam and the case when we have an exhausted error with no metadata returned
  82         return render_template('watch.html.j2', video_id=video_id, video_url=video_url, **prepare_metadata(metadata))
  83
  84 def prepare_metadata(metadata):
  85     meta1 = metadata['videoDetails']
  86     meta2 = metadata['microformat']['playerMicroformatRenderer']
  87     cards = metadata['cards']['cardCollectionRenderer']['cards'] if 'cards' in metadata else []
  88     endsc = metadata['endscreen']['endscreenRenderer']['elements'] if 'endscreen' in metadata else []
  89
  90     #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
  91     aspect_ratio = meta2['thumbnail']['thumbnails'][0]['width'] / meta2['thumbnail']['thumbnails'][0]['height']
  92
  93     subtitles = sorted([
  94         {'url':cc['baseUrl'],
  95          'code':cc['languageCode'],
  96          'autogenerated':cc.get('kind')=="asr",
  97          'name':cc['name']['simpleText']}
  98         for cc in metadata['captions']['playerCaptionsTracklistRenderer']['captionTracks']
  99     ], key=lambda cc: cc['autogenerated']) if 'captionTracks' in metadata['captions']['playerCaptionsTracklistRenderer'] else []
 100
 101     def parse_infocard(card):
 102         card = card['cardRenderer']
 103         teaser = card['teaser']['simpleCardTeaserRenderer']['message']['simpleText']  # not used
 104         ctype = list(card['content'].keys())[0]
 105         content = card['content'][ctype]
 106         if ctype == "pollRenderer":
 107             ctype = "POLL"
 108             content = {
 109                 'question': content['question']['simpleText'],
 110                 'answers': [(a['text']['simpleText'],a['numVotes']) for a in content['choices']],
 111             }
 112         elif ctype == "videoInfoCardContentRenderer":
 113             ctype = "VIDEO"
 114             content = {
 115                 'video_id': content['action']['watchEndpoint']['videoId'],
 116                 'title': content['videoTitle']['simpleText'],
 117                 'author': content['channelName']['simpleText'],   # 'by xXxXx'
 118                 'length': content['lengthString']['simpleText'],  # '23:03'
 119                 'views': content['viewCountText']['simpleText'],  # '421,248 views'
 120             }
 121         elif ctype == "playlistInfoCardContentRenderer":
 122             ctype = "PLAYLIST"
 123             content = {
 124                 'playlist_id': content['action']['watchEndpoint']['playlistId'],
 125                 'video_id': content['action']['watchEndpoint']['videoId'],
 126                 'title': content['playlistTitle']['simpleText'],
 127                 'author': content['channelName']['simpleText'],
 128                 'n_videos': content['playlistVideoCount']['simpleText'],  # '21'
 129             }
 130         elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content.get('command',{}).keys():
 131             ctype = "WEBSITE"
 132             content = {
 133                 'url': parse_qs(content['command']['urlEndpoint']['url'].split('?')[1])['q'][0],
 134                 'domain': content['displayDomain']['simpleText'],
 135                 'title': content['title']['simpleText'],
 136                 'text': content['actionButton']['simpleCardButtonRenderer']['text']['simpleText'],
 137             }
 138         else:
 139             import pprint
 140             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 141
 142         return {'teaser': teaser, 'type': ctype, 'content': content}
 143
 144     def parse_endcard(card):
 145         card = card['endscreenElementRenderer'] if 'endscreenElementRenderer' in card.keys() else card
 146         ctype = card['style']
 147         if ctype == "CHANNEL":
 148             content = {
 149                 'channel_id': card['endpoint']['browseEndpoint']['browseId'],
 150                 'title': card['title']['simpleText'],
 151                 'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
 152             }
 153         elif ctype == "VIDEO":
 154             content = {
 155                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 156                 'title': card['title']['simpleText'],
 157                 'length': card['videoDuration']['simpleText'], # '12:21'
 158                 'views': card['metadata']['simpleText'],  # '51,649 views'
 159             }
 160         elif ctype == "PLAYLIST":
 161             content = {
 162                 'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
 163                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 164                 'title': card['title']['simpleText'],
 165                 'author': card['metadata']['simpleText'],
 166                 'n_videos': card['playlistLength']['simpleText'].replace(" videos", ""),
 167             }
 168         elif ctype == "WEBSITE":
 169             content = {
 170                 'url': parse_qs(card['endpoint']['urlEndpoint']['url'].split('?')[1])['q'][0],
 171                 'domain': card['metadata']['simpleText'],
 172                 'title': card['title']['simpleText'],
 173                 'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
 174             }
 175         else:
 176             import pprint
 177             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 178
 179         return {'type': ctype, 'content': content}
 180
 181     return {
 182         'title': meta1['title'],
 183         'author': meta1['author'],
 184         'channel_id': meta1['channelId'],
 185         'description': meta1['shortDescription'],
 186         'published': meta2['publishDate'],
 187         'views': meta1['viewCount'],
 188         'length': int(meta1['lengthSeconds']),
 189         'rating': meta1['averageRating'],
 190         'category': meta2['category'],
 191         'aspectr': aspect_ratio,
 192         'unlisted': meta2['isUnlisted'],
 193         'countries': meta2['availableCountries'],
 194         'poster': meta2['thumbnail']['thumbnails'][0]['url'],
 195         'infocards': [parse_infocard(card) for card in cards],
 196         'endcards': [parse_endcard(card) for card in endsc],
 197         'subtitles': subtitles,
 198     }
 199
 200 def get_video_info(video_id):
 201     """
 202     returns the best-quality muxed video stream, the player_response, error-type/-mesage
 203     error types: 'initial':  the request to get_video_info was malformed
 204                  'player':   playabilityStatus != OK
 205                  'internal': [livestream, geolocked, exhausted]
 206     """
 207     # TODO: caching, e.g. beaker? need to not cache premiering-soon videos/livestreams/etc, though
 208     #        responses are apparently valid for 6h; maybe cache for (video_length - 2h)
 209     # TODO: errro types? ["invalid parameters", playabilitystatus, own]
 210     # todo: a bit messy; should return all unscrambled video urls in best->worst quality
 211
 212     # we try to fetch the video multiple times using different origins
 213     (sts, algo) = get_cipher()
 214     for el in ['embedded', 'detailpage']: # ['el-completely-absent',info,leanback,editpage,adunit,previewpage,profilepage]
 215         r = requests.get(f"https://www.youtube.com/get_video_info"+
 216             f"?video_id={video_id}"+
 217             f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
 218             f"&el={el}"+
 219             f"&sts={sts}"+
 220             f"&hl=en_US") #"&hl=en&gl=US"
 221         params = parse_qs(r.text)
 222         if 'errorcode' in params: # status=fail
 223             return None, None, 'initial', f"MALFORMED: {params['reason'][0]}"
 224
 225         metadata = json.loads(params.get('player_response')[0])
 226         if metadata['playabilityStatus']['status'] != "OK":
 227             if metadata['playabilityStatus']['status'] == "UNPLAYABLE":
 228                 continue  # try again with different 'el' value. if none succeeds, we fall into "exhausted" path, which returns last tried metadata, from which the playabilityStatus.reason can be extracted. according to jwz/youtubedown, the worst error message comes from embedded, which is tried first, so it should be overwritten by a better message.
 229             return None, None, 'player', f"{metadata['playabilityStatus']['status']}: {metadata['playabilityStatus']['reason']}"
 230         if 'liveStreamability' in metadata['playabilityStatus']:
 231             return None, metadata, 'internal', "livestream" # can also check .microformat.liveBroadcastDetails.isLiveNow
 232
 233         formats = metadata['streamingData']['formats']
 234         for (i,v) in enumerate(formats):
 235             if not ('cipher' in v or 'signatureCipher' in v): continue
 236             cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
 237             formats[i]['url'] = unscramble(cipher)
 238
 239         # todo: check if we have urls or try again
 240         url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']
 241
 242         if 'gcr' in parse_qs(url):
 243             return None, metadata, 'internal', "geolocked"
 244
 245         return url, metadata, None, None
 246     else:
 247         return None, metadata, 'internal', "exhausted"
 248
 249 def unscramble(cipher):  # test video id: UxxajLWwzqY
 250     signature = list(cipher['s'][0])
 251     (sts, algo) = get_cipher()
 252     for c in algo.split():
 253         op, ix = re.match(r"([rsw])(\d+)?", c).groups()
 254         if not op: continue
 255         if op == 'r': signature = list(reversed(signature))
 256         if op == 's': signature = signature[int(ix):]
 257         if op == 'w': signature[0], signature[int(ix)%len(signature)] = signature[int(ix)%len(signature)], signature[0]
 258     sp = cipher.get('sp', ['signature'])[0]
 259     sig = cipher['sig'][0] if 'sig' in cipher else ''.join(signature)
 260     return f"{cipher['url'][0]}&{sp}={sig}"
 261
 262 @app.route('/channel/<channel_id>')
 263 def channel(channel_id):
 264     if not re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id):
 265         return "bad channel id", 400 # todo
 266
 267     xmlfeed = fetch_xml("channel_id", channel_id)
 268     if not xmlfeed:
 269         return "not found or something", 404 # XXX
 270     (title, author, _, videos) = parse_xml(xmlfeed)
 271     return render_template('xmlfeed.html.j2', title=author, rows=videos)
 272
 273 @app.route('/playlist')
 274 def playlist():
 275     playlist_id = request.args.get('list')
 276     if not playlist_id:
 277         return "bad list id", 400 # todo
 278
 279     xmlfeed = fetch_xml("playlist_id", playlist_id)
 280     if not xmlfeed:
 281         return "not found or something", 404 # XXX
 282     (title, author, _, videos) = parse_xml(xmlfeed)
 283     return render_template('xmlfeed.html.j2', title=f"{title} by {author}", rows=videos)
 284
 285 @app.route('/subscription_manager')
 286 def subscription_manager():
 287     token = request.args.get('token', 'guest')
 288     with sqlite3.connect(cf['global']['database']) as conn:
 289         #with conn.cursor() as c:
 290             c = conn.cursor()
 291             c.execute("""
 292                   SELECT subscriptions.channel_id, name,
 293                          (subscribed_until < datetime('now')) AS obsolete
 294                     FROM subscriptions
 295                     left JOIN channels ON channels.id = subscriptions.channel_id
 296                     left JOIN websub ON channels.id = websub.channel_id
 297                    WHERE user = ?
 298                 ORDER BY obsolete=0, name COLLATE NOCASE ASC""", (token,))
 299             rows = [{
 300                 'channel_id': channel_id,
 301                 'author': author or channel_id,
 302                 'subscribed_until': subscribed_until
 303             } for (channel_id, author, subscribed_until) in c.fetchall()]
 304     return render_template('subscription_manager.html.j2', rows=rows)
 305
 306 @app.route('/feed/subscriptions', methods=['POST'])
 307 def feed_post():
 308     token = request.args.get('token', 'guest')
 309     if token == 'guest': return "guest user is read-only", 403
 310     action = next(iter(k for k in request.form.keys() if k != 'csrf'), None)
 311     if action in ['pin', 'unpin', 'hide']:
 312         video_id = request.form.get(action)
 313         display = {
 314             'pin': 'pinned',
 315             'unpin': None,
 316             'hide': 'hidden',
 317         }[action]
 318         with sqlite3.connect(cf['global']['database']) as conn:
 319             #with conn.cursor() as c:
 320                 c = conn.cursor()
 321                 c.execute("""
 322                         INSERT OR REPLACE INTO flags (user, video_id, display)
 323                         VALUES (?, ?, ?)
 324                 """, (token, video_id, display))
 325     else:
 326         flash(("error","unsupported action"))
 327     return redirect(request.url, code=303)
 328
 329 @app.route('/subscription_manager', methods=['POST'])
 330 def manage_subscriptions():
 331     token = request.args.get('token', 'guest')
 332     if token == 'guest': return "guest user is read-only", 403
 333     if 'subscribe' in request.form:
 334         channel_id = request.form.get("subscribe")
 335         match = re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id)
 336         if match:
 337             channel_id = match.group(1)
 338         else:
 339             match = re.match(r"((?:PL|LL|EC|UU|FL|UL|OL)[A-Za-z0-9_-]{10,})", channel_id)
 340             if match:  # NOTE: PL-playlists are 32chars, others differ in length.
 341                 flash(("error","playlists not (yet?) supported."))
 342                 return redirect(request.url, code=303) # TODO: dedup redirection
 343             else:
 344                 flash(("error","not a valid/subscribable URI"))
 345                 return redirect(request.url, code=303) # TODO: dedup redirection
 346         with sqlite3.connect(cf['global']['database']) as conn:
 347             #with conn.cursor() as c:
 348                 c = conn.cursor()
 349                 c.execute("""
 350                         INSERT OR IGNORE INTO subscriptions (user, channel_id)
 351                         VALUES (?, ?)
 352                 """, (token, channel_id))
 353                 # TODO: sql-error-handling, asynchronically calling update-subs.pl
 354
 355     elif 'unsubscribe' in request.form:
 356         with sqlite3.connect(cf['global']['database']) as conn:
 357             #with conn.cursor() as c:
 358                 c = conn.cursor()
 359                 c.execute("""
 360                         DELETE FROM subscriptions
 361                         WHERE user = ? AND channel_id = ?
 362                 """, (token, channel_id))
 363                 # TODO: sql-error-handling, report success
 364
 365     else:
 366         flash(("error","unsupported action"))
 367
 368     return redirect(request.url, code=303)
 369
 370 @app.route('/r/')
 371 def reddit_index():
 372     return ""
 373 @app.route('/r/<subreddit>')
 374 def reddit(subreddit="videos"):
 375     count = int(request.args.get('count', 0))
 376     before = request.args.get('before')
 377     after = request.args.get('after')
 378     query = '&'.join([f"{k}={v}" for k,v in [('count',count), ('before',before), ('after',after)] if v])
 379     r = requests.get(f"https://old.reddit.com/r/{subreddit}.json?{query}", headers={'User-Agent':'Mozilla/5.0'})
 380     if not r.ok or not 'data' in r.json():
 381         return r.text+"error retrieving reddit data", 502
 382
 383     good = [e for e in r.json()['data']['children'] if e['data']['score'] > 1]
 384     bad  = [e for e in r.json()['data']['children'] if e['data']['score'] <=1]
 385     videos = []
 386     for entry in (good+bad):
 387         e = entry['data']
 388         if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
 389             continue
 390         video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/)([-_0-9A-Za-z]+)', e['url']).group(1)
 391         if not video_id: continue
 392         videos.append({
 393             'video_id': video_id,
 394             'title': e['title'],
 395             'url': e['permalink'],
 396             'n_comments': e['num_comments'],
 397             'n_karma': e['score'],
 398         })
 399     before = r.json()['data']['before']
 400     after = r.json()['data']['after']
 401     return render_template('reddit.html.j2', subreddit=subreddit, rows=videos, before=before, after=after, count=count)
 402
 403 def get_cipher():
 404     # reload cipher from database every 1 hour
 405     if 'cipher' not in g or time.time() - g.get('cipher_updated', 0) > 1 * 60 * 60:
 406         with sqlite3.connect(cf['global']['database']) as conn:
 407             c = conn.cursor()
 408             c.execute("SELECT sts, algorithm FROM cipher")
 409             g.cipher = c.fetchone()
 410             g.cipher_updated = time.time()
 411
 412     return g.cipher
 413
 414 #@app.teardown_appcontext
 415 #def teardown_db():
 416 #    db = g.pop('db', None)
 417 #
 418 #    if db is not None:
 419 #        db.close()
 420
 421 # Magic CSRF protection: This modifies outgoing HTML responses and injects a csrf token into all forms.
 422 # All post requests are then checked if they contain the valid token.
 423 # TODO:
 424 # - don't use regex for injecting
 425 # - inject a http header into all responses (that could be used by apis)
 426 # - allow csrf token to be passed in http header, json, ...
 427 # - a decorator on routes to opt out of verification or output munging
 428 @app.after_request
 429 def add_csrf_protection(response):
 430     if response.mimetype == "text/html":
 431         token = hmac.new(app.secret_key, request.remote_addr.encode('ascii'), hashlib.sha256).hexdigest()  # TODO: will fail behind reverse proxy (remote_addr always localhost)
 432         response.set_data( re.sub(
 433             rb'''(<[Ff][Oo][Rr][Mm](\s+[a-zA-Z0-9-]+(=(\w*|'[^']*'|"[^"]*"))?)*>)''', # match form tags with any number of attributes and any type of quotes
 434             rb'\1<input type="hidden" name="csrf" value="'+token.encode('ascii')+rb'">', # hackily append a hidden input with our csrf protection value
 435             response.get_data()))
 436     return response
 437 @app.before_request
 438 def verify_csrf_protection():
 439     token = hmac.new(app.secret_key, request.remote_addr.encode('ascii'), hashlib.sha256).hexdigest()  # TODO: will fail behind reverse proxy (remote_addr always localhost)
 440     if request.method == "POST" and request.form.get('csrf') != token:
 441         return "CSRF validation failed!", 400
 442     request.form = request.form.copy() # make it mutable
 443     # request.form.pop('csrf')  # XXX: breaks all requests?!
 444
 445 @app.template_filter('format_date')
 446 def format_date(s):
 447     (y,m,d) = (int(n) for n in s.split('T')[0].split(' ')[0].split('-'))  # iso-dates can seperate date from time with space or 'T'
 448     M = '_ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
 449     return f"{d} {M[m]}"
 450
 451 def pp(*args):
 452     from pprint import pprint
 453     import sys, codecs
 454     pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))
 455
 456 if __name__ == '__main__':
 457     app.run(debug=True)