app/frontend.py

   1 import re
   2 import json
   3 import time
   4 import sqlite3
   5 import secrets
   6 import requests
   7 import requests_cache
   8 from urllib.parse import parse_qs
   9 from flask import Flask, render_template, request, redirect, flash, url_for, jsonify, g
  10
  11 from common import *
  12
app = Flask(__name__)
# A fresh random key is generated every time this module is loaded, so data
# signed with it (presumably the session cookie behind flash()) does not
# outlive a restart of the process.
app.secret_key = secrets.token_bytes(16)
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start.
# Only responses with HTTP status 200 are stored (allowable_codes).
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
  17
  18 # Note: this should only be required for the 'memory' backed cache.
  19 from threading import Timer
  20 def purge_cache(sec):
  21     requests_cache.remove_expired_responses()
  22     t = Timer(sec, purge_cache, args=(sec,))
  23     t.setDaemon(True)
  24     t.start()
  25 purge_cache(10*60)
  26
  27 @app.route('/')
  28 def index():
  29     return redirect(url_for('feed'), code=302)
  30
  31 @app.route('/feed/subscriptions')
  32 def feed():
  33     token = request.args.get('token', 'guest')
  34     page = int(request.args.get('page', 0))
  35     with sqlite3.connect(cf['global']['database']) as conn:
  36         c = conn.cursor()
  37         c.execute("""
  38            SELECT videos.id, channel_id, name, title, published, flags.display
  39              FROM videos
  40              JOIN channels ON videos.channel_id = channels.id
  41         LEFT JOIN flags ON (videos.id = flags.video_id) AND (flags.user = ?)
  42             WHERE channel_id IN
  43                   (SELECT channel_id FROM subscriptions WHERE user = ?)
  44                   AND flags.display IS NOT 'hidden'
  45          ORDER BY (display = 'pinned') DESC, crawled DESC
  46             LIMIT 36
  47            OFFSET 36*?""", (token, token, page))
  48         rows = [{
  49             'video_id': video_id,
  50             'channel_id': channel_id,
  51             'author': author,
  52             'title': title,
  53             'published': published,
  54             'pinned': display == 'pinned',
  55         } for (video_id, channel_id, author, title, published, display) in c.fetchall()]
  56     return render_template('index.html.j2', rows=rows, page=page)
  57
  58 @app.route('/watch')
  59 def watch():
  60     if not 'v' in request.args:
  61         return "missing video id", 400
  62
  63     video_id = request.args.get('v')
  64     (video_url, metadata, error_type, error) = get_video_info(video_id)
  65     if error_type in ['initial', 'player']:
  66         return error, 400, {'content-type': 'text/plain',"Link": "<data:text/css,body%7Bcolor:%23eee;background:%23333%7D>; rel=stylesheet;"}
  67
  68     show = request.args.get("show")
  69     if show == "metadata": # todo: handle the case when we have an exhausted error with no metadata returned
  70         return render_template('watch.html.j2', video_id=video_id, video_url=video_url, **prepare_metadata(metadata))
  71     elif show == "json":
  72         return jsonify(metadata)
  73     else:
  74         if error:
  75             extra = {'geolocked':'local=1', 'livestream':'raw=0'}.get(error,'')
  76             # if error==exhausted, metadata.playabilityStatus.reason may contain additional information.
  77             return f"{error.upper()}: Redirecting to Invidious.", 502, {'Refresh': '2; URL=https://invidio.us/watch?v='+video_id+'&'+extra+'&raw=1','content-type': 'text/plain',"Link": "<data:text/css,body%7Bcolor:%23eee;background:%23333%7D>; rel=stylesheet;"}
  78         return redirect(video_url, code=307)
  79
  80 def prepare_metadata(metadata):
  81     meta1 = metadata['videoDetails']
  82     meta2 = metadata['microformat']['playerMicroformatRenderer']
  83     cards = metadata['cards']['cardCollectionRenderer']['cards'] if 'cards' in metadata else []
  84     endsc = metadata['endscreen']['endscreenRenderer']['elements'] if 'endscreen' in metadata else []
  85
  86     #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
  87     aspect_ratio = meta2['thumbnail']['thumbnails'][0]['width'] / meta2['thumbnail']['thumbnails'][0]['height']
  88
  89     subtitles = sorted([
  90         {'url':cc['baseUrl'],
  91          'code':cc['languageCode'],
  92          'autogenerated':cc.get('kind')=="asr",
  93          'name':cc['name']['simpleText']}
  94         for cc in metadata['captions']['playerCaptionsTracklistRenderer']['captionTracks']
  95     ], key=lambda cc: cc['autogenerated']) if 'captionTracks' in metadata['captions']['playerCaptionsTracklistRenderer'] else []
  96
  97     def parse_infocard(card):
  98         card = card['cardRenderer']
  99         teaser = card['teaser']['simpleCardTeaserRenderer']['message']['simpleText']  # not used
 100         ctype = list(card['content'].keys())[0]
 101         content = card['content'][ctype]
 102         if ctype == "pollRenderer":
 103             ctype = "POLL"
 104             content = {
 105                 'question': content['question']['simpleText'],
 106                 'answers': [(a['text']['simpleText'],a['numVotes']) for a in content['choices']],
 107             }
 108         elif ctype == "videoInfoCardContentRenderer":
 109             ctype = "VIDEO"
 110             content = {
 111                 'video_id': content['action']['watchEndpoint']['videoId'],
 112                 'title': content['videoTitle']['simpleText'],
 113                 'author': content['channelName']['simpleText'],   # 'by xXxXx'
 114                 'length': content['lengthString']['simpleText'],  # '23:03'
 115                 'views': content['viewCountText']['simpleText'],  # '421,248 views'
 116             }
 117         elif ctype == "playlistInfoCardContentRenderer":
 118             ctype = "PLAYLIST"
 119             content = {
 120                 'playlist_id': content['action']['watchEndpoint']['playlistId'],
 121                 'video_id': content['action']['watchEndpoint']['videoId'], # XXX: untested
 122                 'title': content['playlistTitle']['simpleText'],
 123                 'author': content['channelName']['simpleText'],
 124                 'n_videos': content['videoCountText']['simpleText'],
 125             }
 126         elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content.get('command',{}).keys():
 127             ctype = "WEBSITE"
 128             content = {
 129                 'url': parse_qs(content['command']['urlEndpoint']['url'].split('?')[1])['q'][0],
 130                 'title': content['title']['simpleText'],
 131                 'text': content['actionButton']['simpleCardButtonRenderer']['text']['simpleText'],
 132             }
 133         else:
 134             import pprint
 135             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 136
 137         return {'teaser': teaser, 'type': ctype, 'content': content}
 138
 139     def parse_endcard(card):
 140         card = card['endscreenElementRenderer'] if 'endscreenElementRenderer' in card.keys() else card
 141         ctype = card['style']
 142         if ctype == "CHANNEL":
 143             content = {
 144                 'channel_id': card['endpoint']['browseEndpoint']['browseId'],
 145                 'title': card['title']['simpleText'],
 146                 'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
 147             }
 148         elif ctype == "VIDEO":
 149             content = {
 150                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 151                 'title': card['title']['simpleText'],
 152                 'length': card['videoDuration']['simpleText'], # '12:21'
 153                 'views': card['metadata']['simpleText'],  # '51,649 views'
 154             }
 155         elif ctype == "PLAYLIST":
 156             content = {
 157                 'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
 158                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 159                 'title': card['title']['simpleText'],
 160                 'author': card['metadata']['simpleText'],
 161                 'n_videos': card['playlistLength']['simpleText'],
 162             }
 163         elif ctype == "WEBSITE":
 164             content = {
 165                 'url': parse_qs(card['endpoint']['urlEndpoint']['url'].split('?')[1])['q'][0],
 166                 'title': card['title']['simpleText'],
 167                 'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
 168             }
 169         else:
 170             import pprint
 171             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 172
 173         return {'type': ctype, 'content': content}
 174
 175     return {
 176         'title': meta1['title'],
 177         'author': meta1['author'],
 178         'channel_id': meta1['channelId'],
 179         'description': meta1['shortDescription'],
 180         'published': meta2['publishDate'],
 181         'views': meta1['viewCount'],
 182         'length': int(meta1['lengthSeconds']),
 183         'rating': meta1['averageRating'],
 184         'category': meta2['category'],
 185         'aspectr': aspect_ratio,
 186         'unlisted': meta2['isUnlisted'],
 187         'countries': meta2['availableCountries'],
 188         'infocards': [parse_infocard(card) for card in cards],
 189         'endcards': [parse_endcard(card) for card in endsc],
 190         'subtitles': subtitles,
 191     }
 192
 193 def get_video_info(video_id):
 194     """
 195     returns the best-quality muxed video stream, the player_response, error-type/-mesage
 196     error types: 'initial':  the request to get_video_info was malformed
 197                  'player':   playabilityStatus != OK
 198                  'internal': [livestream, geolocked, exhausted]
 199     """
 200     # TODO: caching, e.g. beaker? need to not cache premiering-soon videos/livestreams/etc, though
 201     #        responses are apparently valid for 6h; maybe cache for (video_length - 2h)
 202     # TODO: errro types? ["invalid parameters", playabilitystatus, own]
 203     # todo: a bit messy; should return all unscrambled video urls in best->worst quality
 204
 205     # we try to fetch the video multiple times using different origins
 206     (sts, algo) = get_cipher()
 207     for el in ['embedded', 'detailpage']: # ['el-completely-absent',info,leanback,editpage,adunit,previewpage,profilepage]
 208         r = requests.get(f"https://www.youtube.com/get_video_info"+
 209             f"?video_id={video_id}"+
 210             f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
 211             f"&el={el}"+
 212             f"&sts={sts}"+
 213             f"&hl=en_US") #"&hl=en&gl=US"
 214         params = parse_qs(r.text)
 215         if 'errorcode' in params: # status=fail
 216             return None, None, 'initial', f"MALFORMED: {params['reason'][0]}"
 217
 218         metadata = json.loads(params.get('player_response')[0])
 219         if metadata['playabilityStatus']['status'] != "OK":
 220             if metadata['playabilityStatus']['status'] == "UNPLAYABLE":
 221                 continue  # try again with different 'el' value. if none succeeds, we fall into "exhausted" path, which returns last tried metadata, from which the playabilityStatus.reason can be extracted. according to jwz/youtubedown, the worst error message comes from embedded, which is tried first, so it should be overwritten by a better message.
 222             return None, None, 'player', f"{metadata['playabilityStatus']['status']}: {metadata['playabilityStatus']['reason']}"
 223         if 'liveStreamability' in metadata['playabilityStatus']:
 224             return None, metadata, 'internal', "livestream" # can also check .microformat.liveBroadcastDetails.isLiveNow
 225
 226         formats = metadata['streamingData']['formats']
 227         for (i,v) in enumerate(formats):
 228             if not ('cipher' in v or 'signatureCipher' in v): continue
 229             cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
 230             formats[i]['url'] = unscramble(cipher)
 231
 232         # todo: check if we have urls or try again
 233         url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']
 234
 235         if 'gcr' in parse_qs(url):
 236             return None, metadata, 'internal', "geolocked"
 237
 238         return url, metadata, None, None
 239     else:
 240         return None, metadata, 'internal', "exhausted"
 241
 242 def unscramble(cipher):  # test video id: UxxajLWwzqY
 243     signature = list(cipher['s'][0])
 244     (sts, algo) = get_cipher()
 245     for c in algo.split():
 246         op, ix = re.match(r"([rsw])(\d+)?", c).groups()
 247         if not op: continue
 248         if op == 'r': signature = list(reversed(signature))
 249         if op == 's': signature = signature[int(ix):]
 250         if op == 'w': signature[0], signature[int(ix)%len(signature)] = signature[int(ix)%len(signature)], signature[0]
 251     sp = cipher.get('sp', ['signature'])[0]
 252     sig = cipher['sig'][0] if 'sig' in cipher else ''.join(signature)
 253     return f"{cipher['url'][0]}&{sp}={sig}"
 254
 255 @app.route('/channel/<channel_id>')
 256 def channel(channel_id):
 257     if not re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id):
 258         return "bad channel id", 400 # todo
 259
 260     xmlfeed = fetch_xml("channel_id", channel_id)
 261     if not xmlfeed:
 262         return "not found or something", 404 # XXX
 263     (title, author, _, videos) = parse_xml(xmlfeed)
 264     return render_template('xmlfeed.html.j2', title=author, rows=videos)
 265
 266 @app.route('/playlist')
 267 def playlist():
 268     playlist_id = request.args.get('list')
 269     if not playlist_id:
 270         return "bad list id", 400 # todo
 271
 272     xmlfeed = fetch_xml("playlist_id", playlist_id)
 273     if not xmlfeed:
 274         return "not found or something", 404 # XXX
 275     (title, author, _, videos) = parse_xml(xmlfeed)
 276     return render_template('xmlfeed.html.j2', title=f"{title} by {author}", rows=videos)
 277
 278 @app.route('/subscription_manager')
 279 def subscription_manager():
 280     token = request.args.get('token', 'guest')
 281     with sqlite3.connect(cf['global']['database']) as conn:
 282         #with conn.cursor() as c:
 283             c = conn.cursor()
 284             c.execute("""
 285                   SELECT subscriptions.channel_id, name,
 286                          (subscribed_until < datetime('now')) AS obsolete
 287                     FROM subscriptions
 288                     left JOIN channels ON channels.id = subscriptions.channel_id
 289                     left JOIN websub ON channels.id = websub.channel_id
 290                    WHERE user = ?
 291                 ORDER BY obsolete=0, name COLLATE NOCASE ASC""", (token,))
 292             rows = [{
 293                 'channel_id': channel_id,
 294                 'author': author or channel_id,
 295                 'subscribed_until': subscribed_until
 296             } for (channel_id, author, subscribed_until) in c.fetchall()]
 297     return render_template('subscription_manager.html.j2', rows=rows)
 298
 299 @app.route('/feed/subscriptions', methods=['POST'])
 300 def feed_post():
 301     token = request.args.get('token', 'guest')
 302     if token == 'guest': return "guest user is read-only", 403
 303     action = next(request.form.keys(), None)
 304     if action in ['pin', 'unpin', 'hide']:
 305         video_id = request.form.get(action)
 306         display = {
 307             'pin': 'pinned',
 308             'unpin': None,
 309             'hide': 'hidden',
 310         }[action]
 311         with sqlite3.connect(cf['global']['database']) as conn:
 312             #with conn.cursor() as c:
 313                 c = conn.cursor()
 314                 c.execute("""
 315                         INSERT OR REPLACE INTO flags (user, video_id, display)
 316                         VALUES (?, ?, ?)
 317                 """, (token, video_id, display))
 318     else:
 319         flash(("error","unsupported action"))
 320     return redirect(request.url, code=303)
 321
 322 @app.route('/subscription_manager', methods=['POST'])
 323 def manage_subscriptions():
 324     token = request.args.get('token', 'guest')
 325     if token == 'guest': return "guest user is read-only", 403
 326     if 'subscribe' in request.form:
 327         channel_id = request.form.get("subscribe")
 328         match = re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id)
 329         if match:
 330             channel_id = match.group(1)
 331         else:
 332             match = re.match(r"((?:PL|LL|EC|UU|FL|UL|OL)[A-Za-z0-9_-]{10,})", channel_id)
 333             if match:  # NOTE: PL-playlists are 32chars, others differ in length.
 334                 flash(("error","playlists not (yet?) supported."))
 335                 return redirect(request.url, code=303) # TODO: dedup redirection
 336             else:
 337                 flash(("error","not a valid/subscribable URI"))
 338                 return redirect(request.url, code=303) # TODO: dedup redirection
 339         with sqlite3.connect(cf['global']['database']) as conn:
 340             #with conn.cursor() as c:
 341                 c = conn.cursor()
 342                 c.execute("""
 343                         INSERT OR IGNORE INTO subscriptions (user, channel_id)
 344                         VALUES (?, ?)
 345                 """, (token, channel_id))
 346                 # TODO: sql-error-handling, asynchronically calling update-subs.pl
 347
 348     elif 'unsubscribe' in request.form:
 349         with sqlite3.connect(cf['global']['database']) as conn:
 350             #with conn.cursor() as c:
 351                 c = conn.cursor()
 352                 c.execute("""
 353                         DELETE FROM subscriptions
 354                         WHERE user = ? AND channel_id = ?
 355                 """, (token, channel_id))
 356                 # TODO: sql-error-handling, report success
 357
 358     else:
 359         flash(("error","unsupported action"))
 360
 361     return redirect(request.url, code=303)
 362
@app.route('/r/')
def reddit_index():
    """Placeholder for a subreddit overview; currently answers with an empty body."""
    return ""
 366 @app.route('/r/<subreddit>')
 367 def reddit(subreddit="videos"):
 368     count = int(request.args.get('count', 0))
 369     before = request.args.get('before')
 370     after = request.args.get('after')
 371     query = '&'.join([f"{k}={v}" for k,v in [('count',count), ('before',before), ('after',after)] if v])
 372     r = requests.get(f"https://old.reddit.com/r/{subreddit}.json?{query}", headers={'User-Agent':'Mozilla/5.0'})
 373     if not r.ok or not 'data' in r.json():
 374         return r.text+"error retrieving reddit data", 502
 375
 376     good = [e for e in r.json()['data']['children'] if e['data']['score'] > 1]
 377     bad  = [e for e in r.json()['data']['children'] if e['data']['score'] <=1]
 378     videos = []
 379     for entry in (good+bad):
 380         e = entry['data']
 381         if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
 382             continue
 383         video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/)([-_0-9A-Za-z]+)', e['url']).group(1)
 384         if not video_id: continue
 385         videos.append({
 386             'video_id': video_id,
 387             'title': e['title'],
 388             'url': e['permalink'],
 389             'n_comments': e['num_comments'],
 390             'n_karma': e['score'],
 391         })
 392     before = r.json()['data']['before']
 393     after = r.json()['data']['after']
 394     return render_template('reddit.html.j2', subreddit=subreddit, rows=videos, before=before, after=after, count=count)
 395
 396 def get_cipher():
 397     # reload cipher from database every 1 hour
 398     if 'cipher' not in g or time.time() - g.get('cipher_updated', 0) > 1 * 60 * 60:
 399         with sqlite3.connect(cf['global']['database']) as conn:
 400             c = conn.cursor()
 401             c.execute("SELECT sts, algorithm FROM cipher")
 402             g.cipher = c.fetchone()
 403             g.cipher_updated = time.time()
 404
 405     return g.cipher
 406
 407 #@app.teardown_appcontext
 408 #def teardown_db():
 409 #    db = g.pop('db', None)
 410 #
 411 #    if db is not None:
 412 #        db.close()
 413
 414 @app.template_filter('format_date')
 415 def format_date(s):
 416     (y,m,d) = (int(n) for n in s.split('T')[0].split(' ')[0].split('-'))  # iso-dates can seperate date from time with space or 'T'
 417     M = '_ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
 418     return f"{d} {M[m]}"
 419
 420 def pp(*args):
 421     from pprint import pprint
 422     import sys, codecs
 423     pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))
 424
if __name__ == '__main__':
    # Only reached when this file is executed directly (not when imported):
    # start Flask's built-in server with debug mode enabled.
    app.run(debug=True)