import re
import json
import time
import hmac
import hashlib
import sqlite3
import secrets
import requests
import requests_cache
from urllib.parse import parse_qs
from flask import Flask, render_template, request, redirect, flash, url_for, jsonify, g

from common import *

app = Flask(__name__)
app.secret_key = secrets.token_bytes(16)  # XXX: generate and hard-code, or cookies and csrf-validation will fail!

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m,
# but this makes reddit very stale and premiere videos won't start.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)


@app.route('/')
def index():
    return redirect(url_for('feed'), code=302)


@app.route('/feed/subscriptions')
def feed():
    token = request.args.get('token', 'guest')
    page = int(request.args.get('page', 0))
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT videos.id, channel_id, name, title, published, flags.display
            FROM videos
            JOIN channels ON videos.channel_id = channels.id
            LEFT JOIN flags ON (videos.id = flags.video_id) AND (flags.user = ?)
            WHERE channel_id IN (SELECT channel_id FROM subscriptions WHERE user = ?)
              AND flags.display IS NOT 'hidden'
            ORDER BY (display = 'pinned') DESC, crawled DESC
            LIMIT 36 OFFSET 36*?""", (token, token, page))
        rows = [{
            'video_id': video_id,
            'channel_id': channel_id,
            'author': author,
            'title': title,
            'published': published,
            'pinned': display == 'pinned',
        } for (video_id, channel_id, author, title, published, display) in c.fetchall()]
    return render_template('index.html.j2', rows=rows, page=page)
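# Usage note (derived from the query above): the feed is paginated 36 rows per page
# via the 'page' query parameter (e.g. /feed/subscriptions?token=guest&page=1 returns
# rows 37..72), with pinned videos sorted first and the rest ordered by crawl time.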

@app.route('/watch')
def watch():
    if 'v' not in request.args:
        return "missing video id", 400
    video_id = request.args.get('v')

    (video_url, metadata, error_type, error) = get_video_info(video_id)
    if error_type in ['initial', 'player']:
        return error, 400, {'content-type': 'text/plain', "Link": "; rel=stylesheet;"}

    show = request.args.get("show")
    if show == "metadata":
        # todo: handle the case when we have an 'exhausted' error with no metadata returned
        return render_template('watch.html.j2', video_id=video_id, video_url=video_url, **prepare_metadata(metadata))
    elif show == "json":
        return jsonify(metadata)
    else:
        if error:
            extra = {'geolocked': 'local=1', 'livestream': 'raw=0'}.get(error, '')
            # if error == 'exhausted', metadata.playabilityStatus.reason may contain additional information.
            return f"{error.upper()}: Redirecting to Invidious.", 502, {
                'Refresh': '2; URL=https://invidio.us/watch?v='+video_id+'&'+extra+'&raw=1',
                'content-type': 'text/plain',
                "Link": "; rel=stylesheet;",
            }
        return redirect(video_url, code=307)
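# Usage note (derived from the handler above): /watch?v=VIDEO_ID answers with a 307
# redirect to the best muxed googlevideo stream; adding &show=metadata renders the watch
# page instead, and &show=json dumps the raw player_response. On geolocked/livestream/
# exhausted errors it bounces to invidio.us via a Refresh header.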

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] if 'endscreen' in metadata else []
    #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
    aspect_ratio = meta2['thumbnail']['thumbnails'][0]['width'] / meta2['thumbnail']['thumbnails'][0]['height']
    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata['captions']['playerCaptionsTracklistRenderer']['captionTracks']
    ], key=lambda cc: cc['autogenerated']) if 'captionTracks' in metadata['captions']['playerCaptionsTracklistRenderer'] else []

    def parse_infocard(card):
        card = card['cardRenderer']
        teaser = card['teaser']['simpleCardTeaserRenderer']['message']['simpleText']  # not used
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes']) for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': content['channelName']['simpleText'],  # 'by xXxXx'
                'length': content['lengthString']['simpleText'],  # '23:03'
                'views': content['viewCountText']['simpleText'],  # '421,248 views'
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],  # XXX: untested
                'title': content['playlistTitle']['simpleText'],
                'author': content['channelName']['simpleText'],
                'n_videos': content['videoCountText']['simpleText'],
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content.get('command', {}).keys():
            ctype = "WEBSITE"
            content = {
                'url': parse_qs(content['command']['urlEndpoint']['url'].split('?')[1])['q'][0],
                'title': content['title']['simpleText'],
                'text': content['actionButton']['simpleCardButtonRenderer']['text']['simpleText'],
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; {pprint.pformat(card)}"}
        return {'teaser': teaser, 'type': ctype, 'content': content}

    def parse_endcard(card):
        card = card['endscreenElementRenderer'] if 'endscreenElementRenderer' in card.keys() else card
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'],  # '12:21'
                'views': card['metadata']['simpleText'],  # '51,649 views'
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': card['metadata']['simpleText'],
                'n_videos': card['playlistLength']['simpleText'],
            }
        elif ctype == "WEBSITE":
            content = {
                'url': parse_qs(card['endpoint']['urlEndpoint']['url'].split('?')[1])['q'][0],
                'title': card['title']['simpleText'],
                'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; {pprint.pformat(card)}"}
        return {'type': ctype, 'content': content}

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': meta2['availableCountries'],
        'infocards': [parse_infocard(card) for card in cards],
        'endcards': [parse_endcard(card) for card in endsc],
        'subtitles': subtitles,
    }


def get_video_info(video_id):
    """
    returns the best-quality muxed video stream, the player_response, error-type/-message
    error types:
        'initial': the request to get_video_info was malformed
        'player': playabilityStatus != OK
        'internal': [livestream, geolocked, exhausted]
    """
    # TODO: caching, e.g. beaker? need to not cache premiering-soon videos/livestreams/etc, though
    #       responses are apparently valid for 6h; maybe cache for (video_length - 2h)
    # TODO: error types? ["invalid parameters", playabilitystatus, own]
    # todo: a bit messy; should return all unscrambled video urls in best->worst quality

    # we try to fetch the video multiple times using different origins
    (sts, algo) = get_cipher()
    for el in ['embedded', 'detailpage']:  # ['el-completely-absent',info,leanback,editpage,adunit,previewpage,profilepage]
        r = requests.get(f"https://www.youtube.com/get_video_info"+
                         f"?video_id={video_id}"+
                         f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
                         f"&el={el}"+
                         f"&sts={sts}"+
                         f"&hl=en_US")  #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params:  # status=fail
            return None, None, 'initial', f"MALFORMED: {params['reason'][0]}"
        metadata = json.loads(params.get('player_response')[0])

        if metadata['playabilityStatus']['status'] != "OK":
            if metadata['playabilityStatus']['status'] == "UNPLAYABLE":
                # try again with a different 'el' value. if none succeeds, we fall into the
                # "exhausted" path, which returns the last tried metadata, from which
                # playabilityStatus.reason can be extracted. according to jwz/youtubedown,
                # the worst error message comes from 'embedded', which is tried first, so it
                # should be overwritten by a better message.
                continue
            return None, None, 'player', f"{metadata['playabilityStatus']['status']}: {metadata['playabilityStatus']['reason']}"
        if 'liveStreamability' in metadata['playabilityStatus']:
            return None, metadata, 'internal', "livestream"  # can also check .microformat.liveBroadcastDetails.isLiveNow

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'internal', "geolocked"

        return url, metadata, None, None
    else:
        return None, metadata, 'internal', "exhausted"


def unscramble(cipher):  # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    (sts, algo) = get_cipher()
    # the algorithm is a space-separated list of ops, e.g. (made-up example) "r s2 w3":
    #   r  -> reverse the signature
    #   sN -> slice off the first N characters
    #   wN -> swap the first character with the one at position N (mod length)
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[int(ix):]
        if op == 'w': signature[0], signature[int(ix) % len(signature)] = signature[int(ix) % len(signature)], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher['sig'][0] if 'sig' in cipher else ''.join(signature)
    return f"{cipher['url'][0]}&{sp}={sig}"


@app.route('/channel/<channel_id>')
def channel(channel_id):
    if not re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id):
        return "bad channel id", 400  # todo
    xmlfeed = fetch_xml("channel_id", channel_id)
    if not xmlfeed:
        return "not found or something", 404  # XXX
    (title, author, _, videos) = parse_xml(xmlfeed)
    return render_template('xmlfeed.html.j2', title=author, rows=videos)


@app.route('/playlist')
def playlist():
    playlist_id = request.args.get('list')
    if not playlist_id:
        return "bad list id", 400  # todo
    xmlfeed = fetch_xml("playlist_id", playlist_id)
    if not xmlfeed:
        return "not found or something", 404  # XXX
    (title, author, _, videos) = parse_xml(xmlfeed)
    return render_template('xmlfeed.html.j2', title=f"{title} by {author}", rows=videos)


@app.route('/subscription_manager')
def subscription_manager():
    token = request.args.get('token', 'guest')
    with sqlite3.connect(cf['global']['database']) as conn:
        #with conn.cursor() as c:
        c = conn.cursor()
        c.execute("""
            SELECT subscriptions.channel_id, name,
                   (subscribed_until < datetime('now')) AS obsolete
            FROM subscriptions
            LEFT JOIN channels ON channels.id = subscriptions.channel_id
            LEFT JOIN websub ON channels.id = websub.channel_id
            WHERE user = ?
            ORDER BY obsolete=0, name COLLATE NOCASE ASC""", (token,))
        rows = [{
            'channel_id': channel_id,
            'author': author or channel_id,
            'subscribed_until': subscribed_until,
        } for (channel_id, author, subscribed_until) in c.fetchall()]
    return render_template('subscription_manager.html.j2', rows=rows)


@app.route('/feed/subscriptions', methods=['POST'])
def feed_post():
    token = request.args.get('token', 'guest')
    if token == 'guest':
        return "guest user is read-only", 403
    action = next(iter(request.form.keys()), None)
    if action in ['pin', 'unpin', 'hide']:
        video_id = request.form.get(action)
        display = {
            'pin': 'pinned',
            'unpin': None,
            'hide': 'hidden',
        }[action]
        with sqlite3.connect(cf['global']['database']) as conn:
            #with conn.cursor() as c:
            c = conn.cursor()
            c.execute("""
                INSERT OR REPLACE INTO flags (user, video_id, display)
                VALUES (?, ?, ?)
            """, (token, video_id, display))
    else:
        flash(("error", "unsupported action"))

    return redirect(request.url, code=303)


@app.route('/subscription_manager', methods=['POST'])
def manage_subscriptions():
    token = request.args.get('token', 'guest')
    if token == 'guest':
        return "guest user is read-only", 403

    if 'subscribe' in request.form:
        channel_id = request.form.get("subscribe")
        match = re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id)
        if match:
            channel_id = match.group(1)
        else:
            match = re.match(r"((?:PL|LL|EC|UU|FL|UL|OL)[A-Za-z0-9_-]{10,})", channel_id)
            if match:  # NOTE: PL-playlists are 32chars, others differ in length.
                flash(("error", "playlists not (yet?) supported."))
                return redirect(request.url, code=303)  # TODO: dedup redirection
            else:
                flash(("error", "not a valid/subscribable URI"))
                return redirect(request.url, code=303)  # TODO: dedup redirection
        with sqlite3.connect(cf['global']['database']) as conn:
            #with conn.cursor() as c:
            c = conn.cursor()
            c.execute("""
                INSERT OR IGNORE INTO subscriptions (user, channel_id)
                VALUES (?, ?)
            """, (token, channel_id))
            # TODO: sql-error-handling, asynchronously calling update-subs.pl
    elif 'unsubscribe' in request.form:
        channel_id = request.form.get("unsubscribe")
        with sqlite3.connect(cf['global']['database']) as conn:
            #with conn.cursor() as c:
            c = conn.cursor()
            c.execute("""
                DELETE FROM subscriptions
                WHERE user = ? AND channel_id = ?
            """, (token, channel_id))
            # TODO: sql-error-handling, report success
    else:
        flash(("error", "unsupported action"))

    return redirect(request.url, code=303)


@app.route('/r/')
def reddit_index():
    return ""


@app.route('/r/<subreddit>')
def reddit(subreddit="videos"):
    count = int(request.args.get('count', 0))
    before = request.args.get('before')
    after = request.args.get('after')
    query = '&'.join([f"{k}={v}" for k, v in [('count', count), ('before', before), ('after', after)] if v])
    r = requests.get(f"https://old.reddit.com/r/{subreddit}.json?{query}", headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        return r.text+"error retrieving reddit data", 502

    good = [e for e in r.json()['data']['children'] if e['data']['score'] > 1]
    bad = [e for e in r.json()['data']['children'] if e['data']['score'] <= 1]
    videos = []
    for entry in (good+bad):
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        match = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&)?v=|youtu\.be/|youtube\.com/embed/)([-_0-9A-Za-z]+)', e['url'])
        if not match:
            continue
        video_id = match.group(1)
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
        })

    before = r.json()['data']['before']
    after = r.json()['data']['after']
    return render_template('reddit.html.j2', subreddit=subreddit, rows=videos, before=before, after=after, count=count)


def get_cipher():
    # reload cipher from database every 1 hour
    if 'cipher' not in g or time.time() - g.get('cipher_updated', 0) > 1 * 60 * 60:
        with sqlite3.connect(cf['global']['database']) as conn:
            c = conn.cursor()
            c.execute("SELECT sts, algorithm FROM cipher")
            g.cipher = c.fetchone()
            g.cipher_updated = time.time()
    return g.cipher


#@app.teardown_appcontext
#def teardown_db():
#    db = g.pop('db', None)
#
#    if db is not None:
#        db.close()


# Magic CSRF protection: This modifies outgoing HTML responses and injects a csrf token into all forms.
# All post requests are then checked if they contain the valid token.
# TODO:
# - don't use regex for injecting
# - inject an http header into all responses (that could be used by apis)
# - allow csrf token to be passed in http header, json, ...
# - a decorator on routes to opt out of verification or output munging
@app.after_request
def add_csrf_protection(response):
    if response.mimetype == "text/html":
        token = hmac.new(app.secret_key, request.remote_addr.encode('ascii'), hashlib.sha256).hexdigest()  # TODO: will fail behind reverse proxy (remote_addr always localhost)
        response.set_data(re.sub(
            rb'(<form\b[^>]*>)',  # match form tags with any number of attributes and any type of quotes
            rb'\1<input type="hidden" name="csrf" value="' + token.encode('ascii') + rb'"/>',  # hackily append a hidden input with our csrf protection value
            response.get_data()))
    return response


@app.before_request
def verify_csrf_protection():
    token = hmac.new(app.secret_key, request.remote_addr.encode('ascii'), hashlib.sha256).hexdigest()  # TODO: will fail behind reverse proxy (remote_addr always localhost)
    if request.method == "POST" and request.form.get('csrf') != token:
        return "CSRF validation failed!", 400


@app.template_filter('format_date')
def format_date(s):
    (y, m, d) = (int(n) for n in s.split('T')[0].split(' ')[0].split('-'))  # iso-dates can separate date from time with space or 'T'
    M = '_ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
    return f"{d} {M[m]}"


def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))


if __name__ == '__main__':
    app.run(debug=True)