]> git.gir.st - subscriptionfeed.git/blob - app/frontend.py
switch to html watch page by default, fix csrf bugs
[subscriptionfeed.git] / app / frontend.py
1 import re
2 import json
3 import time
4 import hmac
5 import hashlib
6 import sqlite3
7 import secrets
8 import requests
9 import requests_cache
10 from urllib.parse import parse_qs
11 from flask import Flask, render_template, request, redirect, flash, url_for, jsonify, g
12
13 from common import *
14
# The Flask application; the routes and hooks in this file register on it.
app = Flask(__name__)
# XXX: generate and hard-code, or cookies and csrf-validation will fail!
app.secret_key = secrets.token_bytes(16)

# Cache outgoing HTTP requests in memory; only responses with status 200 are
# stored.
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
requests_cache.install_cache(
    backend='memory',
    expire_after=10 * 60,
    allowable_codes=(200,),
)
19
# Note: this should only be required for the 'memory' backed cache.
from threading import Timer


def purge_cache(sec):
    """Remove expired cache entries and re-arm a timer to do so again.

    Every call drops the expired responses from the requests_cache cache and
    then starts a Timer that calls this function again after `sec` seconds,
    so a single initial call keeps the purge running periodically.
    """
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    # Thread.setDaemon() is deprecated (since Python 3.10); assigning the
    # attribute is the supported spelling. A daemon timer thread does not
    # keep the interpreter alive on shutdown.
    t.daemon = True
    t.start()


purge_cache(10*60)
28
@app.route('/')
def index():
    """Redirect the site root to the subscription feed (HTTP 302)."""
    feed_url = url_for('feed')
    return redirect(feed_url, code=302)
32
@app.route('/feed/subscriptions')
def feed():
    """Render one page (36 videos) of a user's subscription feed.

    Query parameters:
      token -- user whose subscriptions and flags are used (default 'guest')
      page  -- zero-based page number (default 0)

    Pinned videos are listed first, followed by the rest ordered by crawl
    time (newest first). Videos the user flagged as 'hidden' are left out.
    """
    token = request.args.get('token', 'guest')
    # type=int falls back to the default when ?page= is not a number; the
    # previous int(...) raised ValueError, which surfaced as a 500 error.
    page = request.args.get('page', 0, type=int)
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT videos.id, channel_id, name, title, published, flags.display
            FROM videos
            JOIN channels ON videos.channel_id = channels.id
            LEFT JOIN flags ON (videos.id = flags.video_id) AND (flags.user = ?)
            WHERE channel_id IN
            (SELECT channel_id FROM subscriptions WHERE user = ?)
            AND flags.display IS NOT 'hidden'
            ORDER BY (display = 'pinned') DESC, crawled DESC
            LIMIT 36
            OFFSET 36*?""", (token, token, page))
        rows = [{
            'video_id': video_id,
            'channel_id': channel_id,
            'author': author,
            'title': title,
            'published': published,
            'pinned': display == 'pinned',
        } for (video_id, channel_id, author, title, published, display) in c.fetchall()]
    return render_template('index.html.j2', rows=rows, page=page)
59
@app.route('/watch')
def watch():
    """Serve a video, selected with ?v=<video id>, in one of three forms.

    ?show=raw   redirects (307) to the video stream url; when an internal
                error occurred, answers 502 with a Refresh header pointing
                to Invidious instead.
    ?show=json  returns the player metadata as JSON.
    otherwise   renders the html watch page.

    A missing ?v= and 'initial'/'player' errors are answered with status 400.
    """
    if 'v' not in request.args:
        return "missing video id", 400

    # headers for plain-text error responses (includes a dark stylesheet)
    plaintextheader = {'content-type': 'text/plain',"Link": "<data:text/css,body%7Bcolor:%23eee;background:%23333%7D>; rel=stylesheet;"}

    video_id = request.args.get('v')
    (video_url, metadata, error_type, error) = get_video_info(video_id)
    if error_type in ['initial', 'player']:
        return error, 400, plaintextheader

    show = request.args.get("show")
    if show == "json":
        return jsonify(metadata)

    if show != "raw":
        # todo: handle geolocked, livesteam and the case when we have an exhausted error with no metadata returned
        return render_template('watch.html.j2', video_id=video_id, video_url=video_url, **prepare_metadata(metadata))

    if not error:
        return redirect(video_url, code=307)

    # extra query parameter handed to Invidious, depending on the error
    extra = {'geolocked':'local=1', 'livestream':'raw=0'}.get(error,'')
    # if error==exhausted, metadata.playabilityStatus.reason may contain additional information.
    refresh = {'Refresh': f'2; URL=https://invidio.us/watch?v={video_id}&{extra}&raw=1', **plaintextheader}
    return f"{error.upper()}: Redirecting to Invidious.", 502, refresh
83
def prepare_metadata(metadata):
    """Flatten a player response into keyword arguments for the watch page.

    `metadata` is the parsed player response (a dict). The 'videoDetails' and
    'microformat' entries are required; 'cards', 'endscreen' and 'captions'
    are optional and result in empty lists when absent.

    Returns a dict with the keys title, author, channel_id, description,
    published, views, length, rating, category, aspectr, unlisted, countries,
    poster, infocards, endcards and subtitles.

    Raises KeyError if a required field is missing from `metadata`.
    """
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] if 'endscreen' in metadata else []

    #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
    first_thumbnail = meta2['thumbnail']['thumbnails'][0]
    aspect_ratio = first_thumbnail['width'] / first_thumbnail['height']

    # Any level of the captions structure may be missing; the former direct
    # indexing raised KeyError when there was no 'captions' entry at all.
    caption_tracks = (
        metadata.get('captions', {})
        .get('playerCaptionsTracklistRenderer', {})
        .get('captionTracks', [])
    )
    # Stable sort on a boolean: regular tracks first, auto-generated ones last.
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in caption_tracks
    ], key=lambda cc: cc['autogenerated'])

    def parse_infocard(card):
        """Convert one info card into {'teaser', 'type', 'content'}."""
        card = card['cardRenderer']
        teaser = card['teaser']['simpleCardTeaserRenderer']['message']['simpleText'] # not used
        # the single key of 'content' names the renderer, i.e. the card type
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': content['channelName']['simpleText'], # 'by xXxXx'
                'length': content['lengthString']['simpleText'], # '23:03'
                'views': content['viewCountText']['simpleText'], # '421,248 views'
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': content['channelName']['simpleText'],
                'n_videos': content['playlistVideoCount']['simpleText'], # '21'
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content.get('command', {}):
            ctype = "WEBSITE"
            content = {
                # the real target is the 'q' parameter of the endpoint url
                'url': parse_qs(content['command']['urlEndpoint']['url'].split('?')[1])['q'][0],
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                'text': content['actionButton']['simpleCardButtonRenderer']['text']['simpleText'],
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'teaser': teaser, 'type': ctype, 'content': content}

    def parse_endcard(card):
        """Convert one end screen element into {'type', 'content'}."""
        # elements may or may not be wrapped in an 'endscreenElementRenderer'
        card = card.get('endscreenElementRenderer', card)
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': card['metadata']['simpleText'], # '51,649 views'
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': card['metadata']['simpleText'],
                'n_videos': card['playlistLength']['simpleText'].replace(" videos", ""),
            }
        elif ctype == "WEBSITE":
            content = {
                # the real target is the 'q' parameter of the endpoint url
                'url': parse_qs(card['endpoint']['urlEndpoint']['url'].split('?')[1])['q'][0],
                'domain': card['metadata']['simpleText'],
                'title': card['title']['simpleText'],
                'icons': {e['height']: e['url'] for e in card['image']['thumbnails']},
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': meta2['availableCountries'],
        'poster': first_thumbnail['url'],
        'infocards': [parse_infocard(card) for card in cards],
        'endcards': [parse_endcard(card) for card in endsc],
        'subtitles': subtitles,
    }
199
def get_video_info(video_id):
    """
    Fetch the player response of a video and pick its best-quality muxed stream.

    Returns a 4-tuple (video_url, metadata, error_type, error_message);
    error_type and error_message are None on success, video_url is None on
    any error.
    error types: 'initial':  the request to get_video_info was malformed
                 'player':   playabilityStatus is neither OK nor UNPLAYABLE
                 'internal': message is one of livestream, geolocked, exhausted
                             (metadata is returned alongside these)
    """
    # TODO: caching, e.g. beaker? need to not cache premiering-soon videos/livestreams/etc, though
    #       responses are apparently valid for 6h; maybe cache for (video_length - 2h)
    # TODO: error types? ["invalid parameters", playabilitystatus, own]
    # todo: a bit messy; should return all unscrambled video urls in best->worst quality

    (sts, _algo) = get_cipher()
    # we try to fetch the video multiple times using different origins
    for el in ['embedded', 'detailpage']: # ['el-completely-absent',info,leanback,editpage,adunit,previewpage,profilepage]
        info_url = (
            "https://www.youtube.com/get_video_info"
            f"?video_id={video_id}"
            f"&eurl=https://youtube.googleapis.com/v/{video_id}"
            f"&el={el}"
            f"&sts={sts}"
            "&hl=en_US" #"&hl=en&gl=US"
        )
        r = requests.get(info_url)
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'initial', f"MALFORMED: {params['reason'][0]}"

        metadata = json.loads(params.get('player_response')[0])
        playability = metadata['playabilityStatus']
        if playability['status'] == "UNPLAYABLE":
            # try again with different 'el' value. if none succeeds, we fall
            # into "exhausted" path, which returns last tried metadata, from
            # which the playabilityStatus.reason can be extracted. according
            # to jwz/youtubedown, the worst error message comes from embedded,
            # which is tried first, so it should be overwritten by a better
            # message.
            continue
        if playability['status'] != "OK":
            return None, None, 'player', f"{playability['status']}: {playability['reason']}"
        if 'liveStreamability' in playability:
            return None, metadata, 'internal', "livestream" # can also check .microformat.liveBroadcastDetails.isLiveNow

        formats = metadata['streamingData']['formats']
        for fmt in formats:
            scrambled = 'cipher' in fmt or 'signatureCipher' in fmt
            if not scrambled:
                continue
            cipher = parse_qs(fmt.get('cipher') or fmt.get('signatureCipher'))
            fmt['url'] = unscramble(cipher)

        # todo: check if we have urls or try again
        by_height = sorted(formats, key=lambda k: k['height'], reverse=True)
        url = by_height[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'internal', "geolocked"

        return url, metadata, None, None

    # only reached when every 'el' value yielded an UNPLAYABLE response
    return None, metadata, 'internal', "exhausted"
248
def unscramble(cipher): # test video id: UxxajLWwzqY
    """Build a stream url, including its signature, from a parsed cipher.

    `cipher` is a parse_qs() result, i.e. a dict mapping keys to lists of
    strings. Keys read:
      'url' -- base stream url (required)
      'sp'  -- name of the signature parameter (default 'signature')
      'sig' -- ready-to-use signature; used verbatim when present
      's'   -- scrambled signature; required when 'sig' is absent

    The descrambling algorithm comes from get_cipher(): a whitespace
    separated list of operations, 'r' (reverse), 's<n>' (drop the first n
    characters) and 'w<n>' (swap the first character with the one at index
    n modulo the length).

    Returns the url with '&<sp>=<signature>' appended.
    """
    sp = cipher.get('sp', ['signature'])[0]
    if 'sig' in cipher:
        # Nothing to descramble. Formerly cipher['s'] was read regardless,
        # raising KeyError for ciphers that only carry 'sig'.
        sig = cipher['sig'][0]
    else:
        signature = list(cipher['s'][0])
        (sts, algo) = get_cipher()
        for c in algo.split():
            step = re.match(r"([rsw])(\d+)?", c)
            if not step:
                # unknown operation: skip it (calling .groups() on the failed
                # match used to raise AttributeError)
                continue
            op, ix = step.groups()
            if op == 'r':
                signature = list(reversed(signature))
            elif op == 's':
                signature = signature[int(ix):]
            elif op == 'w':
                pos = int(ix) % len(signature)
                signature[0], signature[pos] = signature[pos], signature[0]
        sig = ''.join(signature)
    return f"{cipher['url'][0]}&{sp}={sig}"
261
@app.route('/channel/<channel_id>')
def channel(channel_id):
    """Render the video feed of a single channel.

    Answers 400 for a malformed channel id and 404 when no feed could be
    fetched for it.
    """
    # fullmatch instead of match: match only anchors at the start, so an id
    # followed by arbitrary trailing text passed validation and was handed
    # on to fetch_xml().
    if not re.fullmatch(r"(UC[A-Za-z0-9_-]{22})", channel_id):
        return "bad channel id", 400 # todo

    xmlfeed = fetch_xml("channel_id", channel_id)
    if not xmlfeed:
        return "not found or something", 404 # XXX
    (title, author, _, videos) = parse_xml(xmlfeed)
    return render_template('xmlfeed.html.j2', title=author, rows=videos)
272
@app.route('/playlist')
def playlist():
    """Render the video feed of the playlist given as ?list=<playlist id>.

    Answers 400 when the parameter is missing or empty and 404 when no feed
    could be fetched.
    """
    playlist_id = request.args.get('list')
    if not playlist_id:
        return "bad list id", 400 # todo

    xmlfeed = fetch_xml("playlist_id", playlist_id)
    if not xmlfeed:
        return "not found or something", 404 # XXX

    (title, author, _, videos) = parse_xml(xmlfeed)
    heading = f"{title} by {author}"
    return render_template('xmlfeed.html.j2', title=heading, rows=videos)
284
@app.route('/subscription_manager')
def subscription_manager():
    """List the channels the user (?token=, default 'guest') is subscribed to."""
    token = request.args.get('token', 'guest')
    rows = []
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT subscriptions.channel_id, name,
            (subscribed_until < datetime('now')) AS obsolete
            FROM subscriptions
            left JOIN channels ON channels.id = subscriptions.channel_id
            left JOIN websub ON channels.id = websub.channel_id
            WHERE user = ?
            ORDER BY obsolete=0, name COLLATE NOCASE ASC""", (token,))
        for (channel_id, author, obsolete) in c.fetchall():
            rows.append({
                'channel_id': channel_id,
                # channels without a known name are shown by their id
                'author': author or channel_id,
                # note: this is the query's 'obsolete' flag, not a timestamp
                'subscribed_until': obsolete,
            })
    return render_template('subscription_manager.html.j2', rows=rows)
305
@app.route('/feed/subscriptions', methods=['POST'])
def feed_post():
    """Pin, unpin or hide a video for the user given by ?token=.

    The first form field that is not 'csrf' names the action and its value
    is the video id. Afterwards the client is redirected (303) back to the
    same url. The guest user may not modify anything (403).
    """
    token = request.args.get('token', 'guest')
    if token == 'guest':
        return "guest user is read-only", 403

    # value stored in flags.display for each supported action
    display_for = {
        'pin': 'pinned',
        'unpin': None,
        'hide': 'hidden',
    }
    action = next((k for k in request.form.keys() if k != 'csrf'), None)
    if action not in display_for:
        flash(("error","unsupported action"))
    else:
        video_id = request.form.get(action)
        display = display_for[action]
        with sqlite3.connect(cf['global']['database']) as conn:
            c = conn.cursor()
            c.execute("""
                INSERT OR REPLACE INTO flags (user, video_id, display)
                VALUES (?, ?, ?)
                """, (token, video_id, display))
    return redirect(request.url, code=303)
328
@app.route('/subscription_manager', methods=['POST'])
def manage_subscriptions():
    """Subscribe the user (?token=) to a channel or unsubscribe from one.

    Form fields:
      subscribe   -- text starting with a channel id ('UC' + 22 characters)
      unsubscribe -- id of the channel to remove

    Problems are reported via flash(); the client is always redirected (303)
    back to the same url. The guest user may not modify anything (403).
    """
    token = request.args.get('token', 'guest')
    if token == 'guest': return "guest user is read-only", 403
    if 'subscribe' in request.form:
        channel_id = request.form.get("subscribe")
        match = re.match(r"(UC[A-Za-z0-9_-]{22})", channel_id)
        if match:
            channel_id = match.group(1)
        else:
            match = re.match(r"((?:PL|LL|EC|UU|FL|UL|OL)[A-Za-z0-9_-]{10,})", channel_id)
            if match: # NOTE: PL-playlists are 32chars, others differ in length.
                flash(("error","playlists not (yet?) supported."))
                return redirect(request.url, code=303) # TODO: dedup redirection
            else:
                flash(("error","not a valid/subscribable URI"))
                return redirect(request.url, code=303) # TODO: dedup redirection
        with sqlite3.connect(cf['global']['database']) as conn:
            c = conn.cursor()
            c.execute("""
                INSERT OR IGNORE INTO subscriptions (user, channel_id)
                VALUES (?, ?)
                """, (token, channel_id))
            # TODO: sql-error-handling, asynchronically calling update-subs.pl

    elif 'unsubscribe' in request.form:
        # channel_id was never assigned in this branch, so every unsubscribe
        # request failed with an UnboundLocalError; read it from the form.
        channel_id = request.form.get("unsubscribe")
        with sqlite3.connect(cf['global']['database']) as conn:
            c = conn.cursor()
            c.execute("""
                DELETE FROM subscriptions
                WHERE user = ? AND channel_id = ?
                """, (token, channel_id))
            # TODO: sql-error-handling, report success

    else:
        flash(("error","unsupported action"))

    return redirect(request.url, code=303)
369
@app.route('/r/')
def reddit_index():
    """Placeholder for '/r/' without a subreddit; answers with an empty body."""
    return ""
@app.route('/r/<subreddit>')
def reddit(subreddit="videos"):
    """List the youtube videos posted to a subreddit.

    The optional query parameters count, before and after are forwarded to
    reddit for pagination. Posts with a score above 1 are listed before the
    others. Answers 502 when reddit's listing cannot be retrieved.
    """
    # type=int: a non-numeric ?count= falls back to 0 instead of raising
    count = request.args.get('count', 0, type=int)
    before = request.args.get('before')
    after = request.args.get('after')
    query = '&'.join([f"{k}={v}" for k,v in [('count',count), ('before',before), ('after',after)] if v])
    r = requests.get(f"https://old.reddit.com/r/{subreddit}.json?{query}", headers={'User-Agent':'Mozilla/5.0'})

    # Parse the body once; a body that is not JSON counts as a failure
    # instead of raising from r.json().
    listing = None
    if r.ok:
        try:
            listing = r.json()
        except ValueError:
            listing = None
    if not isinstance(listing, dict) or 'data' not in listing:
        return r.text+"error retrieving reddit data", 502

    children = listing['data']['children']
    good = [e for e in children if e['data']['score'] > 1]
    bad = [e for e in children if e['data']['score'] <=1]
    videos = []
    for entry in (good+bad):
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtube.be', 'youtu.be', 'invidio.us'] or e['domain'] == 'youtube.be':
            continue
        # urls in reddit's json are html-escaped, hence the '&amp;'
        found = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/)([-_0-9A-Za-z]+)', e['url'])
        if not found:
            # Url shapes the pattern does not cover (e.g. invidio.us links)
            # are skipped; calling .group() on the failed match used to raise
            # AttributeError and break the whole page.
            continue
        videos.append({
            'video_id': found.group(1),
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
        })
    before = listing['data']['before']
    after = listing['data']['after']
    return render_template('reddit.html.j2', subreddit=subreddit, rows=videos, before=before, after=after, count=count)
402
def get_cipher():
    """Return the first row (sts, algorithm) of the cipher table.

    The row is cached on flask's `g` together with the time it was read and
    is reloaded from the database once that is more than an hour ago.
    NOTE(review): `g` presumably lives only as long as one application
    context, in which case the hourly expiry never triggers -- confirm.
    """
    max_age = 1 * 60 * 60 # seconds
    if 'cipher' in g and time.time() - g.get('cipher_updated', 0) <= max_age:
        return g.cipher

    with sqlite3.connect(cf['global']['database']) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT sts, algorithm FROM cipher")
        g.cipher = cursor.fetchone()
        g.cipher_updated = time.time()

    return g.cipher
413
414 #@app.teardown_appcontext
415 #def teardown_db():
416 # db = g.pop('db', None)
417 #
418 # if db is not None:
419 # db.close()
420
421 # Magic CSRF protection: This modifies outgoing HTML responses and injects a csrf token into all forms.
422 # All post requests are then checked if they contain the valid token.
423 # TODO:
424 # - don't use regex for injecting
425 # - inject a http header into all responses (that could be used by apis)
426 # - allow csrf token to be passed in http header, json, ...
427 # - a decorator on routes to opt out of verification or output munging
@app.after_request
def add_csrf_protection(response):
    """Inject a hidden 'csrf' input into every form of an html response.

    The token is the hex HMAC-SHA256 of the client's address, keyed with the
    application's secret key. Responses of any other mimetype pass through
    unchanged.
    """
    if response.mimetype != "text/html":
        return response

    # TODO: will fail behind reverse proxy (remote_addr always localhost)
    client = request.remote_addr.encode('ascii')
    token = hmac.new(app.secret_key, client, hashlib.sha256).hexdigest()

    # match form tags with any number of attributes and any type of quotes
    form_tag = rb'''(<[Ff][Oo][Rr][Mm](\s+[a-zA-Z0-9-]+(=(\w*|'[^']*'|"[^"]*"))?)*>)'''
    # hackily append a hidden input with our csrf protection value
    with_input = rb'\1<input type="hidden" name="csrf" value="'+token.encode('ascii')+rb'">'

    response.set_data(re.sub(form_tag, with_input, response.get_data()))
    return response
@app.before_request
def verify_csrf_protection():
    """Reject POST requests whose 'csrf' form field is not the expected token.

    The expected token is the hex HMAC-SHA256 of the client's address, keyed
    with the application's secret key. A mismatch is answered with status
    400; otherwise nothing is returned and request handling continues.
    """
    token = hmac.new(app.secret_key, request.remote_addr.encode('ascii'), hashlib.sha256).hexdigest() # TODO: will fail behind reverse proxy (remote_addr always localhost)
    if request.method == "POST":
        submitted = request.form.get('csrf', '')
        # Constant-time comparison, so response timing does not reveal how
        # much of a guessed token is correct. Both sides are compared as
        # bytes because compare_digest() rejects non-ASCII str arguments.
        if not hmac.compare_digest(submitted.encode('utf-8'), token.encode('ascii')):
            return "CSRF validation failed!", 400
    request.form = request.form.copy() # make it mutable
    # request.form.pop('csrf') # XXX: breaks all requests?!
444
@app.template_filter('format_date')
def format_date(s):
    """Template filter: turn an iso date(-time) string into e.g. '7 Mar'."""
    # iso-dates can seperate date from time with space or 'T'
    date_part = s.split('T')[0].split(' ')[0]
    y, m, d = map(int, date_part.split('-'))
    # the leading '_' shifts the names so that index 1 is January
    M = '_ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
    month = M[m]
    return f"{d} {month}"
450
def pp(*args):
    """Debugging aid: pretty-print all arguments (as one tuple) to stderr in utf-8."""
    import codecs
    import sys
    from pprint import pprint

    utf8_stderr = codecs.getwriter("utf-8")(sys.stderr.buffer)
    pprint(args, stream=utf8_stderr)
455
# When executed as a script, serve the app with debug mode enabled.
if __name__ == "__main__":
    app.run(debug=True)
Imprint / Impressum