import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m,
# but caching that long makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
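
# One possible refinement for the TODO above (illustrative sketch, not wired up
# here): requests_cache ships a disabled() context manager, so lookups for
# livestreams/premieres could bypass the cache entirely, e.g.:
#
#   with requests_cache.disabled():
#       r = requests.get("https://www.youtube.com/get_video_info", params)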

# Note: this should only be required for the 'memory'-backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # setDaemon() is deprecated
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). We can
# then use a flask error_handler to include the request data in the error log.
# Since requests are also issued from outside the flask appcontext (e.g. from
# utils.py), the access to g is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
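
# Sketch of how a frontend error handler might consume g.api_requests (the
# `app` object and handler name are assumptions; only the g usage is defined
# in this module):
#
#   @app.errorhandler(Exception)
#   def log_with_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"upstream request: {url} {params}: {response_text[:200]}")
#       return "internal server error", 500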

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
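
# Illustrative call (the id is a placeholder; "channel_id" and "playlist_id"
# are parameter names the YouTube Atom feed endpoint accepts):
#
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   # -> raw Atom feed bytes, or None on HTTP errors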

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Element truthiness is ambiguous (childless elements are falsy), so
    # compare find() results against None explicitly.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
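
# For reference, the tuple returned by parse_xml above has this shape
# (values are illustrative):
#   ("Channel Title", "Channel Name" or None,
#    [{'video_id': '…', 'title': '…', 'published': '…', 'channel_id': 'UC…',
#      'author': '…', 'updated': '…'}, …])
# and (None, None, [{'deleted': True, 'video_id': '…'}]) for tombstone feeds.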

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # If the updated and published times are near-identical, we assume it's a new video.
        # Checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # otherwise, it might just be an update to an older video, or a previously unlisted one.
            # First, assume it's an older video (correct when pulling xmlfeeds).
            timestamp = published
            # Then, check if we don't know about it yet and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # the video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream url, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

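# Call-site sketch (the video id is the test id mentioned below; the tuple
# shape and error names come from the docstring above):
#
#   url, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error in ('malformed', 'player', 'exhausted'):
#       ...  # show errdetail to the user
#   elif error in ('livestream', 'geolocked'):
#       ...  # no usable muxed stream; fall back to metadata only
#   else:
#       ...  # url is the best-quality muxed stream (unscrambled via `algo`)
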
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
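
# Worked example of the algo mini-language (r=reverse, sN=slice off the first N,
# wN=swap position 0 with N); the cipher values below are made up:
#
#   >>> unscramble({'s': ['abcdefg'], 'url': ['https://example.com/video']}, "r w3 s2")
#   'https://example.com/video&signature=egcba'
#   # 'abcdefg' -> reverse -> 'gfedcba' -> swap 0,3 -> 'dfegcba' -> drop 2 -> 'egcba'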

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except Exception:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first one takes precedence).
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])
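
    # Sketch of how the 'query' field can be consumed (the /api/timedtext
    # host/path comes from the extracted baseUrl, and fmt=vtt is prepended
    # above, so an internal subtitle proxy could do something like):
    #
    #   r = requests.get("https://www.youtube.com/api/timedtext?" + subtitles[0]['query'])
    #   vtt_body = r.text  # WebVTT, ready to serve to the browser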

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
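
    # Illustrative behaviour of the helpers above (inputs are made up):
    #
    #   clean_url("https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com%2Fp")
    #       -> "https://example.com/p"   (parse_qs percent-decodes the q parameter)
    #   delL("by Some Channel")  -> "Some Channel"
    #   delR("12,345 views")     -> "12,345"
    #   intT("12,345")           -> 12345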

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # Combine cards to weed out duplicates. For videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # If the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other).
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
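
    # Worked example of the dedup rules above (made-up ids): if an infocard and
    # an endcard both reference channel_id "UC123", the infocard is dropped
    # (its ident lands in the exclusion set built from CHANNEL/WEBSITE endcards)
    # and the richer endcard stays in allcards; for a VIDEO or PLAYLIST pair it
    # is the endcard that gets dropped and the infocard that stays.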

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO 8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (only for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time,     # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
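
# Illustrative call (the subreddit names are placeholders):
#
#   data = fetch_reddit(['videos', 'mealtimevideos'], sorted_by='top', time='week', limit=50)
#   for post in data['data']['children']:
#       print(post['data']['title'])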

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
                     key=lambda e: e['data']['score'] > 1,
                     reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but is seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except Exception:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
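
# Usage sketch (route and helper names are made up; the pattern is two view
# functions registered on the same rule, where the first defers to the second):
#
#   @app.route('/watch')
#   def watch_internal():
#       if not have_local_copy():      # hypothetical helper
#           return fallback_route()    # falls through to watch_external()
#       ...
#
#   @app.route('/watch')
#   def watch_external():
#       ...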

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
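
# Sketch of checking a websub notification body against its X-Hub-Signature
# header ("sha1=<hexdigest>" is the standard pubsubhubbub header format; the
# `secret`, `request` and `abort` names are assumed to exist in a Flask view):
#
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#       abort(403)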

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))