# app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but caching them that long makes reddit very stale and premiere
# videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
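    # periodically evict expired entries from the requests_cache backend;
    # re-arms itself every 'sec' seconds on a daemon timer thread.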
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response
# text). we can then use a flask error_handler to include the request data
# in the error log.
# since this module is also used outside the flask appcontext, the access to
# g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

def fetch_xml(feed_type, feed_id):
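    # feed_type is the query parameter name understood by the youtube feed
    # endpoint (e.g. 'channel_id' or 'playlist_id'), feed_id its value.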
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        # checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # otherwise, it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check whether we don't know about it yet; if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # otherwise, it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
        db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream url, player_response metadata,
             error type, error message.
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as 'exhausted')
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, metadata, is_geolocked, None
    else:
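        # for/else: this branch is reached only when the loop ran out of 'el'
        # values without returning a result above.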
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
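    # algo is a space-separated list of operations to apply to the scrambled
    # signature, e.g. "r s3 w5":
    #   r   reverse the signature
    #   sN  drop the first N characters
    #   wN  swap the first character with the one at index N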
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
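    # 'metadata' is the parsed player_response object (as returned by
    # get_video_info); flatten it into a plain dict of the fields we use.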
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first fmt= takes precedence).
    try: # find the native-language captions (assuming there is only 1 audioTrack; any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'default':i==main_subtitle,
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for i,cc in enumerate(metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[]))
        # sort order: the default language gets weight 0 (first), other manually
        # translated ones weight 1, autogenerated ones weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than the other kind.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = prepare_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
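    # counterpart for the request body: 'body' must be the raw POST body as
    # bytes, since hmac.new() does not accept str.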
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

def pp(*args):
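    # debugging helper: pretty-print all arguments to stderr (as utf-8).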
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))