import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Timer.setDaemon() is deprecated; set the attribute instead
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response).
# A flask error_handler can then include the request data in the error log.
# Since this module is also used outside the flask appcontext, the access to
# g is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
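
# A minimal sketch (an assumption, not code from this repo) of the error
# handler mentioned above; `app` is the flask application object:
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, response in g.get('api_requests', []):
#           app.logger.error(f"api request: {url} {params} -> {response[:200]}")
#       return "Internal Server Error", 500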

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: ElementTree Elements test as False when they have no children, so
    # a plain truthiness check (or a ternary on the element) misbehaves here;
    # always compare against None.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id.
    # for playlists: this is who created the playlist:
    channel_el = feed.find('yt:channelId',ns)
    channel_id = channel_el.text if channel_el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_el = feed.find('yt:playlistId',ns)
    playlist_id = playlist_el.text if playlist_el is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id
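
# Rough usage sketch (the channel id is a placeholder, not a real one): the
# pull path chains these two functions, using "channel_id" for channel feeds
# and "playlist_id" for playlist feeds:
#
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   title, author, videos, channel_id, playlist_id = parse_xml(xmldata)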

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        # checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check if we don't know about it yet, and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we fetch get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
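
# Sketch of how the pull-subscription path would call this (the database path
# comes from the config, as in store_video_metadata below; `channel_id` is a
# hypothetical variable holding a feed id):
#
#   with sqlite3.connect(cf['global']['database']) as conn:
#       xmldata = fetch_xml("channel_id", channel_id)
#       update_channel(conn, xmldata, from_webhook=False)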

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error
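
# Minimal caller sketch: unpack the 5-tuple and branch on the error type
# (`show_error` is a hypothetical helper, not part of this module):
#
#   url, stream_map, meta, error, errdetails = get_video_info(video_id)
#   if error in ('malformed', 'player', 'exhausted'):
#       show_error(errdetails)
#   elif meta:
#       video = prepare_metadata(meta)  # normalized dict, see below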

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
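
# Worked example: with algo = "r s2 w3", a scrambled signature "abcdef"
# becomes "fedcba" (reverse), then "dcba" (slice off the first 2), then
# "acbd" (swap index 0 and 3), so the returned url ends in "&signature=acbd"
# (or whatever parameter name 'sp' specifies). The algo string itself has to
# be extracted from the player javascript elsewhere.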

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first one takes precedence).
    try: # find the native-language captions (assuming there is only 1 audioTrack) (any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'default':i==main_subtitle,
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for i,cc in enumerate(metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[]))
        # sort order: the default language gets weight 0 (first), other manually
        # translated tracks weight 1, autogenerated ones weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = prepare_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so, for repeated fall-throughs, we use the g object to
    # keep track of how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
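
# Sketch of the intended usage pattern (route names and the decision logic
# are assumptions, not taken from this repo): two views registered on the
# same rule, where the first one delegates to the next matching one:
#
#   @frontend.route('/watch')
#   def watch():
#       if not can_handle(request.args):  # hypothetical check
#           return fallback_route()
#       ...
#
#   @proxy.route('/watch')
#   def watch_fallback():
#       ...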

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
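
# Sketch of verifying a websub notification body against the X-Hub-Signature
# header (the "sha1=" prefix comes from the pubsubhubbub spec; `secret` and
# the flask `request` object are assumptions here):
#
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#       return "", 403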

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))