# app/common/common.py
import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if not 'global' in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.setDaemon(True)
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since we also use this outside the flask appcontext (e.g. from utils.py), the
# g access is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

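# A minimal sketch (not part of this module) of the error_handler idea mentioned
# above; 'app' is a hypothetical flask application object defined elsewhere:
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("during %s %s: %s", url, params, text[:200])
#       return "internal error", 500
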
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

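# Illustrative calls (the ids are placeholders): fetch_xml("channel_id", "UC...")
# or fetch_xml("playlist_id", "PL..."); feed_type/feed_id are passed straight
# through as the query parameter of youtube's videos.xml feed endpoint.
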
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns):
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

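# parse_xml() returns (title, author, videos): videos is a list of dicts with
# the keys built above, or a single {'deleted': True, 'video_id': ...} entry
# when the feed is an at:deleted-entry tombstone.
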
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published time are near-identical, we assume it's new.
        # checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check if we don't know about it and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

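# The statements above imply roughly the following sqlite schema (a sketch for
# orientation only; the real schema lives elsewhere in the repository and may
# have additional columns or constraints):
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
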
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: list of muxed video stream urls (best quality first),
             player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if not 'formats' in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        muxed = [
            f['url'] for f in
            sorted(formats, key=lambda k: k['height'], reverse=True)
        ]

        if 'gcr' in parse_qs(muxed[0]):
            return None, metadata, 'geolocked', None

        return muxed, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

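# Call sites unpack the full 4-tuple, e.g. (as done in update_channel above):
#   streams, meta, error, errdetail = get_video_info(video_id)
# where 'streams' is None whenever an error type is set, and 'meta' is the
# (partial) player_response when one could be extracted.
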
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

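# Illustrative example (the algo string is obtained elsewhere, e.g. from the
# player javascript): unscramble(cipher, "r s2 w3") reverses the signature,
# drops its first two characters, then swaps characters 0 and 3 before
# appending the result to the url.
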
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (first one takes precedence).
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the others.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    return r.json()

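# Illustrative call (the subreddit names are placeholders):
#   data = fetch_reddit(['videos', 'youtubehaiku'], sorted_by='top', time='week')
#   videos = parse_reddit_videos(data)
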
def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
                     key=lambda e: e['data']['score'] > 1,
                     reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if not '_fallback_next' in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

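# Usage sketch (hypothetical routes, not part of this module): register two view
# functions for the same url rule; the first can hand off to the second:
#
#   @app.route('/watch')
#   def watch_local():
#       if not have_local_data():          # have_local_data() is made up here
#           return fallback_route()        # falls through to watch_upstream
#       ...
#
#   @app.route('/watch', endpoint='watch_upstream')
#   def watch_upstream():
#       ...
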
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

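# For reference: pubsubhubbub delivers the body signature in an
# "X-Hub-Signature: sha1=<hexdigest>" header, so a webhook handler could verify
# it roughly like this (sketch; 'secret' and flask's 'request' come from the caller):
#   expected = websub_body_hmac(secret, request.get_data())
#   ok = hmac.compare_digest(f"sha1={expected}",
#                            request.headers.get('X-Hub-Signature', ''))
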
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))