import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory'-backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in the flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this code is also called from outside the flask appcontext, the g
# access is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
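# Illustrative sketch of an error handler that consumes g.api_requests; the
# handler name and its registration are assumptions, not part of this module:
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("during %s %s: %s", url, params, text[:200])
#       return "internal error", 500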

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
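    # feed_type/feed_id become a query parameter pair; for YouTube's public
    # Atom feeds this is typically "channel_id" or "playlist_id" plus the
    # respective id (the example value here is illustrative):
    #   fetch_xml("channel_id", "UC...")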
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
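    # xmldata is the raw Atom feed as returned by fetch_xml(); a typical call
    # (illustrative) is update_channel(db, fetch_xml("channel_id", channel_id)).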
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            current_app.logger.info(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        timestamp, published = None, None
        # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
        # a video gets uploaded as unlisted on day A and set to public on day
        # B; the webhook is sent on day B, but 'published' says A. The video
        # therefore looks like it's just an update to an older video). If
        # that's the case, we fetch get_video_info and double-check.
        # We only need to do this for videos that are not yet in the database.
        c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if from_webhook and new_video:
            _, meta, _, _ = get_video_info(video['video_id'])
            if meta:
                meta = prepare_metadata(meta)
                published = dateutil.parser.parse(meta['published'])
                current_app.logger.info(f"new video {video['video_id']}, uploaded {published}") # XXX: remove
                # if published within the last week, assume it's new
                if (now - published).days < 7:
                    timestamp = now
                else: # otherwise, it's just an update to an older video
                    timestamp = published
        # if we update from an rss-pull, we can rely on the embedded published
        # dates (and don't have to fire off a whole bunch of requests)
        else:
            updated = dateutil.parser.parse(video['updated'])
            published = dateutil.parser.parse(video['published'])
            if (updated - published).total_seconds() < 60 and (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video
                timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
        db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream url, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
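    # How a caller might branch on the error type (sketch only, not code used
    # in this module; the variable names are placeholders):
    #   url, meta, err, errdetails = get_video_info(video_id)
    #   if err == 'livestream': ...   # no muxed stream url available
    #   elif err == 'geolocked': ...  # metadata present, playback blocked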
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
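    # algo is a space-separated list of operations: 'r' reverses the
    # signature, 's<n>' drops the first n characters, 'w<n>' swaps the first
    # character with the n-th. Worked example with made-up values:
    #   unscramble({'s': ['abcdef'], 'url': ['https://example.invalid/v']}, "r w2 s1")
    #   'abcdef' -r-> 'fedcba' -w2-> 'defcba' -s1-> 'efcba'
    #   => 'https://example.invalid/v&signature=efcba'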
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
    BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
    CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
    ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
    GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
    KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
    ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
    NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
    RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
    SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
    VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO 8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
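    # Example call (the subreddit names are only illustrative):
    #   fetch_reddit(['videos', 'youtubehaiku'], sorted_by='top', time='week')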

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&amp;)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
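    # Sketch of intended use (the route and its arguments are hypothetical):
    # two view functions registered for the same url rule, where the first one
    # calls `return fallback_route(video_id=video_id)` when it cannot serve
    # the request, handing over to the next matching endpoint.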
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
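# Sketch of verifying an incoming websub notification against the
# X-Hub-Signature header ("sha1=<hexdigest>" per the pubsubhubbub spec); the
# surrounding request handling is an assumption, not part of this module:
#   sent = request.headers.get('X-Hub-Signature', '').partition('=')[2]
#   ok = hmac.compare_digest(websub_body_hmac(secret, request.data), sent)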

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))