import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: responses are currently cached for 10 minutes. googlevideo urls are
# valid for 5h59m, but caching that long makes reddit listings very stale and
# breaks not-yet-started premieres.
# TODO: expire depending on video type (livestream/premiere/etc.)
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
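    """
    periodically evicts expired responses from the requests_cache backend;
    re-arms itself every 'sec' seconds on a daemon Timer thread.
    """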
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session so that every
# requests-request made during a flask-request is stored on flask's g object
# (url, params and response text). a flask error_handler can then include the
# request data in the error log.
# since this code also runs outside of a flask appcontext (e.g. from utils.py),
# the access to g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
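
# A minimal sketch of the error_handler mentioned above (hypothetical 'app'
# object; the real handler, if any, is registered elsewhere in the project):
#
#   @app.errorhandler(Exception)
#   def log_upstream_requests(e):
#       from flask import g
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"upstream request: {url} {params}\n{body[:500]}")
#       return "internal server error", 500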

def fetch_xml(feed_type, feed_id):
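    """
    fetches the youtube atom feed for a channel or playlist.
    feed_type: name of the feed query parameter (e.g. 'channel_id' or 'playlist_id')
    feed_id: its value
    returns the raw response body as bytes, or None on http errors.
    """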
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
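    """
    parses a youtube atom feed (or a websub delete notification).
    returns (title, author, videos); for delete notifications, title and
    author are None and videos contains a single {'deleted': True, ...} entry.
    """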
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: compare explicitly against None; Elements without children are falsy.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
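    """
    parses the feed xmldata and stores its videos and the channel name in the
    database. returns False if xmldata is empty, True otherwise.
    """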
    if not xmldata: return False

    # Note: websub does not return the global author, hence taking it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        timestamp, published = None, None
        # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
        # a video gets uploaded as unlisted on day A and set to public on day
        # B; the webhook is sent on day B, but 'published' says A. The video
        # therefore looks like it's just an update to an older video). If
        # that's the case, we call get_video_info and double-check.
        # We only need to do this for videos that are not yet in the database.
        c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        current_app.logger.warning(f"new video {video['video_id']}") # XXX: remove
        if from_webhook and new_video:
            current_app.logger.warning(f" is webhook and new") # XXX: remove
            _, meta, _, _ = get_video_info(video['video_id'])
            if meta:
                meta = prepare_metadata(meta)
                published = dateutil.parser.parse(meta['published'])
                current_app.logger.warning(f" uploaded {published}") # XXX: remove
                # if published within the last week, assume it's new
                if (now - published).days < 7:
                    timestamp = now
                else: # otherwise, it's just an update to an older video.
                    timestamp = published
        # if we update from an rss-pull, we can rely on the embedded published
        # dates (and don't have to fire off a whole bunch of requests)
        else:
            updated = dateutil.parser.parse(video['updated'])
            published = dateutil.parser.parse(video['published'])
            if (updated - published).seconds < 60 and (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: (url of the best-quality muxed video stream, player_response
    metadata, error type, error message)
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
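    """
    descrambles the 's' signature of a cipher dict (parse_qs output) and
    appends it to the stream url. algo is a space separated list of operations
    (N is taken modulo the signature length):
      r   reverse the signature
      sN  drop the first N characters
      wN  swap the characters at positions 0 and N
    e.g. algo "r s2 w3" transforms "abcdef" -> "fedcba" -> "dcba" -> "acbd".
    """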
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
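    """
    flattens a get_video_info player_response into the dict used by the
    templates: basic video details, aspect ratio, subtitles, geo restrictions
    and parsed info-/endcards.
    """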
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except Exception:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time,     # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def fetch_reddit_post(post_id):
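    """
    fetches a single reddit post (without comments) by its base36 id, e.g. 'h7mjes'.
    raises RedditException if the response is not a valid listing.
    """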
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
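    """
    extracts all youtube videos from a reddit listing (as returned by
    fetch_reddit); entries with more than 1 karma are sorted to the front.
    """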
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid url (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&amp;)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except Exception:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how many endpoints we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
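
# Usage sketch (hypothetical view functions, assuming two views registered on
# the same url rule; the first one delegates to the next match):
#
#   @app.route('/watch')
#   def watch_preferred():
#       if not can_handle(request.args.get('v')):
#           return fallback_route()  # falls through to watch_generic()
#       ...
#   @app.route('/watch')
#   def watch_generic():
#       ...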

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
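    """ generate sha1 hmac over the raw (bytes) request body """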
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

def pp(*args):
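    """ pretty-print all arguments to stderr (utf-8-safe); for debugging """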
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))