import os
import re
import json
import html
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests session to store each
# requests-request in the flask-request's g object (url, params and response).
# A flask error_handler can then include the request data in the error log.
# Since this module is also used outside the flask appcontext, the patch is
# wrapped in a try/except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass
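
# Illustrative sketch (not part of this module): a frontend error handler could
# consume the g.api_requests collected by _NSASession above. The handler and the
# 'app' object are hypothetical and depend on how the frontend is set up.
#
#   @app.errorhandler(500)
#   def internal_error(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error("%s %s -> %.200s", url, params, body)
#       return "internal server error", 500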

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    # Note: Element truthiness depends on having children, so compare against
    # None explicitly.
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
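
# Example usage (sketch only, kept as a comment since it performs a network
# request; the channel id is a placeholder):
#
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)
#
# The videos.xml feed endpoint also accepts e.g. feed_type="playlist_id".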

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented
    # after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if published and updated time are near-identical, assume the video is
        # new. use total_seconds(), since timedelta.seconds ignores whole days.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
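
# Example usage (sketch; assumes an sqlite3 connection whose schema already
# provides the 'videos' and 'channels' tables referenced above. The column
# layout is only inferred from the INSERT statements; the real schema lives
# elsewhere in the project. Path and channel id are placeholders.)
#
#   import sqlite3
#   db = sqlite3.connect("/var/lib/yt/subscriptions.db")
#   update_channel(db, fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx"))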

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error
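
# Sketch of dispatching on the error types listed in the docstring (illustrative
# only; how errors are surfaced to the user is up to the caller):
#
#   url, player_response, error, errdetails = get_video_info("UxxajLWwzqY")
#   if error == 'livestream':
#       ...  # no muxed stream; needs separate livestream handling
#   elif error == 'geolocked':
#       ...  # metadata is usable, but the stream is not available here
#   elif error is not None:
#       ...  # 'player', 'malformed' or 'exhausted': show errdetails
#   else:
#       ...  # url is the best-quality muxed stream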

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
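
# Worked example (made-up values; the algo string mirrors the transformations
# youtube's player would otherwise apply in javascript):
#
#   unscramble({'url': ['https://example.com/videoplayback'], 's': ['abcdef']}, "r s2 w1")
#   'r'  reverses the signature:      fedcba
#   's2' drops the first two chars:   dcba
#   'w1' swaps positions 0 and 1:     cdba
#   returns 'https://example.com/videoplayback&signature=cdba'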

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a
            # "LIVE NOW" badge. TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') \
                or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
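
# Sketch of how get_video_info() and prepare_metadata() fit together
# (illustrative only; the real callers live in the frontend code):
#
#   stream_url, player_response, error, _ = get_video_info(video_id)
#   if error is None:
#       meta = prepare_metadata(player_response)
#       # meta['aspectr'], meta['all_cards'], meta['subtitles'], ... feed the template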

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(
                r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&amp;)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)',
                e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos
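
# Example usage (sketch; performs a network request, and the subreddit names are
# placeholders. Any iterable of subreddit names is joined into a multireddit):
#
#   data = fetch_reddit(('videos', 'mealtimevideos'), sorted_by="top", time="week")
#   videos = parse_reddit_videos(data)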

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
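
# Illustrative sketch (hypothetical view functions, not part of this module):
# two endpoints registered on the same url rule can chain via fallback_route().
#
#   @app.route('/channel/<channel_id>')
#   def channel_from_cache(channel_id):
#       if not cached(channel_id):
#           return fallback_route(channel_id)  # falls through to the next endpoint
#       ...
#
#   @app.route('/channel/<channel_id>', endpoint='channel_from_upstream')
#   def channel_from_upstream(channel_id):
#       ...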

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))