# app/common/common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response).
# a flask error_handler can then include the request data in the error log.
# since this module is also used outside the flask appcontext, the patch is
# wrapped in a try-except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass

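# Hedged sketch (not part of the original module): one way a flask frontend
# could consume g.api_requests from an error handler to log upstream api
# responses, as the comment above suggests. 'app' is any flask.Flask instance;
# the handler name, log format and truncation length are illustrative assumptions.
def _example_register_api_logging(app):
    @app.errorhandler(Exception)
    def _log_api_requests(e):
        for url, params, response_text in g.get('api_requests', []):
            app.logger.error("api request %s %r -> %s", url, params, response_text[:500])
        return "internal server error", 500
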
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

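# Hedged usage sketch (not part of the original module): fetch and parse one
# channel feed. The channel id below is only a placeholder; feed_type can also
# be 'playlist_id' or 'user', as accepted by the videos.xml endpoint.
def _example_fetch_and_parse():
    xmldata = fetch_xml('channel_id', 'UC_x5XG1OV2P6uZZ5FSM9Ttw')
    if xmldata is None:
        return
    title, author, videos = parse_xml(xmldata)
    for video in videos:
        print(video['published'], video['video_id'], video['title'])
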
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

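# Hedged sketch (not part of the original module): refresh a single channel.
# db_path and channel_id are placeholders; the database must already contain
# the videos/channels tables referenced by update_channel() above.
def _example_update_channel(db_path, channel_id):
    import sqlite3
    db = sqlite3.connect(db_path)
    try:
        if update_channel(db, fetch_xml('channel_id', channel_id)):
            print(f"updated {channel_id}")
    finally:
        db.close()
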
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

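# Hedged sketch (not part of the original module): resolve a muxed stream url
# and branch on the error classes documented in get_video_info(). The default
# sts/algo only works for videos whose signatures are not scrambled.
def _example_get_video_info(video_id):
    url, metadata, error, errdetails = get_video_info(video_id)
    if error is not None:
        print(f"{video_id}: {error} ({errdetails})")
    else:
        print(f"{video_id}: {url}")
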
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

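# Hedged illustration (not part of the original module) of the algo string
# consumed by unscramble(): space-separated ops, where 'r' reverses the
# signature, 's<n>' drops the first n characters and 'w<n>' swaps position 0
# with position n. The cipher dict below is made up; real ones come from
# parse_qs() on a format's cipher/signatureCipher field.
def _example_unscramble():
    cipher = {
        'url': ['https://example.invalid/videoplayback?id=x'],
        's': ['abcdefg'],
        'sp': ['sig'],
    }
    # 'abcdefg' -> r -> 'gfedcba' -> s2 -> 'edcba' -> w3 -> 'bdcea'
    print(unscramble(cipher, "r s2 w3"))
    # prints: https://example.invalid/videoplayback?id=x&sig=bdcea
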
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

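# Hedged sketch (not part of the original module): chain get_video_info() and
# prepare_metadata() into one template-friendly dict. prepare_metadata() needs
# a full player_response, so player/malformed/exhausted errors are skipped here.
def _example_prepare_metadata(video_id):
    _, metadata, error, _ = get_video_info(video_id)
    if error in ('player', 'malformed', 'exhausted'):
        return None
    return prepare_metadata(metadata)
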
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

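# Hedged sketch (not part of the original module): pull the current top
# youtube submissions from two subreddits and page through the results.
# Subreddit names are placeholders; pagination reuses the returned 'after' token.
def _example_fetch_reddit():
    videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
        sorted_by='top', time='week')
    for v in videos:
        print(v['n_karma'], v['video_id'], v['title'])
    if after: # fetch the next page
        videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
            sorted_by='top', time='week', after=after)
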
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))