# app/common/common.py
import os
import re
import json
import html
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

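# A minimal config file only needs a [global] section to pass the check above;
# the keys inside it are used elsewhere in the application and are not
# validated here. Illustrative sketch only:
#   [global]
#   # ... application settings ...
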
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try/except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass
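
# Illustrative only: a frontend could surface the recorded requests from a
# flask error handler roughly like this ('app' and the logging target are
# assumptions, not defined in this module):
#
#   @app.errorhandler(Exception)
#   def dump_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error("%s %s\n%s", url, params, body)
#       return "internal server error", 500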

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
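
# Illustrative use of the two helpers above (feed_type maps onto the query
# parameter of the videos.xml endpoint, e.g. 'channel_id' or 'playlist_id'):
#   xmldata = fetch_xml("channel_id", "UC...")  # placeholder channel id
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)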

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published time are near-identical, we assume it's new.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
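
# The INSERTs above assume roughly the following schema; the actual
# CREATE TABLE statements live elsewhere in the project, and the column
# types here are guesses:
#   CREATE TABLE videos   (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                          published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);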

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error
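
# Illustrative call (UxxajLWwzqY, mentioned below, is a video with scrambled
# signatures; sts and algo would normally be extracted from the player JS):
#   stream_url, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error is None:
#       print(stream_url)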

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
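
# The algo string is a space-separated list of operations, applied in order:
# 'r' reverses the signature, 's<N>' drops the first N characters, and 'w<N>'
# swaps the first character with the one at index N. A (hypothetical) spec
# like "w43 s2 r" would therefore swap, slice, then reverse.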

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
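    # e.g. a video that shows up both as an infocard and as an endcard is kept
    # only once (the infocard version); a channel appearing in both is kept
    # only as an endcard.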

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}],before,after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after
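
# Illustrative call: page through the 'hot' listing of two subreddits
# (the subreddit names here are placeholders):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'])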

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))