import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response).
# we can then use a flask error_handler to include the request data in the
# error log. since we also call this code from outside the flask appcontext,
# the patch is wrapped in a try-except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass

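# A minimal sketch of an error handler that could consume g.api_requests
# (assumption for illustration: a flask app object named `app`; not part of
# this module):
#   @app.errorhandler(Exception)
#   def log_with_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("%s %s -> %.200s", url, params, text)
#       return "internal server error", 500
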
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

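# Hedged usage note: fetch_xml() takes the feed query parameter name and its
# value, e.g. fetch_xml("channel_id", "UC...") or fetch_xml("playlist_id",
# "PL...") (the ids shown are illustrative placeholders).
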
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

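# update_channel() below assumes roughly this sqlite schema (sketch for
# orientation only; column types and constraints are guesses, not taken from
# this file):
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
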
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else: # every 'el' value was tried without a usable result
        return None, metadata, 'exhausted', player_error

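# Rough usage sketch (hedged: sts/algo would normally be derived from the
# scraped player code; the defaults shown here are just the function's own):
#   url, meta, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error == 'geolocked': ...  # e.g. show a message to the user
#   elif url: ...                 # hand the muxed stream url to the client
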
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

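# Illustration of the algo string format (hypothetical values; the real 's',
# 'sp' and 'url' come from the format's signatureCipher, the algo string from
# the player code):
#   unscramble({'url': ['https://example.invalid/videoplayback?x=1'],
#               's': ['abcdef']}, "w2 r s1")
#   # 'w2' swaps chars 0 and 2 -> "cbadef", 'r' reverses -> "fedabc",
#   # 's1' drops the first char -> "edabc"
#   # => 'https://example.invalid/videoplayback?x=1&signature=edabc'
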
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}

    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

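    # e.g. a video that appears both as an infocard and as an endcard is kept
    # only as an infocard: its video_id ends up in the exclusion set built
    # from the infocards, so the duplicate endcard is dropped.
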
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

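# Rough usage sketch (the subreddit names are illustrative, not from this file):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
#                                        sorted_by='top', time='week')
#   # paginate forwards with: fetch_reddit([...], after=after)
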
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))