import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m,
# but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

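# Example usage (a sketch; feed_type is passed through verbatim as the query
# parameter name, e.g. 'channel_id' or 'playlist_id'):
#   xmldata = fetch_xml('channel_id', 'UC...')  # 'UC...' is a placeholder id
#   title, author, videos = parse_xml(xmldata)
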
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns):
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a
        # new video; otherwise it's just an update to an older video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else:
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0:  # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

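# For reference only: a minimal sketch of the table layout update_channel()
# assumes (the real schema is defined elsewhere in this repository):
#   CREATE TABLE videos   (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                          published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
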
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: (best-quality muxed video stream URL, player_response, error type, error message)
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None  # for 'exhausted'
    for el in ['embedded', 'detailpage']:  # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params:  # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue  # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue  # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # TODO: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

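# Example usage (a sketch; sts and algo come from the scraped player JS and are
# assumed to be supplied by the caller):
#   url, meta, error, errdetail = get_video_info('UxxajLWwzqY', sts=sts, algo=algo)
#   if error is None:
#       ...  # url is the best-quality muxed stream
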
def unscramble(cipher, algo):  # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

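# Worked example (hypothetical algo string; the real one is extracted from the
# player JS elsewhere): with cipher = {'url': ['https://host/videoplayback?x=1'],
# 's': ['abcdef']} and algo = "r s2 w3":
#   'r'  reverses the signature      -> 'fedcba'
#   's2' drops the first two chars   -> 'dcba'
#   'w3' swaps positions 0 and 3     -> 'acbd'
# yielding 'https://host/videoplayback?x=1&signature=acbd'.
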
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
                          .get('playerCaptionsTracklistRenderer',{})
                          .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText']  # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'],  # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card)  # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'],  # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry more
    # information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = {  # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

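    # e.g.: if an infocard and an endcard both point at the same video_id, only
    # the infocard's entry survives in all_cards (and vice versa for channels/websites).
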
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit,  # 1..100 (default 25)
        't': time,       # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue  # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

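# Example usage (a sketch; the subreddit name is only illustrative):
#   videos, before, after = fetch_reddit(['videos'], sorted_by="top", time="week")
#   # 'before'/'after' can be passed back in to page through the listing.
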
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))