import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        # note: the tombstone's author/video_id are currently not propagated.
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

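# Usage sketch (illustrative only; the feed id below is a placeholder, not a real channel):
#   xml = fetch_xml("channel_id", "UC0000000000000000000000")
#   if xml is not None:
#       title, author, videos = parse_xml(xml)
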
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video.
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

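# Usage sketch (assumes an sqlite3 connection whose videos/channels tables were
# created elsewhere; the database filename is a made-up example):
#   import sqlite3
#   db = sqlite3.connect("subscriptions.sqlite")
#   update_channel(db, fetch_xml("channel_id", channel_id))
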
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
                         f"?video_id={video_id}"+
                         f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
                         f"&el={el}"+
                         f"&sts={sts}"+
                         f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # TODO: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

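# Usage sketch (sts and algo come from the youtube player javascript and are
# obtained elsewhere; without them, scrambled signatures cannot be decoded):
#   url, metadata, error, detail = get_video_info("UxxajLWwzqY", sts=sts, algo=algo)
#   if error is None:
#       print(url)  # best-quality muxed stream
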
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

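# The algo string is a tiny reverse/slice/swap program extracted from the player
# javascript: "r" reverses the signature, "sN" drops its first N characters, and
# "wN" swaps characters 0 and N. Illustrative call with made-up values:
#   cipher = parse_qs("s=ABCDEF&sp=sig&url=https%3A%2F%2Fexample.invalid%2Fvideoplayback")
#   unscramble(cipher, "r w2 s1")
#   # -> "https://example.invalid/videoplayback&sig=EFCBA"
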
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists, prefer
    # infocards; for channels and websites, prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

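# Usage sketch (building on get_video_info above; only uses the player response
# when no error was reported):
#   url, metadata, error, detail = get_video_info(video_id, sts=sts, algo=algo)
#   if error is None:
#       meta = prepare_metadata(metadata)
#       print(meta['title'], meta['aspectr'], len(meta['subtitles']))
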
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (only for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time,     # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'],
                     key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid URL (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

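# Usage sketch (the subreddit names are placeholders):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
#                                        sorted_by="top", time="week")
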
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))