[subscriptionfeed.git] / app / common / common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.setDaemon(True)
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

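# Illustrative use of the two functions above (the channel id is a hypothetical
# placeholder; fetch_xml() returns None on HTTP errors, so guard for that):
#   xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)
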
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

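# Minimal sketch of feeding a feed into the database (assumes an sqlite3
# connection whose schema already provides the 'videos' and 'channels' tables;
# the database path and channel id are hypothetical examples):
#   import sqlite3
#   db = sqlite3.connect('/var/lib/yt/subscriptions.sqlite')
#   update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx'))
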
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
                         f"?video_id={video_id}"+
                         f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
                         f"&el={el}"+
                         f"&sts={sts}"+
                         f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

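# Illustrative call (UxxajLWwzqY is the test video id mentioned below; sts and
# algo would normally come from the deciphered player, so the defaults only
# work for videos whose stream URLs are not scrambled):
#   url, metadata, error, errdetail = get_video_info('UxxajLWwzqY')
#   if error:
#       print(error, errdetail)
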
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

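# The algo string is a space-separated list of operations applied to the 's'
# cipher parameter: 'r' reverses the signature, 's<n>' drops its first n
# characters, and 'w<n>' swaps positions 0 and n. Worked example (made-up
# values, not a real cipher):
#   s = "abcdef", algo = "w2 r s1"
#   w2 -> "cbadef", r -> "fedabc", s1 -> "edabc"
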
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) #only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

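    # e.g. a video referenced by both an infocard and an endcard reduces to the
    # same video_id via getident(), so only the infocard copy stays in allcards;
    # CHANNEL/WEBSITE endcards displace their infocard counterparts the same way.
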
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

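# Sketch of the typical flow from a video id to template-ready metadata
# (assumes get_video_info() returned a parsed player_response for some video_id):
#   url, metadata, error, errdetail = get_video_info(video_id)
#   if metadata is not None:
#       meta = prepare_metadata(metadata)
#       print(meta['title'], meta['aspectr'], len(meta['all_cards']))
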
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

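# Illustrative call (the subreddit names are just examples; the returned
# 'after' token can be fed back in to page through results):
#   videos, before, after = fetch_reddit(['videos', 'youtubehaiku'], sorted_by="top", time="week")
#   more_videos, _, after = fetch_reddit(['videos', 'youtubehaiku'], after=after)
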
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))