# subscriptionfeed.git: app/common/common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
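    """
    Fetches the YouTube Atom feed for the given feed ('channel_id' or
    'playlist_id' plus the corresponding id). Returns the raw XML text,
    or None on a failed request.
    """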
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
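    """
    Parses a YouTube Atom feed and returns (title, author, videos); videos is
    a list of dicts with video_id, title, published, channel_id, author and
    updated keys.
    """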
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata):
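    """
    Inserts the videos of a parsed feed (and the channel itself) into the
    database. Returns False if xmldata is empty, True otherwise.
    """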
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a new video.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
            f"?video_id={video_id}"+
            f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
            f"&el={el}"+
            f"&sts={sts}"+
            f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                metadata['videoDetails'].get('isPostLiveDvr', False):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
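    # 'algo' is a space-separated list of descrambling steps, e.g. "r s2 w24"
    # (illustrative string, not a real algorithm): 'r' reverses the signature,
    # 's<n>' drops the first n characters, 'w<n>' swaps characters 0 and n.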
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
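    """
    Flattens the player_response metadata into a single dict: basic video
    details, aspect ratio, subtitle tracks, info-/endcards and country
    restrictions.
    """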
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
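        """
        Normalizes one infocard into {'type': ..., 'content': ...}; handled
        types are POLL, VIDEO, PLAYLIST, WEBSITE and CHANNEL, everything else
        becomes an error entry.
        """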
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the others.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2['availableCountries'])
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, params=[], count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    returns a tuple of ([{video}], before, after)
    """
    # TODO support /r/videos/top/?t=week
    # TODO support ?limit=100

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in [('count',count), ('before',before), ('after',after), *params] if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}.json?{query}", headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

def pp(*args):
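    """Debug helper: pretty-prints its arguments to stderr as UTF-8."""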
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))
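
# Minimal usage sketch (not part of the original module; the channel id below
# is a placeholder): fetch a channel's Atom feed and pretty-print the result.
if __name__ == '__main__':
    xml = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
    if xml is not None:
        title, author, videos = parse_xml(xml)
        pp(title, author, videos[:3])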