import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo URLs are valid for 5h59m,
# but caching that long makes the reddit feed very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.setDaemon(True)
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

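# Illustrative sketch of how fetch_xml/parse_xml compose; the channel id below is
# made up, and the exact field values depend on what YouTube's Atom feed returns:
#   xml = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   if xml:
#       title, author, videos = parse_xml(xml)
#       # videos[0] -> {'video_id': ..., 'title': ..., 'published': ...,
#       #               'channel_id': ..., 'author': ..., 'updated': ...}
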
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence taking it from the first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a new video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

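# For reference only: a minimal SQLite schema this function could run against. The
# real schema lives elsewhere in the repo; this is an assumption inferred from the
# INSERT statements above:
#   CREATE TABLE IF NOT EXISTS channels (
#       id      TEXT PRIMARY KEY,
#       name    TEXT
#   );
#   CREATE TABLE IF NOT EXISTS videos (
#       id          TEXT PRIMARY KEY,
#       channel_id  TEXT,
#       title       TEXT,
#       published   DATETIME,
#       crawled     DATETIME
#   );
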
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
            f"?video_id={video_id}"+
            f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
            f"&el={el}"+
            f"&sts={sts}"+
            f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if not 'formats' in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

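# Illustrative sketch of how a caller might dispatch on the error type. The sts/algo
# values normally come from the scraped player JS; the bare defaults used here would
# only work for videos whose signatures are not scrambled:
#   url, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error in (None, 'livestream', 'geolocked'):
#       ...  # metadata is usable, e.g. for prepare_metadata(metadata)
#   else:
#       ...  # 'malformed', 'player' or 'exhausted': surface errdetail to the user
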
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

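# Worked example of the algo mini-language (r = reverse, sN = slice off N chars,
# wN = swap first char with position N), using a made-up signature and sequence
# rather than a real player's:
#   cipher = {'s': ['abcdef'], 'url': ['https://example.invalid/video']}
#   unscramble(cipher, "r s2 w1")
#   # 'abcdef' -r-> 'fedcba' -s2-> 'dcba' -w1-> 'cdba'
#   # => 'https://example.invalid/video&signature=cdba'
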
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

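    # Illustrative only, showing the intended use of the three helpers on the kind
    # of strings YouTube returns (the values themselves are made up):
    #   delL("by Some Channel")  -> "Some Channel"
    #   delR("123,456 views")    -> "123,456"
    #   intT("123,456")          -> 123456
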
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

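    # Illustrative only, with made-up cards: if the same video shows up both as an
    # infocard and as an endcard, only the infocard survives in allcards, because
    # exclude() drops endcards whose video_id already appears among the infocards:
    #   infocards = [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    #   endcards  = [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    #   -> allcards == infocards
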
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2['availableCountries'])
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid URL (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

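# Illustrative sketch of paging through a multireddit; the subreddit names are made
# up, and the before/after tokens are the ones reddit's listing API hands back:
#   videos, before, after = fetch_reddit(['videos', 'youtubehaiku'], limit=10)
#   if after:  # fetch the next page
#       more, _, after = fetch_reddit(['videos', 'youtubehaiku'], limit=10, after=after)
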
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))