import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m,
# but a longer expiry makes reddit very stale and premiere videos won't start.
# TODO: expire depending on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey-patch the requests Session so that every outgoing
# request (url, params, response body) is recorded in the Flask request's g object.
# A Flask error_handler can then include that request data in the error log.
# Since this module is also used outside a Flask app context (e.g. utils.py), the
# access to g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

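# Example (illustrative, not used in this module): a frontend Flask app could surface
# the recorded requests in its error handler; 'app' stands for whatever Flask instance
# registers the handler:
#
#   @app.errorhandler(500)
#   def internal_error(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error("%s %s -> %s", url, params, body[:200])
#       return "internal server error", 500
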
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Element truthiness depends on the number of children, so compare
    # find() results against None explicitly.
    deleted_entry = feed.find('at:deleted-entry',ns)
    if deleted_entry is not None:
        (_,_,vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

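# Example (illustrative): pulling and parsing a channel feed; the feed endpoint also
# accepts 'playlist_id' or 'user' as feed_type:
#
#   xmldata = fetch_xml('channel_id', 'UC...')
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)
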
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        # checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # otherwise it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check if we don't know about it yet, and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

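# Example (illustrative): how a subscription puller might refresh one channel:
#
#   with sqlite3.connect(cf['global']['database']) as db:
#       update_channel(db, fetch_xml('channel_id', channel_id))
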
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed stream url, stream map, player_response, error type, error detail
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else: # all 'el' values were tried and failed
        return None, None, metadata, 'exhausted', player_error

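# Example (illustrative): callers unpack the 5-tuple and branch on the error type:
#
#   url, stream_map, metadata, error, errdetails = get_video_info(video_id, sts, algo)
#   if error is None or error == 'geolocked':
#       pass # playable (geolocked videos only work through the proxy module)
#   else:
#       pass # 'malformed', 'player', 'livestream' or 'exhausted'; see errdetails
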
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if not m: continue # skip unknown operations
        op, ix = m.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

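# Example (illustrative): 'algo' is a space-separated list of operations on the
# scrambled 's' parameter: 'r' reverses it, 's<n>' drops the first n characters,
# 'w<n>' swaps characters 0 and n. With made-up inputs:
#
#   unscramble({'s': ['ABCDEFG'], 'url': ['https://host.invalid/video']}, "r w3 s2")
#   # -> 'https://host.invalid/video&signature=EGCBA'
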
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first one takes precedence).
    try: # find the native-language captions (assuming there is only 1 audioTrack; any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'default':i==main_subtitle,
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for i,cc in enumerate(metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[]))
        # sort order: the default language gets weight 0 (first), other manually translated ones weight 1, autogenerated ones weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the others.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO 8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = prepare_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

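# Example (illustrative): two blueprints registering the same rule; the first view
# hands off to the next matching one by returning fallback_route() (names below are
# made up):
#
#   @bp_primary.route('/watch')
#   def watch():
#       if not can_handle(request.args.get('v')):
#           return fallback_route()
#       ...
#
#   @bp_generic.route('/watch')
#   def watch_fallback():
#       ...
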
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

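# Example (illustrative): verifying an incoming websub notification in the webhook
# view; websub sends the body signature as 'X-Hub-Signature: sha1=<hexdigest>':
#
#   expected = websub_body_hmac(secret, request.get_data())
#   received = request.headers.get('X-Hub-Signature', '').replace('sha1=', '')
#   if not hmac.compare_digest(expected, received):
#       return '', 400
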
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))