# app/common/common.py

import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
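
# Illustrative config sketch: the module requires a [global] section and later
# reads cf['global']['database']; the path value below is just a placeholder.
#   [global]
#   database = /var/lib/yt/videos.sqlite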

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
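
# Illustrative sketch (an assumption, not part of this module): a Flask error
# handler in the frontend could include the collected request data like so:
#   @app.errorhandler(Exception)
#   def internal_error(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"upstream request: {url} {params}")
#       return "internal server error", 500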

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        # TODO: how to handle playlists? (a video should not be deleted if it was only removed from a playlist)
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    channel_id = feed.find('yt:channelId',ns).text \
        if feed.find('yt:channelId',ns) is not None else None
    # for websub/pullsub: if this exists, we're looking at a playlist:
    playlist_id = feed.find('yt:playlistId',ns).text \
        if feed.find('yt:playlistId',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
            # extra fields for playlists:
            'playlist_id': playlist_id,
        })

    return title, author, videos, channel_id, playlist_id
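
# Illustrative usage (the channel id below is a placeholder):
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
# For a playlist feed, pass feed_type "playlist_id" instead.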

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence it is taken from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        # checking that it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # otherwise, it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check if we don't know about it yet and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # the video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
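
# Illustrative usage (assuming `db` is an open sqlite3 connection to the
# configured database):
#   update_channel(db, fetch_xml("channel_id", channel_id))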

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error
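
# Illustrative usage:
#   url, stream_map, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error:  # one of 'player', 'malformed', 'livestream', 'geolocked', 'exhausted' (or None)
#       ...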

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
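
# Worked example (hypothetical algo string, for illustration only):
# algo "w3 r s2" applied to signature "abcdef":
#   w3 -> "dbcaef"  (swap positions 0 and 3)
#   r  -> "feacbd"  (reverse)
#   s2 -> "acbd"    (drop the first two characters)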

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first one takes precedence).
    try: # find the native language captions (assuming there is only 1 audioTrack) (any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'default':i==main_subtitle,
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for i,cc in enumerate(metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[]))
        # sort order: the default language gets weight 0 (first), other manually translated tracks weight 1, autogenerated weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry more
    # information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
    BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
    CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
    ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
    GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
    KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
    ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
    NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
    RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
    SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
    VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
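
# Illustrative usage (the same pattern appears in update_channel and
# store_video_metadata below):
#   _, _, metadata, _, _ = get_video_info(video_id)
#   if metadata:
#       meta = prepare_metadata(metadata)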

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = prepare_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
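
# Illustrative sketch (assumption: two blueprints register the same url rule;
# the first handler can defer to the next matching endpoint):
#   @bp.route('/watch')
#   def watch():
#       if not can_handle_request():       # hypothetical predicate
#           return fallback_route()        # falls through to the next /watch endpoint
#       ...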

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
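
# Illustrative verification sketch (assumption: `signature` is taken from the
# X-Hub-Signature header, which websub formats as "sha1=<hexdigest>"):
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   is_valid = hmac.compare_digest(expected, signature)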

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))