[subscriptionfeed.git] / app / common / common.py
1 import os
2 import re
3 import json
4 import base64
5 import sqlite3
6 import requests
7 import hmac, hashlib
8 import requests_cache
9 import dateutil.parser
10 from xml.etree import ElementTree
11 from configparser import ConfigParser
12 from datetime import datetime, timezone
13 from urllib.parse import parse_qs, urlparse
14
15 from .innertube import parse_infocard, parse_endcard
16
17 cf = ConfigParser()
18 config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
19 cf.read(config_filename)
20 if 'global' not in cf: # todo: full config check
21 raise Exception("Configuration file not found or empty")
22
23 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but caching that long makes the reddit feed very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
24 requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
25
26 # Note: this should only be required for the 'memory' backed cache.
27 # TODO: only run for long-running processes, i.e. the frontend
28 from threading import Timer
29 def purge_cache(sec):
30 requests_cache.remove_expired_responses()
31 t = Timer(sec, purge_cache, args=(sec,))
32 t.daemon = True # setDaemon() is deprecated in favour of the daemon attribute
33 t.start()
34 purge_cache(10*60)
35
36 # for debugging purposes, monkey-patch the requests session to store each requests-request (url, params and response) in the flask request's g object; a flask error_handler can then include the request data in the error log.
37 # since this code also runs outside the flask appcontext (e.g. from utils.py), the g access is wrapped in a try-except block.
38 from flask import g
39 import requests
40 from requests import Session as OriginalSession
41 class _NSASession(OriginalSession):
42 def request(self, method, url, params=None, data=None, **kwargs):
43 response = super(_NSASession, self).request(
44 method, url, params, data, **kwargs
45 )
46 try:
47 if 'api_requests' not in g:
48 g.api_requests = []
49 g.api_requests.append((url, params, response.text))
50 except RuntimeError: pass # not within flask (e.g. utils.py)
51 return response
52 requests.Session = requests.sessions.Session = _NSASession
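# (illustration only, not part of this file: the error_handler referenced above
#  lives in the frontend app; the handler below is a hypothetical sketch of how
#  the collected g.api_requests could be dumped into the error log.)
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           current_app.logger.error(f"{url} {params}: {response_text[:200]}")
#       return "internal server error", 500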
53
54 def fetch_xml(feed_type, feed_id):
55 # TODO: handle requests.exceptions.ConnectionError
56 r = requests.get("https://www.youtube.com/feeds/videos.xml", {
57 feed_type: feed_id,
58 })
59 if not r.ok:
60 return None
61
62 return r.content
63
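# (for reference, a heavily trimmed sketch of the Atom feed that parse_xml below
#  consumes; the element names match the namespaces declared in the function,
#  the ids and values are placeholders, not real data:)
#
#   <feed xmlns="http://www.w3.org/2005/Atom"
#         xmlns:yt="http://www.youtube.com/xml/schemas/2015">
#     <title>Some Channel</title>
#     <yt:channelId>UCxxxxxxxxxxxxxxxxxxxxxx</yt:channelId>
#     <entry>
#       <yt:videoId>XXXXXXXXXXX</yt:videoId>
#       <yt:channelId>UCxxxxxxxxxxxxxxxxxxxxxx</yt:channelId>
#       <title>Some Video</title>
#       <published>2021-01-01T00:00:00+00:00</published>
#       <updated>2021-01-01T00:00:05+00:00</updated>
#       <author><name>Some Channel</name></author>
#     </entry>
#   </feed>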
64 def parse_xml(xmldata):
65 ns = {
66 'atom':"http://www.w3.org/2005/Atom",
67 'yt': "http://www.youtube.com/xml/schemas/2015",
68 'media':"http://search.yahoo.com/mrss/",
69 'at': "http://purl.org/atompub/tombstones/1.0",
70 }
71
72 feed = ElementTree.fromstring(xmldata)
73
74 if feed.find('at:deleted-entry',ns) is not None: # note: Element truthiness tests children, not presence
75 (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
76 return None, None, [{'deleted': True, 'video_id': vid}], None, None
77
78 title = feed.find('atom:title',ns).text
79 author = feed.find('atom:author/atom:name',ns).text \
80 if feed.find('atom:author',ns) is not None else None
81 # for /user/<> endpoint: find out UC-id:
82 # for playlists: this is who created the playlist:
83 try: channel_id = feed.find('yt:channelId',ns).text
84 except AttributeError: channel_id = None # element missing; a plain ternary fails because Element truthiness tests children, not presence
85 # for pullsub: if this exists, we're looking at a playlist:
86 try: playlist_id = feed.find('yt:playlistId',ns).text
87 except AttributeError: playlist_id = None # as above
88 videos = []
89 for entry in feed.findall('atom:entry',ns):
90 videos.append({
91 'video_id': entry.find('yt:videoId',ns).text,
92 'title': entry.find('atom:title',ns).text,
93 'published': entry.find('atom:published',ns).text,
94 'channel_id': entry.find('yt:channelId',ns).text,
95 'author': entry.find('atom:author',ns).find('atom:name',ns).text,
96 # extra fields for pull_subs/webhook:
97 'updated': entry.find('atom:updated',ns).text,
98 })
99
100 return title, author, videos, channel_id, playlist_id
101
102 def update_channel(db, xmldata, from_webhook=False):
103 if not xmldata: return False
104
105 # Note: websub does not return the global author, so we take it from the first video
106 title, author, videos, channel, playlist = parse_xml(xmldata)
107
108 c = db.cursor()
109 from flask import current_app # XXX: remove
110 for i, video in enumerate(videos):
111 if video.get('deleted'):
112 if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
113 # TODO: enable once we enforce hmac validation:
114 #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
115 break
116
117 now = datetime.now(timezone.utc)
118 updated = dateutil.parser.parse(video['updated'])
119 published = dateutil.parser.parse(video['published'])
120 # if the updated and published times are near-identical, we assume it's new.
121 # checking that it was posted this week is necessary during xmlfeed pulling.
122 if (updated - published).total_seconds() < 60 and (now - published).days < 7:
123 timestamp = now
124 if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
125 else: # it might just be an update to an older video, or a previously unlisted one.
126 # first, assume it's an older video (correct when pulling xmlfeeds)
127 timestamp = published
128 # then, check if we don't know about it and if so, look up the real date.
129
130 # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
131 # a video gets uploaded as unlisted on day A and set to public on day B;
132 # the webhook is sent on day B, but 'published' says A. The video
133 # therefore looks like it's just an update to an older video). If
134 # that's the case, we call get_video_info and double-check.
135 # We only need to do this for videos not yet in the database.
136 c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
137 new_video = len(c.fetchall()) < 1
138 if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
139 if from_webhook and new_video:
140 if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
141 _, _, meta, _, _ = get_video_info(video['video_id'])
142 if meta:
143 meta = video_metadata(meta)
144 published = dateutil.parser.parse(meta['published'])
145 if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
146 if (now - published).days < 7:
147 timestamp = now
148 else: # it's just an update to an older video.
149 timestamp = published
150
151 c.execute("""
152 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
153 VALUES (?, ?, ?, datetime(?), datetime(?))
154 """, (
155 video['video_id'],
156 video['channel_id'],
157 video['title'],
158 video['published'],
159 timestamp
160 ))
161
162 # for channels, this is obviously always the same, but playlists can
163 # consist of videos from different channels:
164 if i == 0 or playlist:
165 c.execute("""
166 INSERT OR REPLACE INTO channels (id, name)
167 VALUES (?, ?)
168 """, (video['channel_id'], video['author']))
169
170 # keep track of which videos are in a playlist, so we can show the user
171 # why a video is in their feed:
172 if playlist:
173 c.execute("""
174 INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
175 VALUES (?, ?)
176 """, (video['video_id'], playlist))
177
178 if playlist and not from_webhook: # Note: playlists can't get updated via websub
179 c.execute("""
180 INSERT OR REPLACE INTO playlists (id, name, author)
181 VALUES (?, ?, ?)
182 """, (playlist, title, channel))
183 c.execute("""
184 INSERT OR REPLACE INTO channels (id, name)
185 VALUES (?, ?)
186 """, (channel, author))
187
188 db.commit()
189
190 return True
191
192 def get_video_info(video_id, sts=0, algo=""):
193 """
194 returns: best-quality muxed video stream, stream map, player_response, error-type/message
195 error types: player, malformed, livestream, geolocked, exhausted
196 """
197 player_error = None # for 'exhausted'
198 for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
199 r = requests.get("https://www.youtube.com/get_video_info", {
200 "video_id": video_id,
201 "eurl": f"https://youtube.googleapis.com/v/{video_id}",
202 "el": el,
203 "sts": sts,
204 "hl": "en_US",
205 })
206 params = parse_qs(r.text)
207 if 'errorcode' in params: # status=fail
208 return None, None, None, 'malformed', params['reason'][0]
209
210 metadata = json.loads(params.get('player_response')[0])
211 playabilityStatus = metadata['playabilityStatus']['status']
212 if playabilityStatus != "OK":
213 playabilityReason = metadata['playabilityStatus'].get('reason',
214 '//'.join(metadata['playabilityStatus'].get('messages',[])))
215 player_error = f"{playabilityStatus}: {playabilityReason}"
216 if playabilityStatus == "UNPLAYABLE":
217 continue # try again with next el value (or fail as exhausted)
218 # without videoDetails, there's only the error message
219 maybe_metadata = metadata if 'videoDetails' in metadata else None
220 return None, None, maybe_metadata, 'player', player_error
221 if metadata['videoDetails'].get('isLive', False):
222 return None, None, metadata, 'livestream', None
223
224 if 'formats' not in metadata['streamingData']:
225 continue # no urls
226
227 formats = metadata['streamingData']['formats']
228 for (i,v) in enumerate(formats):
229 if not ('cipher' in v or 'signatureCipher' in v): continue
230 cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
231 formats[i]['url'] = unscramble(cipher, algo)
232
233 adaptive = metadata['streamingData']['adaptiveFormats']
234 for (i,v) in enumerate(adaptive):
235 if not ('cipher' in v or 'signatureCipher' in v): continue
236 cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
237 adaptive[i]['url'] = unscramble(cipher, algo)
238
239 stream_map = {'adaptive': adaptive, 'muxed': formats}
240
241 # todo: check if we have urls or try again
242 url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']
243
244 # ip-locked videos can be recovered if the proxy module is loaded:
245 is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None
246
247 return url, stream_map, metadata, is_geolocked, None
248 else:
249 return None, None, metadata, 'exhausted', player_error
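# (illustrative call, not from this file; callers unpack the five return values
#  and branch on the error types listed in the docstring:)
#
#   url, stream_map, metadata, error, errdetail = get_video_info(video_id)
#   if error == 'livestream': ...   # no muxed url, but metadata is usable
#   elif error in ('malformed', 'player', 'exhausted'): ...  # show errdetail
#   elif error == 'geolocked': ...  # url present, may need the proxy module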
250
251 def unscramble(cipher, algo): # test video id: UxxajLWwzqY
252 signature = list(cipher['s'][0])
253 for c in algo.split():
254 m = re.match(r"([rsw])(\d+)?", c)
255 if not m: continue # skip unknown tokens (previously a failed match raised AttributeError before the 'if not op' guard)
256 op, ix = m.group(1), int(m.group(2) or 0) % len(signature)
257 if op == 'r': signature = list(reversed(signature))
258 if op == 's': signature = signature[ix:]
259 if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
260 sp = cipher.get('sp', ['signature'])[0]
261 sig = cipher.get('sig', [''.join(signature)])[0]
262 return f"{cipher['url'][0]}&{sp}={sig}"
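# (worked example with a made-up algo string, presumably such strings are derived
#  from the player javascript by the caller; this only illustrates the r/s/w ops:)
#
#   unscramble({'s': ['abcdefg'], 'url': ['https://example.com/v']}, "r w2 s1")
#   r:  'abcdefg' -> 'gfedcba'   (reverse)
#   w2: 'gfedcba' -> 'efgdcba'   (swap positions 0 and 2)
#   s1: 'efgdcba' -> 'fgdcba'    (drop the first character)
#   => 'https://example.com/v&signature=fgdcba'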
263
264 def video_metadata(metadata):
265 if not metadata:
266 return {}
267
268 meta1 = metadata['videoDetails']
269 meta2 = metadata['microformat']['playerMicroformatRenderer']
270
271 published_at = meta2.get('liveBroadcastDetails',{}) \
272 .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")
273
274 return {
275 'title': meta1['title'],
276 'author': meta1['author'],
277 'channel_id': meta1['channelId'],
278 'published': published_at,
279 'views': int(meta1['viewCount']),
280 'length': int(meta1['lengthSeconds']),
281 }
282
283 def prepare_metadata(metadata):
284 meta1 = metadata['videoDetails']
285 meta2 = metadata['microformat']['playerMicroformatRenderer']
286 cards = metadata['cards']['cardCollectionRenderer']['cards'] \
287 if 'cards' in metadata else []
288 endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
289 if 'endscreen' in metadata else []
290
291 # the actual video streams have exact information:
292 try:
293 sd = metadata['streamingData']
294 some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
295 aspect_ratio = some_stream['width'] / some_stream['height']
296 # if that's unavailable (e.g. on livestreams), fall back to
297 # thumbnails (only either 4:3 or 16:9).
298 except (KeyError, IndexError):
299 some_img = meta2['thumbnail']['thumbnails'][0]
300 aspect_ratio = some_img['width'] / some_img['height']
301
302 # Note: we could get subtitles in multiple formats directly by querying
303 # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
304 # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
305 # but that won't give us autogenerated subtitles (and is an extra request).
306 # we can still add &fmt= to the extracted URLs below (first one takes precedence).
307 try: # find the native language captions (assuming there is only 1 audioTrack) (any level might not exist):
308 default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
309 main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
310 except (KeyError, IndexError):
311 main_subtitle = -1
312 subtitles = sorted([
313 {'url':cc['baseUrl'],
314 'code':cc['languageCode'],
315 'autogenerated':cc.get('kind')=="asr",
316 'name':cc['name']['simpleText'],
317 'default':i==main_subtitle,
318 'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
319 for i,cc in enumerate(metadata.get('captions',{})
320 .get('playerCaptionsTracklistRenderer',{})
321 .get('captionTracks',[]))
322 # sort order: default lang gets weight 0 (first), other manually translated weight 1, autogenerated weight 2:
323 ], key=lambda cc: (not cc['default']) + cc['autogenerated'])
324
325 infocards = [parse_infocard(card) for card in cards]
326 endcards = [parse_endcard(card) for card in endsc]
327 # combine cards to weed out duplicates. for videos and playlists prefer
328 # infocards, for channels and websites prefer endcards, as those carry
329 # the more detailed information.
330 # if the card type is not in ident, we use the whole card for comparison
331 # (otherwise they'd all replace each other)
332 ident = { # ctype -> ident
333 'VIDEO': 'video_id',
334 'PLAYLIST': 'playlist_id',
335 'CHANNEL': 'channel_id',
336 'WEBSITE': 'url',
337 'POLL': 'question',
338 }
339 getident = lambda c: c['content'].get(ident.get(c['type']), c)
340 mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
341 exclude = lambda cards, without: [c for c in cards if getident(c) not in without]
342
343 allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
344 exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
345
346 all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
347 BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
348 CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
349 ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
350 GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
351 KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
352 ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
353 NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
354 RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
355 SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
356 VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
357 whitelisted = sorted(meta2.get('availableCountries',[]))
358 blacklisted = sorted(set(all_countries) - set(whitelisted))
359
360 return {
361 **video_metadata(metadata),
362 'description': meta1['shortDescription'],
363 'rating': meta1['averageRating'],
364 'category': meta2['category'],
365 'aspectr': aspect_ratio,
366 'unlisted': meta2['isUnlisted'],
367 'whitelisted': whitelisted,
368 'blacklisted': blacklisted,
369 'poster': meta2['thumbnail']['thumbnails'][0]['url'],
370 'infocards': infocards,
371 'endcards': endcards,
372 'all_cards': allcards,
373 'subtitles': subtitles,
374 }
375
376 def store_video_metadata(video_id):
377 # check if we know about it, and if not, fetch and store video metadata
378 with sqlite3.connect(cf['global']['database']) as conn:
379 c = conn.cursor()
380 c.execute("SELECT 1 from videos where id = ?", (video_id,))
381 new_video = len(c.fetchall()) < 1
382 if new_video:
383 _, _, meta, _, _ = get_video_info(video_id)
384 if meta:
385 meta = video_metadata(meta)
386 c.execute("""
387 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
388 VALUES (?, ?, ?, datetime(?), datetime(?))
389 """, (
390 video_id,
391 meta['channel_id'],
392 meta['title'],
393 meta['published'],
394 meta['published'],
395 ))
396 c.execute("""
397 INSERT OR REPLACE INTO channels (id, name)
398 VALUES (?, ?)
399 """, (meta['channel_id'], meta['author']))
400
401 from werkzeug.exceptions import NotFound
402 class NoFallbackException(NotFound): pass
403 def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
404 """
405 finds the next route that matches the current url rule, and executes it.
406 args, kwargs: pass all arguments of the current route
407 """
408 from flask import current_app, request, g
409
410 # build a list of endpoints that match the current request's url rule:
411 matching = [
412 rule.endpoint
413 for rule in current_app.url_map.iter_rules()
414 if rule.rule == request.url_rule.rule
415 ]
416 current = matching.index(request.endpoint)
417
418 # since we can't change request.endpoint, we always get the original
419 # endpoint back. so for repeated fall-throughs, we use the g object to
420 # count how many levels we want to fall through.
421 if '_fallback_next' not in g:
422 g._fallback_next = 0
423 g._fallback_next += 1
424
425 next_ep = current + g._fallback_next
426
427 if next_ep < len(matching):
428 return current_app.view_functions[matching[next_ep]](*args, **kwargs)
429 else:
430 raise NoFallbackException
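# (usage sketch; the decorated views below are hypothetical — the real consumers
#  are the frontend routes. two views are registered on the same rule, and the
#  first defers to the second by calling fallback_route:)
#
#   @app.route('/watch')
#   def watch_from_cache():
#       return cached_page() or fallback_route()
#
#   @app.route('/watch')
#   def watch_live():
#       ...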
431
432 def websub_url_hmac(key, feed_id, timestamp, nonce):
433 """ generate sha1 hmac, as required by websub/pubsubhubbub """
434 sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
435 return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()
436
437 def websub_body_hmac(key, body):
438 return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
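# (sketch of the intended verification of incoming websub POSTs, assuming the hub
#  sends the usual "X-Hub-Signature: sha1=<hexdigest>" header; the view context
#  and 'secret' below are illustrative only:)
#
#   sent = request.headers.get('X-Hub-Signature', '')
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, sent):
#       abort(403)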
439
440 def pp(*args): # debugging helper: pretty-print args to stderr (utf-8 safe)
441 from pprint import pprint
442 import sys, codecs
443 pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))