import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # don't block process exit
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in the flask request's g object (url and response). a flask
# error_handler can then include the request data in the error log (a sketch
# follows below). since we also call this code from outside the flask
# appcontext, the access to g is wrapped in a try-except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

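# a minimal sketch of an error handler consuming g.api_requests (the real
# handler lives in the flask app, not in this module; names are illustrative):
#
#   @app.errorhandler(Exception)
#   def internal_error(e):
#       for url, params, json_body, response_text in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {response_text[:200]}")
#       return "internal server error", 500
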
class G:
    """
    null-coalescing version of dict.get() that also works on lists.

    the | operator is overloaded to achieve similar looking code to jq(1)
    filters. the first found key is used: dict(foo=1)|G('bar','foo') returns 1.
    """
    def __init__(self, *keys):
        self.keys = keys
    def __ror__(self, other):
        for key in self.keys:
            try: return other[key]
            except (KeyError, IndexError, TypeError): continue # missing key/index, or other is None
        return None
class _Text:
    """ parses youtube's .runs[].text and .simpleText variants """
    def __ror__(self, other): # Note: only returning runs[0], not concat'ing all!
        return other|G('simpleText') or other|G('runs')|G(0)|G('text')
text = _Text()

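# illustrative example (not part of the module): G and text traverse youtube's
# deeply nested JSON without raising on missing keys:
#
#   >>> j = {'title': {'runs': [{'text': 'cool video'}]}, 'viewCount': {}}
#   >>> j|G('title')|text
#   'cool video'
#   >>> j|G('viewCount')|G('simpleText') # missing key => None, not KeyError
#   >>> j|G('doesNotExist')|G(0)|text    # chains stay None-safe all the way
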
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

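# e.g. fetch_xml("channel_id", "UC...") or fetch_xml("playlist_id", "PL...")
# (illustrative ids); returns the raw atom feed bytes, or None on http errors.
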
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    del_entry = feed.find('at:deleted-entry',ns)
    if del_entry is not None: # Note: elements without children are falsy, so compare against None!
        del_author = del_entry.find('at:by',ns)
        _, _, vid = del_entry.get('ref').rpartition(':')
        _, _, channel_id = del_author.find('atom:uri',ns).text.rpartition('/')
        author = del_author.find('atom:name',ns).text
        entry = [{
            'deleted': True,
            'video_id': vid,
            'channel_id': channel_id,
            'author': author,
        }]
        return None, None, entry, None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    try: channel_id = feed.find('yt:channelId',ns).text
    except AttributeError: channel_id = None # element not present
    # for pullsub: if this exists, we're looking at a playlist:
    try: playlist_id = feed.find('yt:playlistId',ns).text
    except AttributeError: playlist_id = None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

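# parse_xml returns (title, author, videos, channel_id, playlist_id); for
# deletion events, only the videos list is populated, e.g. (illustrative values):
#   (None, None, [{'deleted': True, 'video_id': 'dQw4w9WgXcQ',
#                  'channel_id': 'UC...', 'author': '...'}], None, None)
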
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow them.
            flask_logger(f"ignoring deleted/unlisted video or ended livestream {video['video_id']} by {video['channel_id']} ({video['author']})")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # the video gets uploaded as unlisted on day A and set to public on
            # day B; the webhook is sent on day B, but 'published' says A. The
            # video therefore looks like it's just an update to an older video).
            # g_v_i gives us the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed, and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

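# e.g. is_agegated({'playabilityStatus': {'status': 'CONTENT_CHECK_REQUIRED'}})
# => True (trimmed-down payload for illustration; real responses carry more fields)
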
def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" if metaOnly or _agegate_bypass else "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w"
        # ANDROID returns streams that are not throttled or cipher-scrambled, but less metadata than WEB.
        # TVHTML5* returns throttled and possibly ciphered streams, but bypasses the age-gate. atm, we don't decipher them.
        # TODO: unscramble TVHTML5* streams (especially &n= throttling)
        client = {
            (False, False): { 'clientName': 'ANDROID', 'clientVersion': '18.11.34', 'androidSdkVersion': 30 },
            (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
            (True, False):  { 'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01' },
        }[(metaOnly, _agegate_bypass)]
        r = requests.post("https://youtubei.googleapis.com/youtubei/v1/player", params={'key': key}, json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    **client,
                },
                'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
            },
            "racyCheckOk": True,      # seems to do nothing, cargo-culted
            "contentCheckOk": True,   # fix "This video may be inappropriate for some users."
            "params": "CgIQBg%3D%3D", # otherwise googlevideo URLs become 403/Forbidden after a few accesses (breaks buffering/scrubbing)
        }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/18.11.34 (Linux; U; Android 11) gzip"})

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        real_vid = metadata.get("videoDetails", {}).get("videoId")
        if video_id != real_vid and real_vid in ("M5t4UHllkUM", "aQvGIIdgFDM"):
            # youtube redirected us to a clip called "Video Not Available". this indicates a long-term ip ban.
            return None, None, {}, "banned", "instance is probably ip banned"
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if (is_agegated(metadata)
                and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
                and not _agegate_bypass
            ):
                _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
                if error_embed == "player": # agegate bypass failed?
                    return None, None, metadata, 'agegated', player_error
                elif not error_embed or error_embed in ('livestream','geolocked','scrambled','throttled'):
                    metadata = metadata_embed
                else:
                    return None, None, metadata, error_embed, errormsg_embed
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        stream_map = {
            'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
            'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        try:
            url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

            query = parse_qs(urlparse(url).query)
            # ip-locked videos can be recovered if the proxy module is loaded:
            is_geolocked = 'gcr' in query
            # "n-signature" requires javascript descrambling (not implemented):
            is_throttled = 'ns' in query
        except (IndexError, KeyError): # no muxed formats, or formats without height/url
            url = None
            is_geolocked = False
            is_throttled = False

        is_drm = formats and 'signatureCipher' in formats[0]

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else 'scrambled' if is_drm \
            else 'throttled' if is_throttled \
            else None

        return url, stream_map, metadata, nonfatal, None

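# illustrative use of the returned tuple (the real callers live elsewhere):
#
#   url, stream_map, meta, error, errdetails = get_video_info('dQw4w9WgXcQ')
#   if error in (None, 'livestream', 'geolocked', 'scrambled', 'throttled'):
#       pass # non-fatal: metadata (and possibly url) are usable
#   else:
#       pass # 'banned', 'malformed', 'player', 'agegated', 'no-url': no stream
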
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With the ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use a
    # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    views = int(meta1['viewCount']) if 'viewCount' in meta1 else None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ") # UTC, to match the 'Z' suffix
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        meta2.get('publishDate','1970-01-01T00:00:00Z')
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later)
    except (KeyError, IndexError):
        aspect_ratio = None

    is_livestream = meta1['isLiveContent']
    is_premiere = meta1.get('isUpcoming', False) and not is_livestream
    # shorts are <= 60 seconds and vertical or square. they can't be premieres
    # or livestreams. if we were unable to determine it, we set it to None.
    is_short = (
        True if (length or 61) <= 60 and (aspect_ratio or 2) <= 1 else
        False if (length or 0) > 60 or (aspect_ratio or 0) > 1 else
        None if not is_premiere and not is_livestream else False
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': views,
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': is_livestream,
        'premiere': is_premiere,
        'shorts': is_short,
    }

def mkthumbs(thumbs):
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = next(iter(sorted(output.keys(), reverse=True, key=int)), None)
    return {**output, 'largest': largest}

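# e.g. mkthumbs([{'height': 90, 'url': 'a.jpg'}, {'height': 360, 'url': 'b.jpg'}])
# => {'90': 'a.jpg', '360': 'b.jpg', 'largest': '360'}
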
def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

    return pinned, hidden

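# e.g. pinned, hidden = fetch_video_flags(user_token, ['abc123xyz00'])
# (illustrative token/ids; 'pinned' and 'hidden' are the two display values
# this code cares about)
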
def apply_video_flags(token, rows, settings={}):
    video_ids = [card['content']['video_id'] for card in rows if 'video_id' in card['content']]
    pinned, hidden = fetch_video_flags(token, video_ids)
    noshorts = settings.get('noshorts') or False
    return sorted([
        {'type': v['type'], 'content': {**v['content'], 'pinned': v['content']['video_id'] in pinned if 'video_id' in v['content'] else False}}
        for v in rows
        if (
            'video_id' not in v['content'] or v['content']['video_id'] not in hidden
        ) and (
            not (noshorts and v['content'].get('shorts'))
        )
    ], key=lambda v: v['content']['pinned'], reverse=True)

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

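# illustrative use, assuming two blueprints register the same rule (names are
# hypothetical; the real routes live in the frontend modules):
#
#   @bp1.route('/watch')
#   def watch():
#       if not have_data: return fallback_route()
#       ...
#   @bp2.route('/watch') # tried next, in url_map order
#   def watch_fallback():
#       ...
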
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

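# illustrative verification of a websub callback body (header name and
# "sha1=" prefix per the websub spec; compare_digest avoids timing side
# channels; removeprefix needs python >= 3.9):
#
#   theirs = request.headers.get('X-Hub-Signature', '').removeprefix('sha1=')
#   ours = websub_body_hmac(secret_key, request.get_data())
#   ok = hmac.compare_digest(ours, theirs)
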
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError:
        pass # not within a flask app context

def log_unknown_card(data):
    import json
    try:
        from flask import request
        source = request.url
    except RuntimeError: source = "unknown" # not within a flask request context
    with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f:
        f.write(f"\n/***** {source} *****/\n")
        json.dump(data, f, indent=2)