import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since we also make requests from outside the flask appcontext, it is
# wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
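
# A hedged sketch of the error_handler mentioned above (not part of this
# file; 'app' and the log format are illustrative):
#
#   @app.errorhandler(500)
#   def internal_error(e):
#       for url, params, json_body, response in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {response[:200]}")
#       return "internal error", 500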

class G:
    """
    null-coalescing version of dict.get() that also works on lists.

    the | operator is overloaded to achieve similar looking code to jq(1)
    filters. the first found key is used: dict(foo=1)|G('bar','foo') returns 1.
    """
    def __init__(self, *keys):
        self.keys = keys
    def __ror__(self, other):
        for key in self.keys:
            try: return other[key]
            except (TypeError, LookupError): continue # None, or key/index missing
        return None
class _Text:
    """ parses youtube's .runs[].text and .simpleText variants """
    def __ror__(self, other): # Note: only returning runs[0], not concat'ing all!
        return other|G('simpleText') or other|G('runs')|G(0)|G('text')
text = _Text()
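
# Chaining sketch on an innertube-style dict (the 'videoRenderer' value below
# is made up, not real API output):
#
#   item = {'videoRenderer': {'title': {'runs': [{'text': 'a video'}]}}}
#   item|G('videoRenderer')|G('title')|text       # -> 'a video'
#   item|G('gridVideoRenderer')|G('title')|text   # -> None (key missing)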

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy, so find() results must be
    # compared against None explicitly; a plain truthiness check or ternary
    # silently misbehaves on childless elements.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId',ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId',ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id
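
# Minimal usage sketch (the feed id below is a placeholder UC-id):
#
#   xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
#   if xmldata:
#       title, author, videos, channel_id, playlist_id = parse_xml(xmldata)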

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow it.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to an
            # older video). g_v_i gives us the date the video was published to
            # viewers, so we prefer that. But since g_v_i only returns the date
            # without time, we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
    today = datetime.now(timezone.utc).strftime("%Y%m%d")
    # XXX: anticaptcha hasn't been adapted
    key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
    # ANDROID returns streams that are not throttled or cipher-scrambled, but
    # less metadata than WEB. TVHTML5* returns throttled and possibly ciphered
    # streams, but bypasses the age-gate. atm, we don't decipher them.
    # TODO: unscramble TVHTML5* streams (especially &n= throttling)
    client = {
        (False, False): {'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30},
        (False, True):  {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0'},
        (True, False):  {'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01'},
    }[(metaOnly, _agegate_bypass)]
    r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
        'videoId': video_id,
        'context': {
            'client': {
                'gl': 'US',
                'hl': 'en',
                **client,
            },
            'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
        },
        "racyCheckOk": True, # seems to do nothing, cargo-culted
        "contentCheckOk": True, # fix "This video may be inappropriate for some users."
    }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

    if not r or r.status_code == 429:
        return None, None, None, 'banned', 'possible IP ban'

    metadata = r.json()
    if "error" in metadata:
        return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
    playabilityStatus = metadata['playabilityStatus']['status']
    if playabilityStatus != "OK":
        playabilityReason = metadata['playabilityStatus'].get('reason',
            '//'.join(metadata['playabilityStatus'].get('messages',[])))
        player_error = f"{playabilityStatus}: {playabilityReason}"
        if (is_agegated(metadata)
            and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
            and not _agegate_bypass
        ):
            _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
            if error_embed == "player": # agegate bypass failed?
                return None, None, metadata, 'agegated', player_error
            elif not error_embed or error_embed in ('livestream','geolocked','scrambled'):
                metadata = metadata_embed
            else:
                return None, None, metadata, error_embed, errormsg_embed
        else:
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

    # livestreams have no adaptive/muxed formats:
    is_live = metadata['videoDetails'].get('isLive', False)

    if 'formats' not in metadata['streamingData'] and not is_live:
        return None, None, metadata, 'no-url', player_error

    formats = metadata['streamingData'].get('formats',[])
    adaptive = metadata['streamingData'].get('adaptiveFormats',[])
    stream_map = {
        'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
        'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
        'muxed': formats,
        'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
    }

    try:
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
    except (IndexError, KeyError): # no muxed formats, or no direct 'url'
        url = None
        is_geolocked = False

    is_drm = formats and 'signatureCipher' in formats[0]

    nonfatal = 'livestream' if is_live \
        else 'geolocked' if is_geolocked \
        else 'scrambled' if is_drm \
        else None

    return url, stream_map, metadata, nonfatal, None
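
# Caller-side sketch ('some_video_id' is a placeholder):
#
#   url, stream_map, meta, error, errdetails = get_video_info(some_video_id)
#   if error in (None, 'livestream', 'geolocked', 'scrambled'): # non-fatal
#       ... play url / stream_map ...
#   else: # 'banned', 'malformed', 'agegated', 'player', 'no-url', ...
#       ... report error and errdetails ...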

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With the ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use different
    # rounding strategies; meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        # interpret the unix timestamp as UTC, to match the 'Z' suffix:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later):
    except (IndexError, KeyError):
        aspect_ratio = None

    # shorts are <= 60 seconds and vertical or square. if we could determine
    # neither length nor aspect ratio, it's None.
    is_short = (
        None if length is None and aspect_ratio is None else
        True if ((length or 61) <= 60) and ((aspect_ratio or 2) <= 1) else
        False # length > 60 or aspect_ratio > 1
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
        'shorts': is_short,
    }
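
# Worked examples for the shorts heuristic above (made-up values): a 30s
# 608x1080 stream gives aspect_ratio ~0.56 -> shorts=True; a 30s 1920x1080
# stream gives ~1.78 -> shorts=False; a scheduled livestream with no length
# and no streams gives shorts=None.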

def mkthumbs(thumbs):
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = next(iter(sorted(output.keys(), reverse=True, key=int)), None)
    return {**output, 'largest': largest}
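
# e.g. (hypothetical thumbnail list):
#   mkthumbs([{'height': 90, 'url': 'u1'}, {'height': 360, 'url': 'u2'}])
#   # -> {'90': 'u1', '360': 'u2', 'largest': '360'}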

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

    return pinned, hidden

def apply_video_flags(token, rows):
    video_ids = [card['content']['video_id'] for card in rows if 'video_id' in card['content']]
    pinned, hidden = fetch_video_flags(token, video_ids)
    return sorted([
        {'type': v['type'], 'content': {
            **v['content'],
            'pinned': (v['content']['video_id'] in pinned
                       if 'video_id' in v['content'] else False),
        }}
        for v in rows
        if 'video_id' not in v['content'] or v['content']['video_id'] not in hidden
    ], key=lambda v: v['content']['pinned'], reverse=True)
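
# e.g. (hypothetical flag state): with pinned=['b'] and hidden=['c'], rows for
# videos a,b,c come back as [b (pinned=True), a (pinned=False)]; c is dropped
# and b is sorted to the front.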

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
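
# Usage sketch with two views registered on the same rule (e.g. by different
# blueprints; names here are hypothetical):
#
#   @app.route('/watch')
#   def local_watch():
#       if not can_play_locally(): return fallback_route()
#       ...
#
#   @app.route('/watch')
#   def redirect_watch(): # tried when local_watch() falls through
#       ...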

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
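
# Verification sketch for a websub notification ('secret' is whatever was
# sent with the subscription request; the hub signs the raw body and sends
# 'sha1=<hexdigest>' in the X-Hub-Signature header):
#
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#       ... reject with 403 ...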

def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError: # not within the flask appcontext
        pass

def log_unknown_card(data):
    try:
        from flask import request
        source = request.url
    except RuntimeError: # not within a flask request
        source = "unknown"
    with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f:
        f.write(f"\n/***** {source} *****/\n")
        json.dump(data, f, indent=2)