]> git.gir.st - subscriptionfeed.git/blob - app/common/common.py
subscription feed: filter shorts if the user enabled the 'noshorts' setting
[subscriptionfeed.git] / app / common / common.py
1 import os
2 import re
3 import json
4 import base64
5 import sqlite3
6 import requests
7 import hmac, hashlib
8 import requests_cache
9 import dateutil.parser
10 from xml.etree import ElementTree
11 from configparser import ConfigParser
12 from datetime import datetime, timezone
13 from urllib.parse import parse_qs, urlparse
14
# load the configuration; the YT_CONFIG environment variable overrides the
# default path. Read at import time, so every module sees the same config.
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    # ConfigParser.read() silently ignores missing files, so the only
    # indication of a bad path is the missing [global] section.
    raise Exception("Configuration file not found or empty")
20
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))
23
# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    """Drop expired responses from the requests cache, then re-arm a timer
    to run again in `sec` seconds (a self-rescheduling periodic task)."""
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    # daemonize so the timer never blocks interpreter shutdown.
    # (Thread.setDaemon() is deprecated since python 3.10; assign the
    # .daemon attribute instead.)
    t.daemon = True
    t.start()
purge_cache(10*60)
34
# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    # records (url, params, json-body, response-text) of every request made
    # through this session into flask.g.api_requests for later inspection.
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            # g is only usable inside a flask application context; the
            # first request of a flask-request creates the list.
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
# make every consumer of requests (incl. requests_cache) use our subclass:
requests.Session = requests.sessions.Session = _NSASession
52
def fetch_xml(feed_type, feed_id):
    """Download the atom feed for one channel/playlist/user.

    feed_type: the query parameter name ('channel_id', 'playlist_id', ...)
    feed_id:   its value
    Returns the raw response body (bytes), or None on any HTTP error status.
    """
    # TODO: handle requests.exceptions.ConnectionError
    response = requests.get(
        "https://www.youtube.com/feeds/videos.xml",
        {feed_type: feed_id},
    )
    if response.ok:
        return response.content
    return None
62
def parse_xml(xmldata):
    """Parse a youtube atom feed (or websub tombstone) into plain values.

    Returns a 5-tuple (title, author, videos, channel_id, playlist_id).
    For tombstones (deleted/unlisted videos), videos is a single-element
    list [{'deleted': True, 'video_id': ...}] and all other fields are None.
    """
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: an ElementTree Element with no children evaluates as falsy, so
    # presence checks must use 'is not None'. (That is also why the old
    # ternaries "did not work" and a childless tombstone could be missed.)
    deleted = feed.find('at:deleted-entry',ns)
    if deleted is not None:
        # ref is of the form "yt:video:<id>"; keep only the id part:
        (_,_,vid) = deleted.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author_name = feed.find('atom:author/atom:name',ns)
    author = author_name.text if author_name is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    channel_elem = feed.find('yt:channelId',ns)
    channel_id = channel_elem.text if channel_elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_elem = feed.find('yt:playlistId',ns)
    playlist_id = playlist_elem.text if playlist_elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id
100
def update_channel(db, xmldata, from_webhook=False):
    """Store all videos of a feed (and their channel/playlist rows) in the db.

    db: an open sqlite3 connection (committed at the end).
    xmldata: raw atom feed as returned by fetch_xml(); falsy -> returns False.
    from_webhook: True when called from the websub push endpoint.
    Returns True once the feed has been processed.
    """
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow it.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        # only fetch extra metadata for videos we have not seen before:
        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
            # video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # g_v_i gives is the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            # fallback values if the metadata fetch failed:
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees it at the top of the feed, and it doesn't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else:#, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
208
def is_agegated(metadata):
    """Return True if the player response says the video is age-restricted."""
    status = metadata['playabilityStatus']
    if status.get("status") == "CONTENT_CHECK_REQUIRED":
        return True
    return bool(status.get("desktopLegacyAgeGateReason"))
215
def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # user-supplied captcha cookies, sent along with the player request:
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
    today = datetime.now(timezone.utc).strftime("%Y%m%d")
    # XXX: anticaptcha hasn't been adapted
    key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
    # ANDROID returns streams that are not throttled or cipher-scambled, but less metadata than WEB.
    # TVHTML5* returns throttled and possibly ciphered streams, but bypasses age-gate. atm, we don't decipher them.
    # TODO: unscramble TVHTML5* streams (especially &n= throttling)
    # pick the innertube client by (metaOnly, _agegate_bypass):
    client = {
        (False, False): { 'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30},
        (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
        (True, False):  { 'clientName': 'WEB', 'clientVersion':f'2.{today}.01.01' },
    }[(metaOnly, _agegate_bypass)]
    r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
        'videoId': video_id,
        'context': {
            'client': {
                'gl': 'US',
                'hl': 'en',
                **client,
            },
            'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
        },
        "racyCheckOk": True, # seems to do nothing, cargo-culted
        "contentCheckOk": True, # fix "This video may be inappropriate for some users."
    }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

    if not r or r.status_code == 429:
        return None, None, None, 'banned', 'possible IP ban'

    metadata = r.json()
    if "error" in metadata:
        return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
    playabilityStatus = metadata['playabilityStatus']['status']
    if playabilityStatus != "OK":
        playabilityReason = metadata['playabilityStatus'].get('reason',
            '//'.join(metadata['playabilityStatus'].get('messages',[])))
        player_error = f"{playabilityStatus}: {playabilityReason}"
        if (is_agegated(metadata)
            and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
            and not _agegate_bypass
        ):
            # retry once with the age-gate-bypassing TVHTML5 client:
            _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
            if error_embed == "player": # agegate bypass failed?
                return None, None, metadata, 'agegated', player_error
            elif not error_embed or error_embed in ('livestream','geolocked','scrambled'):
                metadata = metadata_embed
            else:
                return None, None, metadata, error_embed, errormsg_embed
        else:
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

    # livestreams have no adaptive/muxed formats:
    is_live = metadata['videoDetails'].get('isLive', False)

    if not 'formats' in metadata['streamingData'] and not is_live:
        return None, None, metadata, 'no-url', player_error

    formats = metadata['streamingData'].get('formats',[])
    adaptive = metadata['streamingData'].get('adaptiveFormats',[])
    stream_map = {
        'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
        'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
        'muxed': formats,
        'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
    }

    try:
        # best quality = greatest height:
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
    except:
        url = None
        is_geolocked = False

    is_drm = formats and 'signatureCipher' in formats[0]

    # nonfatal errors: the caller may still be able to do something useful
    # with the metadata / stream map.
    nonfatal = 'livestream' if is_live \
        else 'geolocked' if is_geolocked \
        else 'scrambled' if is_drm \
        else None

    return url, stream_map, metadata, nonfatal, None
310
def video_metadata(metadata):
    """Extract the fields we store from a raw player-API response.

    metadata: the player_response dict from get_video_info(); falsy -> {}.
    Returns a dict with title, author, channel_id, published (ISO-8601, UTC),
    views, length (seconds or None), aspect (w/h ratio), livestream,
    premiere, and shorts (True/False/None if undeterminable).
    """
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use a
    # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        # scheduledStartTime is a unix timestamp; format it in UTC, since we
        # append 'Z'. (previously used naive local time with a 'Z' suffix.)
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}) .get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later)
    except (KeyError, IndexError, TypeError, ZeroDivisionError):
        aspect_ratio = None

    # shorts are <= 60 seconds and vertical or square. if we were unable to
    # determine it, we set it to None.
    is_short = (
        None if length is None and aspect_ratio is None else
        True if ((length or 61) <= 60) and ((aspect_ratio or 2) <= 1) else
        False # length > 60 or aspect_ratio > 1
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
        'shorts': is_short,
    }
369
def store_video_metadata(video_id):
    """Fetch and store metadata for video_id, unless it is already known.
    Opens its own db connection (committed by the 'with' block on success)."""
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # metaOnly=True: we only need the metadata, not stream urls
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                # 'crawled' is set to the publish date, since we have no
                # better first-seen timestamp here:
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))
398
def fetch_video_flags(token, video_ids):
    """Split video_ids into the lists the user has pinned and hidden.

    token: the user's identifier in the flags table.
    Returns (pinned, hidden) — two lists of video ids.
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        cur = conn.cursor()
        # one placeholder per requested video id:
        placeholders = ",".join(["?"]*len(video_ids))
        cur.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(placeholders), (token,*video_ids))
        rows = cur.fetchall()

        pinned = [vid for vid, disp in rows if disp == 'pinned']
        hidden = [vid for vid, disp in rows if disp == 'hidden']

        return pinned, hidden
415
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    raises NoFallbackException (a 404) when no further matching route exists.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
446
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    message = f"{feed_id}:{timestamp}:{nonce}"
    digest = hmac.new(key.encode('ascii'), message.encode('ascii'), hashlib.sha1)
    return digest.hexdigest()
451
def websub_body_hmac(key, body):
    """sha1 hmac over the raw POST body (websub X-Hub-Signature check)."""
    digest = hmac.new(key.encode('ascii'), body, hashlib.sha1)
    return digest.hexdigest()
454
def flask_logger(msg, level="warning"):
    """Log msg through the current flask app's logger; silently a no-op
    outside a flask app context, so it is safe to call from plain scripts.

    level: case-insensitive level name; unknown names fall back to NOTSET.
    """
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    # was a bare 'except:', which also swallowed SystemExit/KeyboardInterrupt:
    except Exception:
        pass # not within a flask app context (e.g. utils.py)
469
def pp(*args):
    """debugging helper: pretty-print all arguments to stderr (utf-8 safe)."""
    import sys, codecs
    from pprint import pprint
    # wrap stderr's raw buffer so non-ascii output can't raise on exotic locales:
    writer = codecs.getwriter("utf-8")(sys.stderr.buffer)
    pprint(args, stream=writer)
Imprint / Impressum