app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # Timer.setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response
# body). A flask error_handler can then include the request data in the
# error log. Since requests are also issued from outside the flask app
# context (e.g. from utils.py), the access to g is wrapped in a try/except
# block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

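# Illustrative sketch (not part of this module): how a frontend's flask
# error_handler might consume g.api_requests as collected by _NSASession
# above. The handler and 'app' are assumptions; only g.api_requests and its
# (url, params, response_text) tuples come from this file.
#
# @app.errorhandler(Exception)
# def internal_server_error(e):
#     for url, params, response_text in g.get('api_requests', []):
#         app.logger.error(f"request: {url} params={params}\n{response_text[:500]}")
#     return "internal server error", 500
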
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy, so find() results must be
    # compared against None explicitly (a plain truthiness test, or a ternary
    # built on one, silently fails):
    if feed.find('at:deleted-entry', ns) is not None:
        (_, _, vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId', ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId', ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

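# Illustrative usage (a sketch, not in the original file; the channel id is a
# placeholder): fetch a channel's feed and parse it.
#
# xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
# if xmldata:
#     title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
#     for v in videos:
#         print(v['video_id'], v['published'], v['title'])
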
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: deletion events are not only fired for actual deletions,
            # but also for unlisted videos and for livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day
            # B; the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). g_v_i
            # gives us the date the video was published to viewers, so we prefer
            # that. But since g_v_i only returns the date without a time, we
            # still use the xmlfeed's timestamp if both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                published,
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

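# Illustrative usage (a sketch; the feed id is a placeholder): a pull-style
# refresh that feeds fetch_xml's result into update_channel. The database
# path comes from the config parsed at module load.
#
# with sqlite3.connect(cf['global']['database']) as db:
#     if not update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')):
#         flask_logger("failed to update channel", "error")
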
def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type, error-message
    error types: player, malformed, agegated, no-url, banned, exhausted;
    nonfatal: livestream, geolocked, scrambled
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
        # ANDROID returns streams that are neither throttled nor cipher-scrambled, but less metadata than WEB.
        # TVHTML5* returns throttled and possibly ciphered streams, but bypasses the age-gate. atm, we don't decipher them.
        # TODO: unscramble TVHTML5* streams (especially &n= throttling)
        client = {
            (False, False): { 'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30 },
            (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
            (True, False):  { 'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01' },
        }[(metaOnly, _agegate_bypass)]
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    **client,
                },
                'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
            },
            "racyCheckOk": True, # seems to do nothing, cargo-culted
            "contentCheckOk": True, # fixes "This video may be inappropriate for some users."
        }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if (is_agegated(metadata)
                and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
                and not _agegate_bypass
            ):
                _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
                if error_embed == "player": # agegate bypass failed?
                    return None, None, metadata, 'agegated', player_error
                elif not error_embed or error_embed in ('livestream','geolocked'):
                    metadata = metadata_embed
                else:
                    return None, None, metadata, error_embed, errormsg_embed
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        stream_map = {
            'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
            'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        try:
            url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

            # ip-locked videos can be recovered if the proxy module is loaded:
            is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
        except (IndexError, KeyError): # no muxed formats, or scrambled ones without 'url'
            url = None
            is_geolocked = False

        is_drm = formats and 'signatureCipher' in formats[0]

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else 'scrambled' if is_drm \
            else None

        return url, stream_map, metadata, nonfatal, None

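# Illustrative usage (a sketch; the video id is a placeholder): the fourth
# return value distinguishes nonfatal conditions, which still come with
# usable metadata, from fatal errors, which may not.
#
# url, stream_map, metadata, error, errdetails = get_video_info('aqz-KE-bpKQ')
# if error in (None, 'livestream', 'geolocked', 'scrambled'):
#     meta = video_metadata(metadata) # playable, or at least describable
# else:
#     flask_logger(f"unplayable: {error} ({errdetails})", "error")
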
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # with the ANDROID player API, we don't get the microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it.
    length = int(meta2.get('lengthSeconds',0)) or int(meta1.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time)) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # Note: 'premiere' videos have livestream=False, and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
    }

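# For reference (a sketch; all values made up), the returned dict is shaped
# like this:
#
# {'title': 'Big Buck Bunny', 'author': 'Blender',
#  'channel_id': 'UCxxxxxxxxxxxxxxxxxxxxxx',
#  'published': '2014-11-10T00:00:00Z', 'views': 12345678,
#  'length': 635, 'livestream': False, 'premiere': False}
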
def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
            AND display IS NOT NULL
            AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

    return pinned, hidden

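# Illustrative usage (a sketch; 'token' identifies the user, 'videos' is a
# parsed feed as produced by parse_xml):
#
# pinned, hidden = fetch_video_flags(token, [v['video_id'] for v in videos])
# visible = [v for v in videos if v['video_id'] not in hidden]
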
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

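# Illustrative usage (a sketch; the routes, names and condition are made up,
# and 'app' and flask's redirect are assumed to be in scope): two view
# functions registered on the same rule, where the first defers to the one
# registered after it.
#
# @app.route('/watch')
# def watch_local():
#     if not can_play_locally(): # hypothetical condition
#         return fallback_route()
#     return render_player()
#
# @app.route('/watch')
# def watch_redirect():
#     return redirect("https://www.youtube.com/watch", code=307)
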
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

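# Illustrative usage (a sketch; 'secret' and flask's request/abort are assumed
# to be in scope): verifying a websub notification. The hub signs the raw POST
# body with the shared secret and sends 'sha1=<hexdigest>' in the
# X-Hub-Signature header.
#
# expected = "sha1=" + websub_body_hmac(secret, request.get_data())
# if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#     abort(403)
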
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError:
        pass # not within flask app context

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))