import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

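# A minimal config sketch (assumption: 'database' is the only key this module
# itself reads; the frontend may require more):
#
#   [global]
#   database = /var/lib/yt/database.sqlite
#
# The path can be overridden with the YT_CONFIG environment variable.
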
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests Session to store each
# outgoing request (url, params and response text) in the flask request's g
# object. a flask error_handler can then include the request data in the
# error log. since this module is also used outside a flask appcontext, the
# access to g is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
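
# A hedged sketch of the error handler mentioned above (hypothetical; a real
# handler would be registered on the flask app in the frontend):
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {body[:200]}")
#       raise e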

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

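# Usage sketch: the feed endpoint accepts a channel_id, playlist_id or user
# parameter (placeholder ids below):
#
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   xmldata = fetch_xml("playlist_id", "PLxxxxxxxxxxxxxxxxxxxxxx")
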
def parse_xml(xmldata):
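    """
    parse a youtube atom feed (or websub push) into a
    (title, author, videos, channel_id, playlist_id) tuple.
    tombstone (deletion) pushes only populate the videos list.
    """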
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Element truthiness tests for child elements, so we must compare
    # against None explicitly:
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None:
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId', ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId', ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, so we take it from the
    # first video's entry instead.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not only fired for actual deletions,
            # but also for unlisted videos and for livestreams that just
            # ended (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g. a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A, so the video looks like just an update to an older
            # one). get_video_info gives us the date the video was published
            # to viewers, so we prefer that. but since it only returns the
            # date without a time, we still use the xmlfeed's timestamp when
            # both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed instead
            # of having them inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the
        # user why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

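# Usage sketch (hypothetical db handle; mirrors the pull-subscriptions and
# websub callers):
#
#   db = sqlite3.connect(cf['global']['database'])
#   update_channel(db, fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx"))
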
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        # XXX: this is not cached any more!
        # XXX: age-gated now broken: HtVdAasjOgU (embed ok), XgnwCQzjau8 (no embed)
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    'clientName': 'WEB',
                    'clientVersion': f'2.{today}.01.01',
                }
            },
            'playbackContext': {'contentPlaybackContext': {'signatureTimestamp': sts}}
        }, cookies=cookies)

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            #if playabilityStatus == "UNPLAYABLE": XXX: do we need that still?
            if (playabilityStatus == "LOGIN_REQUIRED"
                    and metadata['playabilityStatus'].get('reason') == "Sign in to confirm your age"
                    and sts != 0):
                # age-gate bypass: retry via the embedded player endpoint:
                r = requests.get("https://www.youtube.com/get_video_info", {
                    "html5": "1",
                    "video_id": video_id,
                    "eurl": f"https://youtube.googleapis.com/v/{video_id}",
                    "el": "embedded",
                    "sts": sts,
                    "hl": "en_US",
                })
                params = parse_qs(r.text)
                if 'errorcode' in params: # status=fail
                    return None, None, None, 'malformed', params['reason'][0]
                from flask import current_app
                current_app.logger.error(r.text)
                metadata = json.loads(params.get('player_response')[0])

            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive, 'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url'] \
            if not is_live else None

        # ip-locked videos can be recovered if the proxy module is loaded;
        # note: url is None for livestreams:
        is_geolocked = url is not None and 'gcr' in parse_qs(urlparse(url).query)

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else None

        return url, stream_map, metadata, nonfatal, None

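# Usage sketch: with the default sts=0/algo="", ciphered stream urls are not
# descrambled, but the metadata is still usable:
#
#   url, stream_map, meta, error, errdetail = get_video_info("dQw4w9WgXcQ")
#   if error in (None, 'livestream', 'geolocked'):
#       pass # playable (livestreams via stream_map['hlsManifestUrl'])
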
def unscramble(cipher, algo):
    # apply the descrambling algorithm (whitespace-separated 'r'everse,
    # 's'plice and s'w'ap operations, e.g. "r s2 w3") to the signature:
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

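# Worked example (hypothetical algo string; the real one is extracted from
# the player JS): with s = "abcdef" and algo = "r w2 s1",
#   r  -> "fedcba"   (reverse)
#   w2 -> "defcba"   (swap positions 0 and 2)
#   s1 -> "efcba"    (splice off the first element)
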
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    # sometimes we receive the notification so early that the length is not
    # yet populated. nothing we can do about that.
    length = int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']) or None

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we already know the video; if not, fetch and store its metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
                AND display IS NOT NULL
                AND video_id IN ({})
                -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

        return pinned, hidden

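# Usage sketch (token identifies the user; the ids are examples):
#
#   pinned, hidden = fetch_video_flags(token, ["dQw4w9WgXcQ", "HtVdAasjOgU"])
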
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

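# Usage sketch (hypothetical views; flask dispatches to the first endpoint
# registered on a rule, and fallback_route() advances to the next one):
#
#   @app.route('/channel/<cid>')
#   def channel_primary(cid):
#       if backend_unavailable: return fallback_route(cid)
#       ...
#
#   @app.route('/channel/<cid>')
#   def channel_fallback(cid):
#       ...
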
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

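# Verification sketch (assumption: the hub sends "X-Hub-Signature: sha1=<hex>"
# as per the pubsubhubbub spec):
#
#   given = request.headers.get('X-Hub-Signature', '').partition('=')[2]
#   expected = websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, given):
#       abort(403)
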
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError: pass # not within flask appcontext (e.g. utils.py)

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))