import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Timer.setDaemon() is deprecated; set the attribute instead
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since requests are also made from outside the flask appcontext (e.g. from
# utils.py), the g access is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super().request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
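
# a minimal sketch (commented out, since this module has no Flask app object)
# of an error_handler that surfaces the collected request data; `app` is a
# hypothetical Flask instance in the frontend:
#
#   @app.errorhandler(Exception)
#   def dump_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"{url} ({params}): {response_text[:200]}")
#       return "internal server error", 500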

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
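
# hedged usage note: feed_type is the query parameter sent to the videos.xml
# endpoint; this app uses 'channel_id' and 'playlist_id' (see parse_xml below).
# the id below is a placeholder:
#   fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")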
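# for reference, a heavily abridged sketch of the Atom feed that parse_xml
# below consumes; element values are placeholders, and only the elements the
# parser actually reads are shown:
#
#   <feed xmlns="http://www.w3.org/2005/Atom"
#         xmlns:yt="http://www.youtube.com/xml/schemas/2015">
#     <title>Channel Name</title>
#     <author><name>Channel Name</name></author>
#     <yt:channelId>UCxxxx</yt:channelId>
#     <entry>
#       <yt:videoId>XxXxXxXxXxX</yt:videoId>
#       <yt:channelId>UCxxxx</yt:channelId>
#       <title>Video Title</title>
#       <published>2021-01-01T00:00:00+00:00</published>
#       <updated>2021-01-01T00:00:00+00:00</updated>
#       <author><name>Channel Name</name></author>
#     </entry>
#   </feed>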
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: an Element's truthiness reflects whether it has children, not
    # whether it was found; hence the explicit `is not None` checks (this is
    # also why a plain ternary on find() did not work here).
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId',ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId',ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not only fired for actual deletions,
            # but also when videos are unlisted and when livestreams end
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day
            # B; the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # g_v_i gives us the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo="", _embed=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        # XXX: this is not cached any more!
        # note: age-gated works as long as it's embeddable (HtVdAasjOgU ok, XgnwCQzjau8 bad, SkRSXFQerZs tvhtml5-only)
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    'clientName': 'WEB_EMBEDDED_PLAYER' if _embed else 'WEB',
                    'clientVersion': f'2.{today}.01.01',
                    #"clientName": "ANDROID",
                    #"clientVersion": "16.02",
                }
            },
            'playbackContext': {'contentPlaybackContext': {'signatureTimestamp': sts}}
        }, cookies=cookies)

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            #if playabilityStatus == "UNPLAYABLE": XXX: do we still need that?
            if (playabilityStatus == "LOGIN_REQUIRED"
                    and "confirm your age" in metadata['playabilityStatus'].get('reason','')
                    and sts != 0 # only need metadata when no sts (via pubsubhubbub)
                    and not _embed
            ):
                _, _, metadata_embed, error_embed, _ = get_video_info(video_id, sts, algo, _embed=True)
                if not error_embed:
                    metadata['streamingData'] = metadata_embed['streamingData']
                    metadata['playabilityStatus'] = metadata_embed['playabilityStatus']
                else:
                    try:
                        r2 = requests.get("https://www.youtube.com/get_video_info", dict(
                            video_id=video_id,
                            html5="1",
                            c="ANDROID", # XXX: randomly 404's as well
                            cver="16.02",
                            el="embedded",
                            eurl=f"https://youtube.googleapis.com/v/{video_id}",
                        ))
                        metadata_android = json.loads(parse_qs(r2.text).get('player_response',['{}'])[0])
                        # has 'playabilityStatus', 'streamingData', 'videoDetails', but no 'microformat' key
                        metadata['streamingData'] = metadata_android['streamingData']
                        metadata['playabilityStatus'] = metadata_android['playabilityStatus']
                    except Exception:
                        return None, None, metadata, 'agegated', player_error
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive, 'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url'] \
            if not is_live else None

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else None

        return url, stream_map, metadata, nonfatal, None
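
# hedged usage sketch; the video id is the embeddable age-gate example from
# the note above, and sts/algo (from the scraped player JS) can stay at their
# defaults for videos with unscrambled signatures:
#   url, stream_map, meta, error, errdetail = get_video_info("HtVdAasjOgU")
#   if error not in (None, 'livestream', 'geolocked'):
#       ...  # fatal: 'banned', 'malformed', 'player', 'agegated' or 'no-url'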

def unscramble(cipher, algo):
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it.
    length = int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']) or None

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 FROM videos WHERE id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token,*video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

        return pinned, hidden

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
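
# hedged usage sketch with hypothetical view functions: when two modules
# register the same rule, the first one can hand the request to the next:
#   @frontend.route('/watch')
#   def watch():
#       if not can_serve(request.args):  # hypothetical predicate
#           return fallback_route()
#       ...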

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw (bytes) request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
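
# hedged sketch of verifying an incoming websub POST with the helper above;
# per the pubsubhubbub spec, the signature header carries a 'sha1=' prefix,
# and unverifiable content should be acknowledged but ignored:
#   theirs = request.headers.get('X-Hub-Signature', '').replace('sha1=', '', 1)
#   ours = websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(ours, theirs):
#       return '', 200  # ack receipt, but discard the unauthenticated payload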

def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except Exception:
        pass # not within flask appcontext

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))