# app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf:  # todo: full config check
    raise Exception("Configuration file not found or empty")

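# For reference, a minimal config.ini sketch; only cf['global']['database']
# is read in this module, so everything else (including the database path
# shown here) is an assumption about the wider app:
#
#   [global]
#   database = /var/lib/yt/yt.db
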
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey-patch the requests Session to store each
# requests request (url, params and response text) in the flask request's g
# object. A flask error_handler can then include that data in the error log.
# Since this module is also used outside a flask app context, the access to
# g is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError:
            pass  # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

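# A sketch of such an error handler (hypothetical; the real one would live in
# the frontend app, and `app` stands in for that flask instance):
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {body[:200]}")
#       return "internal error", 500
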
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: an Element's truthiness is False when it has no children, so we
    # must compare find() results against None; a bare `if feed.find(...)` or
    # a ternary on it silently takes the wrong branch.
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None:
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist:
    channel_id_el = feed.find('yt:channelId', ns)
    channel_id = channel_id_el.text if channel_id_el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_id_el = feed.find('yt:playlistId', ns)
    playlist_id = playlist_id_el.text if playlist_id_el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

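# Typical usage, tying the two functions together ('UC…' stands for a real
# channel id; 'playlist_id' and, presumably, 'user' also work as feed_type):
#
#   xmldata = fetch_xml('channel_id', 'UC…')
#   if xmldata:
#       title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
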
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to
            # an older video). get_video_info gives us the date the video was
            # published to viewers, so we prefer that. But since it only
            # returns the date without the time, we still use the xmlfeed's
            # timestamp when both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2:  # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed, and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else:  # otherwise, it's just an update to an older video:
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook:  # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

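# For reference, a minimal table layout consistent with the statements above;
# this is a sketch with assumed column types/constraints, not the repo's
# actual schema:
#
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#       length INTEGER, livestream INTEGER, published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
#   CREATE TABLE playlists (id TEXT PRIMARY KEY, name TEXT, author TEXT);
#   CREATE TABLE playlist_videos (video_id TEXT, playlist_id TEXT,
#       UNIQUE(video_id, playlist_id));  -- UNIQUE assumed from INSERT OR IGNORE
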
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, banned, exhausted
    """
    player_error = None  # for 'exhausted'
    for el in ['embedded', 'detailpage']:  # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params:  # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue  # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue  # no urls

        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:  # both 'el' values failed; report the last player error:
        return None, None, metadata, 'exhausted', player_error

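# Typical call pattern (using the test video id mentioned at unscramble()
# below; sts/algo come from the player JS when signatures are scrambled):
#
#   url, stream_map, meta, error, errdetail = get_video_info('UxxajLWwzqY')
#   if error:
#       ...  # 'banned', 'malformed', 'player', 'livestream', 'exhausted'
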
def unscramble(cipher, algo):  # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        match = re.match(r"([rsw])(\d+)?", c)
        if not match: continue  # skip unknown operations
        op, ix = match.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

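# A worked example with a made-up cipher dict (not real YouTube data):
# "r" reverses, "w2" swaps positions 0 and 2, "s1" drops the first character,
# so "abcdef" -> "fedcba" -> "defcba" -> "efcba":
#
#   unscramble({'url': ['https://host/v'], 's': ['abcdef']}, "r w2 s1")
#   # -> 'https://host/v&signature=efcba'
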
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False, and 'published' will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']) or int(meta2['lengthSeconds']),
        'length': int(meta1['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 FROM videos WHERE id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs):  # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we have fallen through already.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

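# A hypothetical usage sketch: two views on the same rule, where the first
# defers to the second when it can't serve the request:
#
#   @app.route('/channel/<cid>', endpoint='channel_local')
#   def channel_local(cid):
#       if not have_local_data(cid):  # hypothetical predicate
#           return fallback_route(cid)
#       ...
#
#   @app.route('/channel/<cid>', endpoint='channel_remote')
#   def channel_remote(cid):
#       ...
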
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

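# A minimal verification sketch for an incoming websub POST; per the
# pubsubhubbub 0.4 spec, the hub signs the raw body and sends the digest as
# 'X-Hub-Signature: sha1=<hexdigest>':
#
#   expected = 'sha1=' + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#       return '', 403  # reject forged notifications
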
def flask_logger(msg, level="warning"):
    # Note: logging.Logger.log() wants a numeric level, so we look up the
    # level-named method (warning/error/...) instead of passing the string.
    try:
        from flask import current_app
        getattr(current_app.logger, level)(msg)
    except RuntimeError:
        pass  # no flask app context (e.g. when called from utils.py)

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))