# app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Note: setDaemon() is deprecated since python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey-patch the requests Session to store each
# outgoing request (url, params and response text) in the flask request's g
# object. A flask error_handler can then include the request data in the
# error log. Since this code also runs outside the flask app context (e.g.
# from utils.py), the access to g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
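
# A sketch (hypothetical; any real handler lives in the frontend app, not
# here) of the error_handler mentioned above, showing how the tuples
# collected by _NSASession could end up in the error log:
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error(f"api request: {url} {params} -> {text[:200]}")
#       raise e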

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
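
# Usage (feed ids elided; these are the feed_type values the youtube feed
# endpoint accepts, as used by the callers of this function):
#   fetch_xml("channel_id", "UC...")   # channel uploads feed
#   fetch_xml("playlist_id", "PL...")  # playlist feed
#   fetch_xml("user", "someusername")  # legacy /user/<> urls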

def parse_xml(xmldata):
    """
    parses a youtube videos.xml feed and returns a
    (title, author, videos, channel_id, playlist_id) tuple.
    """
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: ElementTree elements are falsy when they have no children, even
    # if they exist; hence all find() results are compared against None.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    # (a plain `elem.text if elem else None` ternary does not work here: a
    # childless element is falsy even when it exists and carries text.)
    elem = feed.find('yt:channelId',ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId',ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to
            # an older video). get_video_info gives us the date the video was
            # published to viewers, so we prefer that. But since it only
            # returns the date without a time, we still use the xmlfeed's
            # timestamp when both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed, and
            # they don't get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published
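            # Worked example (hypothetical dates): published Mar 1, webhook
            # arrives Mar 3 -> crawled=now, the video surfaces at the top of
            # the feed; published Jan 1, made public Mar 3 -> crawled=Jan 1,
            # the video sorts into the feed's history instead.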

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
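
# Usage sketch (hypothetical; the webhook and pull_subs callers do something
# along these lines):
#   with sqlite3.connect(cf['global']['database']) as db:
#       update_channel(db, fetch_xml("channel_id", "UC..."))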

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error type, error message
    error types: banned, malformed, player, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message:
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData'].get('formats',[])
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive,
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
            'dashManifestUrl': metadata['streamingData'].get('dashManifestUrl'),
        }

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error
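
# Illustrative call (video id is just a well-known example):
#   url, stream_map, meta, error, errdetails = get_video_info("dQw4w9WgXcQ")
# On success, error is None (or 'geolocked' for ip-locked videos, whose url
# may still work through the proxy module); on failure, url is None and
# error/errdetails describe what went wrong.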

def unscramble(cipher, algo):
    # applies the descrambling operations in `algo` to the scrambled
    # signature: r=reverse, s<n>=slice off the first <n> chars,
    # w<n>=swap positions 0 and <n>.
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
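
# Worked example (made-up inputs): with cipher parsed from
# "s=abcdef&url=https%3A%2F%2Fexample.com%2Fv" and algo "r s2 w2":
#   r  -> fedcba   (reverse)
#   s2 -> dcba     (slice off the first two chars)
#   w2 -> bcda     (swap positions 0 and 2)
# giving "https://example.com/v&signature=bcda".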

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False, and 'published' will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }
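
# e.g. video_metadata(meta) for a regular upload returns something like
# (values invented): {'title': '...', 'author': '...', 'channel_id': 'UC...',
# 'published': '2021-03-01T00:00:00Z', 'views': 1234, 'length': 300,
# 'livestream': False}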

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # Note: sqlite can't bind a list, so we generate one '?' placeholder
        # per video_id for the IN clause (e.g. "?,?,?" for three ids):
        c.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
            AND display IS NOT NULL
            AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token,*video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

        return pinned, hidden

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
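
# Usage sketch (hypothetical routes and helper): register two views on the
# same rule and let the first delegate to the second when it can't answer:
#
#   @app.route('/watch')
#   def local_watch():
#       if not can_play_locally():  # hypothetical helper
#           return fallback_route()
#       ...
#
#   @app.route('/watch')
#   def redirect_watch():
#       return redirect(f"https://www.youtube.com/watch?v={request.args['v']}")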

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate the sha1 hmac over the raw request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
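
# Verification sketch (assumed subscriber behavior, per the websub spec): the
# url hmac is embedded in the callback url at subscribe time and re-checked
# on every delivery; the body hmac is compared against the hub's
# "X-Hub-Signature: sha1=<hexdigest>" header, e.g.:
#   expected = websub_body_hmac(key, request.data)
#   ok = hmac.compare_digest(f"sha1={expected}", request.headers['X-Hub-Signature'])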

def flask_logger(msg, level="warning"):
    try:
        from flask import current_app
        # Note: logger.log() expects a numeric level, so with a level *name*
        # we call the corresponding method (logger.warning(msg), etc.):
        getattr(current_app.logger, level)(msg)
    except Exception:
        pass

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))