subscriptionfeed.git / app / common / common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a
# livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Thread.setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response).
# We can then use a flask error_handler to include the request data in the
# error log. Since this module is also used from outside the flask
# appcontext, the append is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
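
# A minimal sketch of the error_handler mentioned above (hypothetical, not
# part of this module; assumes a flask `app` object wherever it is
# registered):
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {body[:200]}")
#       return "internal server error", 500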

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
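
# Usage sketch: the feed endpoint takes e.g. 'channel_id' or 'playlist_id'
# as the feed_type (id values below are illustrative):
#   fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxxxx")   # channel feed
#   fetch_xml("playlist_id", "PLxxxxxxxxxxxxxxxxxxxxxxxx")  # playlist feed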

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry', ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # Note: a bare truthiness check on an Element is unreliable (an element
    # without children evaluates as falsy), so we compare against None.
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId', ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId', ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id
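
# The returned 5-tuple looks like (illustrative values):
#   ("Title", "Author", [<video dicts>], "UC…", None)               # channel feed
#   (None, None, [{'deleted': True, 'video_id': "…"}], None, None)  # tombstone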

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: deletion events are not just fired for actual deletions,
            # but also for unlisted videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to
            # an older video). get_video_info gives us the date the video
            # was published to viewers, so we prefer that. But since it only
            # returns the date without a time, we still use the xmlfeed's
            # timestamp if both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # We pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed, and
            # they don't get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the
        # user why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
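
# Usage sketch (assumes the configured sqlite database; feed id illustrative):
#   with sqlite3.connect(cf['global']['database']) as db:
#       update_channel(db, fetch_xml("channel_id", "UC…"))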

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: banned, malformed, player, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive,
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
            'dashManifestUrl': metadata['streamingData'].get('dashManifestUrl'),
        }

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else: # all 'el' values were tried without success:
        return None, None, metadata, 'exhausted', player_error
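
# Usage sketch (UxxajLWwzqY is the test video id noted below):
#   url, stream_map, meta, error, errmsg = get_video_info("UxxajLWwzqY")
#   if error: ... # one of the error types listed in the docstring
#   else: ...     # url is the best-quality muxed stream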

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
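
# Worked example: with cipher['s'] == ['abcdef'] and algo == "r s2 w1",
# the signature is transformed step by step:
#   r  -> reverse:              "fedcba"
#   s2 -> slice off first two:  "dcba"
#   w1 -> swap chars 0 and 1:   "cdba"
# (the algo string itself must be extracted from the player javascript
# elsewhere; it is passed in by the caller)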

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }
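
# Returns e.g. (illustrative values):
#   {'title': 'a video', 'author': 'a channel', 'channel_id': 'UC…',
#    'published': '2021-01-01T00:00:00Z', 'views': 1234, 'length': 300,
#    'livestream': False}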

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

        return pinned, hidden
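
# Usage sketch (this is the dedup'd flag retrieval shared by the feed and
# the /c/* pages; `user_token` and `videos` are illustrative names):
#   pinned, hidden = fetch_video_flags(user_token, [v['video_id'] for v in videos])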

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    Finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # Since we can't change request.endpoint, we always get the original
    # endpoint back. So for repeated fall-throughs, we use the g object to
    # track how many times we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
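
# Usage sketch (hypothetical routes): register two views on the same rule;
# the first calls fallback_route() to defer to the next matching endpoint:
#
#   @app.route('/watch')
#   def watch():
#       if not can_handle(request.args): # hypothetical check
#           return fallback_route()
#       ...
#
#   @app.route('/watch', endpoint='watch_fallback')
#   def watch_fallback():
#       ...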

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw request body (bytes) """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
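
# Verification sketch (assumes a `secret` from the config and a flask
# request object): websub hubs send 'sha1=<hexdigest>' in X-Hub-Signature:
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   valid = hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', ''))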

def flask_logger(msg, level="warning"):
    try:
        from flask import current_app
        # Note: Logger.log() expects a numeric level, so passing the level
        # name as a string would raise; pick the warning()/error()/...
        # method by name instead.
        getattr(current_app.logger, level)(msg)
    except Exception:
        pass # not within flask (e.g. utils.py)

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))