import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). We can
# then use a flask error_handler to include the request data in the error log.
# Since this is also called from outside the flask appcontext, the append is
# wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
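
# A minimal sketch (not part of the original module) of the error_handler idea
# described above; 'app' and the 500 status code are assumptions for illustration:
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"API request: {url} {params} -> {response_text[:200]}")
#       return "internal server error", 500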

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: find() results must be compared against None; an Element without
    # children is falsy, which is also why a plain ternary on find() fails.
    if feed.find('at:deleted-entry', ns) is not None:
        (_, _, vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId', ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId', ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

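# Return-shape sketch: parse_xml yields (title, author, videos, channel_id,
# playlist_id). For a websub tombstone, only 'videos' carries data, e.g.:
#
#   title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
#   if videos and videos[0].get('deleted'):
#       ...  # the video was removed upstream; the other fields are None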
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, so we take it from the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # If the updated and published times are near-identical, we assume the
        # video is new. Checking that it was posted this week is necessary
        # during xmlfeed pulling. Note: total_seconds(), not .seconds, which
        # only holds the sub-day remainder and would misclassify older videos.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # it might just be an update to an older video, or a previously unlisted one.
            # First, assume it's an older video (correct when pulling xmlfeeds).
            timestamp = published
            # Then, check if we don't know about it yet, and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = video_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

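# Usage sketch: pulling a feed and upserting it, assuming 'db' is an sqlite3
# connection with the videos/channels/playlists schema used above:
#
#   xmldata = fetch_xml("channel_id", channel_id)  # or ("playlist_id", ...)
#   if xmldata:
#       update_channel(db, xmldata)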
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params['player_response'][0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

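# Caller sketch for the 5-tuple; note that 'geolocked' still comes with a
# usable url (recoverable through the proxy module), unlike the other types:
#
#   url, stream_map, metadata, error, detail = get_video_info(video_id)
#   if error == 'geolocked' and proxy_enabled:  # 'proxy_enabled' is hypothetical
#       error = None
#   if error:
#       ...  # 'malformed'/'player'/'exhausted': detail holds the reason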
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        match = re.match(r"([rsw])(\d+)?", c)
        if not match: continue # skip unknown operations instead of crashing
        op, ix = match.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

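# Worked example (made-up values): with algo "r w2 s1", the signature "abcdef"
# is reversed to "fedcba", chars 0 and 2 are swapped ("defcba"), and the first
# char is sliced off ("efcba"):
#
#   unscramble({'s': ['abcdef'], 'url': ['https://host/videoplayback?id=x']},
#              "r w2 s1")
#   # -> 'https://host/videoplayback?id=x&signature=efcba'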
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta1['lengthSeconds']),
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # track how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

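# Usage sketch (hypothetical view functions): two handlers registered on the
# same rule; the first defers to the second when it can't serve the request.
#
#   @bp_a.route('/channel/<cid>')
#   def channel_a(cid):
#       if not have_data(cid):  # 'have_data' is a hypothetical check
#           return fallback_route(cid)
#       ...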
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw request body (bytes) """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

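# Verification sketch for an incoming websub POST, assuming the subscribe step
# embedded timestamp/nonce in the callback url; compare digests in constant
# time ('sig_from_header' would come from X-Hub-Signature, sans 'sha1='):
#
#   expected = websub_body_hmac(key, request.get_data())
#   if not hmac.compare_digest(expected, sig_from_header):
#       return "", 403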
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))