import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this module is also used outside the flask appcontext, the access to g
# is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

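# Example (sketch): an error handler that surfaces the requests recorded
# above; registering it (and the status code / log message) is left to the
# frontend and is an assumption here, not part of this module.
def _example_log_api_requests(e):
    from flask import g, current_app
    for url, params, response_text in g.get('api_requests', []):
        current_app.logger.error("upstream request: %s %r -> %.200s", url, params, response_text)
    return "Internal Server Error", 500
# in the frontend, e.g.: app.register_error_handler(500, _example_log_api_requests)
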
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist.
    # Note: a plain ternary on the Element doesn't work, because an Element
    # without children is falsy; we must compare against None explicitly.
    channel_id = feed.find('yt:channelId',ns).text \
        if feed.find('yt:channelId',ns) is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_id = feed.find('yt:playlistId',ns).text \
        if feed.find('yt:playlistId',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

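# Example (sketch): fetching and parsing a feed in one go. The feed_type values
# mirror the query parameters of youtube.com/feeds/videos.xml ('channel_id',
# 'playlist_id' or 'user'); the channel id below is a placeholder.
def _example_fetch_and_parse():
    xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
    if xmldata is None:
        return None # fetch failed (non-2xx response)
    title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
    return videos
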
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            if from_webhook: current_app.logger.warning(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # get_video_info gives us the date the video was published to viewers,
            # so we prefer that. But since it only returns the date without time,
            # we still use the xmlfeed's date if both fall on the same day.
            published = dateutil.parser.parse(video['published'])
            length = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if from_webhook: current_app.logger.warning(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed, and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, published, crawled)
                VALUES (?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

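# Example (sketch): consuming get_video_info's 5-tuple; the video id is the
# test id mentioned at unscramble() below.
def _example_get_video_info():
    url, stream_map, metadata, error, errdetail = get_video_info('UxxajLWwzqY')
    if error in ('malformed', 'player', 'livestream', 'exhausted'):
        return None # nothing directly playable; errdetail may say why
    return url # error is None or 'geolocked' here; url is the best muxed stream
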
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if not m: continue # skip tokens that aren't r/s/w operations
        op, ix = m.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

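# Worked example (sketch): with algo "r s2 w3", the signature "ABCDEF" is
# reversed ("FEDCBA"), sliced from index 2 ("DCBA"), and then has positions
# 0 and 3 swapped ("ACBD"). The cipher dict mimics the parse_qs() output that
# get_video_info() passes in; the url and s values are placeholders.
def _example_unscramble():
    cipher = {'url': ['https://example.invalid/videoplayback?a=b'],
              's': ['ABCDEF'], 'sp': ['sig']}
    return unscramble(cipher, "r s2 w3")
    # -> 'https://example.invalid/videoplayback?a=b&sig=ACBD'
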
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta1['lengthSeconds']),
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

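# Example (sketch): a minimal demo app with two views on the same rule; all
# names here are hypothetical. When the first view cannot serve the request,
# it delegates to the next route registered for the same rule:
def _example_fallback_app():
    from flask import Flask
    app = Flask(__name__)

    @app.route('/watch')
    def watch_primary():
        # pretend this backend is unavailable and fall through:
        return fallback_route()

    @app.route('/watch')
    def watch_fallback():
        return "served by the fallback view"

    return app
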
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw (bytes) request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

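# Example (sketch): verifying an incoming websub notification against its
# X-Hub-Signature header ("sha1=<hexdigest>"); how the secret is stored and
# looked up is an assumption of the caller.
def _example_verify_websub_body(request, secret):
    expected = 'sha1=' + websub_body_hmac(secret, request.get_data())
    return hmac.compare_digest(request.headers.get('X-Hub-Signature', ''), expected)
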
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))