import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session so that each
# requests-request (url, params, response body) is stored in the flask
# request's g object. A flask error_handler can then include the request data
# in the error log (a sketch of such a handler follows below the class).
# Since this module is also used outside the flask appcontext, the access to
# g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

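# A minimal sketch of the error handler mentioned above (illustrative only;
# nothing registers it automatically, and the function/handler names are not
# part of this module's interface). The frontend would attach it to its app.
def _register_api_request_logging(app):
    @app.errorhandler(Exception)
    def log_api_requests(exc):
        # g.get() returns [] if no upstream request was recorded for this request
        for url, params, body in g.get('api_requests', []):
            app.logger.error(f"upstream request: {url} {params}\n{body[:1000]}")
        return "Internal Server Error", 500
    return log_api_requests
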
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy even when they were found, so
    # compare find() results against None instead of relying on truthiness.
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None:
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id.
    # for playlists: this is who created the playlist.
    channel_id_el = feed.find('yt:channelId', ns)
    channel_id = channel_id_el.text if channel_id_el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_id_el = feed.find('yt:playlistId', ns)
    playlist_id = playlist_id_el.text if playlist_id_el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

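# Usage sketch (illustrative only; never called from this module): fetching
# and parsing a channel feed. The placeholder stands in for a real UC channel id.
def _example_list_channel_videos(channel_id="UCxxxxxxxxxxxxxxxxxxxxxx"):
    xmldata = fetch_xml("channel_id", channel_id)
    if xmldata is None:
        return []
    title, author, videos, channel, playlist = parse_xml(xmldata)
    return [v['video_id'] for v in videos if not v.get('deleted')]
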
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return a global author, hence it is taken from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        c.execute("SELECT 1 FROM videos WHERE id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            if from_webhook: current_app.logger.warning(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # get_video_info() gives us the date the video was published to viewers,
            # so we prefer that. But since it only returns the date without a time,
            # we still use the xmlfeed's timestamp when both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if from_webhook: current_app.logger.warning(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # We pretend that all videos uploaded within the last week were
            # uploaded just now, so the user sees them at the top of the feed
            # instead of having them inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next 'el' value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for i, v in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for i, v in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

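# Usage sketch (illustrative only; never called from this module): unpacking
# the 5-tuple returned by get_video_info(). Error handling is simplified.
def _example_best_muxed_url(video_id):
    url, stream_map, metadata, error, errdetail = get_video_info(video_id)
    if error in ('malformed', 'player', 'exhausted'):
        return None # nothing playable; errdetail holds the reason, if any
    if error == 'livestream':
        return None # livestreams return metadata, but no direct muxed url
    return url # best-quality muxed stream ('geolocked' may still need the proxy)
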
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if not m: continue # skip tokens that aren't valid operations
        op, ix = m.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

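# Worked example (values are made up for illustration; real ciphers come from
# the player response and real algos from the scraped player code): the algo
# string is a space separated list of operations -- r = reverse, sN = drop the
# first N characters, wN = swap positions 0 and N.
def _example_unscramble():
    cipher = { # parse_qs-style dict: every value is a list
        'url': ["https://example.invalid/videoplayback?id=x"],
        'sp': ["sig"],
        's': ["abcdef"],
    }
    # "abcdef" -> reverse -> "fedcba" -> swap 0,2 -> "defcba" -> drop 1 -> "efcba"
    return unscramble(cipher, "r w2 s1") # ...&sig=efcba
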
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and 'published' will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        # sometimes videoDetails.lengthSeconds == "0"; fall back to the microformat value:
        'length': int(meta1['lengthSeconds']) or int(meta2['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we already know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 FROM videos WHERE id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    Finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass along all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

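# Usage sketch (illustrative only; route and helper names are made up): two
# view functions registered for the same rule. When the first cannot handle
# the request, it calls fallback_route(), which dispatches to the next
# endpoint registered for that rule (or raises NoFallbackException -> 404).
def _example_register_fallback_routes(app):
    @app.route('/watch', endpoint='watch_primary')
    def watch_primary():
        want_fallback = True # stand-in for a real condition
        if want_fallback:
            return fallback_route()
        return "handled by primary"

    @app.route('/watch', endpoint='watch_secondary')
    def watch_secondary():
        return "handled by fallback"
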
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

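# Verification sketch (illustrative only; the parameter handling is an
# assumption, not this project's websub handler): a callback endpoint would
# recompute both hmacs and compare them against the signature embedded in the
# callback url and the hub's "X-Hub-Signature: sha1=<hexdigest>" header.
def _example_verify_websub(key, feed_id, timestamp, nonce, url_sig, body, header_sig):
    ok_url = hmac.compare_digest(websub_url_hmac(key, feed_id, timestamp, nonce), url_sig)
    ok_body = hmac.compare_digest(websub_body_hmac(key, body), header_sig.split('=', 1)[-1])
    return ok_url and ok_body
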
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))