]> git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
cache POST requests, explicitly not cache some POSTs
[subscriptionfeed.git] / app / common / utils.py
1 #!/bin/sh
2 ''':'
3 . "`dirname "$0"`/../../venv/bin/activate"
4 exec python "$0" "$@"
5 ':'''
6
7 import os
8 import sys
9 import time
10 import secrets
11 import sqlite3
12 import requests
13 import subprocess
14 import html.parser
15
16 from common import *
17
# maps a feed type to the query parameter expected by youtube's RSS endpoint
# (used to build ".../feeds/videos.xml?<param>=<id>" URLs and fetch_xml calls)
feed_param = {
    'channel': 'channel_id',
    'playlist': 'playlist_id',
}
22
def pull_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Crawls youtube channels' RSS feeds and stores found videos in the database.
    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: fetch all known channels. otherwise only those not crawled in 24h
    limit: limit number of processed feeds
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        cur = conn.cursor()
        # note: an uncrawled channel has crawled_at NULL; IFNULL(...,0) makes
        # it compare below any datetime string, so it is always selected.
        cur.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN crawler AS c
            ON s.channel_id = c.channel_id
            WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
            ORDER BY crawled_at
            LIMIT ?
        """, (force_all, limit))
        stale_feeds = cur.fetchall()

        if not stale_feeds and verbose >= 2:
            sys.stderr.write(f'no feeds to update.\n')

        # pace requests: one minute between consecutive feed fetches.
        for index, (feed_id, feed_type) in enumerate(stale_feeds):
            if index > 0:
                time.sleep(60)
            pull_feed(feed_id, feed_type, conn, verbose)
48
def pull_feed(feed_id, feed_type, conn, verbose):
    """
    Fetches one feed's RSS and stores its videos via update_channel(),
    then bumps the feed's crawled_at timestamp.
    feed_id: youtube channel or playlist id
    feed_type: 'channel' or 'playlist' (key into feed_param)
    conn: open sqlite3 connection
    verbose: 0: silent; 1: warn on errors; 2: also log accessed feeds
    Returns True on success, False on fetch or store failure.
    """
    c = conn.cursor()

    if verbose >= 2:
        sys.stderr.write(f'fetching {feed_id}\n')

    xmlfeed = fetch_xml(feed_param[feed_type], feed_id)
    if not xmlfeed:
        if verbose:
            sys.stderr.write(f'FETCH FAILED: {feed_id}\n')
        return False

    try:
        update_channel(conn, xmlfeed)
    # bugfix: was a bare 'except:', which also swallowed SystemExit and
    # KeyboardInterrupt; best-effort error dump behavior is kept.
    except Exception:
        if verbose:
            sys.stderr.write(f'STORE FAILED: {feed_id}\n')
        # writing failed, so we store the feed in a file for later analysis.
        with open('/tmp/pull-subscriptions.err', 'ab') as f:
            f.write(f"<!-- {time.ctime()} ({int(time.time())}) -->\n"
                    .encode('ascii'))
            f.write(xmlfeed + b"\n")
        return False

    # update crawled_at timestamp:
    c.execute("""
        INSERT OR REPLACE INTO crawler (channel_id)
        VALUES (?)
    """, (feed_id,))

    conn.commit()
    return True
81
82
def update_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Refreshes the websub (pubsubhubbub) subscription requests for youtube feeds.
    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: refresh all channels, not only those expiring within 12 hours
    limit: limit number of processed feeds
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # bugfix: AND binds tighter than OR in SQL, so the unparenthesized
        # original selected playlists whenever force_all was set, although
        # the type='channel' filter was meant to apply unconditionally.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN websub AS w
            ON s.channel_id = w.channel_id
            WHERE (? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours'))
            AND type = 'channel' -- playlists don't support websub
            ORDER BY subscribed_until
            LIMIT ?
        """, (force_all,limit))
        results = c.fetchall()

        if verbose >= 2 and not len(results):
            sys.stderr.write(f'no feeds to update.\n')

        for i,(feed_id, feed_type) in enumerate(results):
            if i: time.sleep(60)  # pace requests to the hub
            update_feed(feed_id, feed_type, verbose)
108
def update_feed(feed_id, feed_type, verbose):
    """
    Sends a single websub subscription request for one feed to the
    pubsubhubbub hub. Returns True if the hub accepted it, else False.
    """
    webhook = cf['webhooks']['public_uri']
    lease = cf['websub']['lease']
    hmackey = cf['websub']['hmac_key']

    if verbose >= 2:
        sys.stderr.write(f'updating {feed_id}\n')

    version = "v1"
    timestamp = int(time.time())
    nonce = secrets.token_urlsafe(16)
    sig = websub_url_hmac(hmackey, feed_id, timestamp, nonce)

    # callback encodes everything needed to verify the hub's request later.
    callback = f"{webhook}/websub/{version}/{timestamp}/{nonce}/{feed_id}/{sig}"
    topic = (f"https://www.youtube.com/xml/feeds/videos.xml"
             f"?{feed_param[feed_type]}={feed_id}")

    # the subscription POST must always reach the hub; bypass the cache.
    import requests_cache
    with requests_cache.disabled():
        r = requests.post("https://pubsubhubbub.appspot.com/subscribe", {
            "hub.callback": callback,
            "hub.topic": topic,
            "hub.verify": "async",
            "hub.mode": "subscribe",
            "hub.lease_seconds": lease,
            "hub.secret": hmackey,
        })

    if r.ok:
        return True
    if verbose:
        sys.stderr.write(f'FAILED {feed_id}: {r.text}\n')
    return False
138
139
def refresh_cipher(verbose=1, force=False):
    """
    Checks whether youtube rolled out a new player.js; if so (or if force),
    re-extracts the signature scrambling algorithm and stores it in the db.
    verbose: 0: silent; 1: warn on errors; 2: also log progress
    force: re-extract even if the player url is unchanged
    Returns True on success or nothing-to-do, False on failure.
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT url FROM cipher")
        (player_url,) = c.fetchone()

        new_url = find_player_url(verbose)
        if not new_url:
            if verbose:
                sys.stderr.write(f'FAILED to get player url!\n')
            return False

        if player_url == new_url:
            if verbose >= 2:
                sys.stderr.write(f'player url unchanged.\n')
            if not force:
                return True

        # NOTE(review): 're' is not imported at the top of this file; it is
        # presumably re-exported by 'from common import *' -- verify.
        match = re.match(r"/s/player/(.*?)\.js", new_url)
        # bugfix: an unexpected url shape previously crashed with
        # AttributeError on .groups(); fail gracefully instead.
        if not match:
            if verbose:
                sys.stderr.write(f'FAILED to parse player url {new_url}!\n')
            return False
        (cipher_id,) = match.groups()
        sts, algo = ytdown_guess(cipher_id, verbose, force)
        if not sts or not algo:
            return False

        c.execute("""
            INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
            VALUES (0,?,?,?)
        """, (new_url, sts, algo))

        return True
169
def find_player_url(verbose, id="jNQXAC9IVRw"):
    """
    Extract the player.js URL, which can be passed to youtubedown.
    Requests a random video (defaults to the first ever uploaded one, "Me at the
    zoo". It shouldn't go away any time soon) and parses the returned Tag Soup.
    Note that the URL we're looking for contains our locale, so specify it to
    avoid it changing.
    """
    class PlayerJsLocator(html.parser.HTMLParser):
        # remembers the src of the last <script> tag ending in /en_US/base.js
        player_js = None

        def handle_starttag(self, tag, attrs):
            if tag != "script":
                return
            src = dict(attrs).get("src", "")
            if src.endswith("/en_US/base.js"):
                self.player_js = src

    if verbose >= 2:
        sys.stderr.write(f'fetching embed page {id}\n')
    r = requests.get(f"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
    if not r.ok:
        if verbose:
            sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
        return None

    locator = PlayerJsLocator()
    locator.feed(r.text)
    if verbose >= 2:
        sys.stderr.write(f'player.js is {locator.player_js}\n')
    return locator.player_js
200
def ytdown_guess(cipher_id, verbose, force):
    """
    Runs jwz's youtubedown's guess_cipher() under perl to extract the sts
    (signature timestamp) and the scrambling algorithm for a player version.
    Returns (sts, algo) as strings, or (None, None) on failure.
    """
    # NOTE(review): fixed, predictable path in a world-writable directory --
    # a local attacker could pre-create it, and the file is *executed* as
    # perl below. consider a per-user cache dir instead.
    ytdown = "/tmp/youtubedown.pm"

    # update youtubedown once a week:
    if force or not os.path.isfile(ytdown) or \
            os.stat(ytdown).st_mtime < time.time()-7*24*60*60 or \
            os.stat(ytdown).st_size == 0:  # if previous write failed
        if verbose >= 2:
            sys.stderr.write('downloading youtubedown\n')
        r = requests.get("https://www.jwz.org/hacks/youtubedown")  # UA sniffing!
        if not r.ok:
            if verbose:
                sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
            return None, None
        elif verbose >= 2:
            sys.stderr.write(f'done (code {r.status_code})\n')

        with open(ytdown, 'wb') as f:
            f.write(r.content)

    perl = subprocess.run(
        ["perl", "-wE", """
            require $ARGV[0];
            say guess_cipher($ARGV[1], 0, 0);
        """, "--", ytdown, cipher_id
        ],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    # bugfix: returncode is *negative* when perl dies from a signal; testing
    # '> 0' treated such deaths as success. check for any non-zero exit.
    if perl.returncode != 0:
        if verbose:
            sys.stderr.write(f'FAILED guess_cipher (exit:{perl.returncode}):\n')
            sys.stderr.write(perl.stderr.decode())
        return None, None
    sts, algo = perl.stdout.decode('ascii').strip().split(" ", 1)
    if verbose >= 2:
        sys.stderr.write(f'sts, algo = {sts}, {algo}\n')
    return sts, algo
240
241
if __name__ == '__main__':
    # crude flag parsing: presence of a flag anywhere in argv enables it.
    argv = sys.argv
    if '-vv' in argv:
        verbosity = 2
    elif '-v' in argv:
        verbosity = 1
    else:
        verbosity = 0
    limit = 1 if '-1' in argv else -1
    force = '-f' in argv

    if 'pull' in argv:
        pull_subscriptions(verbosity, force, limit)
    elif 'websub' in argv:
        update_subscriptions(verbosity, force, limit)
    elif 'cipher' in argv:
        refresh_cipher(verbosity, force)
    else:
        sys.stderr.write(
            f'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
            f'-f: force even if still up-to-date-ish\n'
            f'-v: report errors\n'
            f'-vv: report accessed feeds\n'
            f'-1: limit to one feed (for testing it works)\n')
        sys.exit(1)
Imprint / Impressum