]> git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
fix writing youtubedown (unicode chars)
[subscriptionfeed.git] / app / common / utils.py
1 #!/bin/sh
2 ''':'
3 . "`dirname "$0"`/../../venv/bin/activate"
4 exec python "$0" "$@"
5 ':'''
6
import os
import re
import sys
import time
import secrets
import sqlite3
import subprocess
import html.parser

import requests
15
16 from common import *
17
# Maps a subscription type to the query-string parameter name that youtube's
# RSS endpoint (/xml/feeds/videos.xml?<param>=<id>) expects for that type.
feed_param = {
    'channel': 'channel_id',
    'playlist': 'playlist_id',
}
22
def pull_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Crawls youtube channels' RSS feeds and stores found videos in the database.

    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: fetch all known channels. otherwise only those not crawled in 24h
    limit: limit number of processed feeds (-1: no limit)
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # IFNULL(...,0): never-crawled feeds sort first; in SQLite an integer
        # always compares less than a text value, so 0 < datetime(...) holds.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN crawler AS c
                ON s.channel_id = c.channel_id
            WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
            ORDER BY crawled_at
            LIMIT ?
        """, (force_all, limit))
        results = c.fetchall()

        if verbose >= 2 and not results:
            sys.stderr.write('no feeds to update.\n')  # was a placeholder-less f-string

        for i, (feed_id, feed_type) in enumerate(results):
            if i:  # rate-limit: pause one minute between feeds
                time.sleep(60)
            pull_feed(feed_id, feed_type, conn, verbose)
48
def pull_feed(feed_id, feed_type, conn, verbose):
    """
    Fetches a single channel/playlist RSS feed and stores its videos in the
    database via update_channel().

    Returns True on success, False if fetching or storing failed.
    On store failure, the raw feed is appended to /tmp/pull-subscriptions.err
    for later analysis.
    """
    c = conn.cursor()

    if verbose >= 2:
        sys.stderr.write(f'fetching {feed_id}\n')

    xmlfeed = fetch_xml(feed_param[feed_type], feed_id)
    if not xmlfeed:
        if verbose:
            sys.stderr.write(f'FETCH FAILED: {feed_id}\n')
        return False

    try:
        update_channel(conn, xmlfeed)
    except Exception:  # was bare 'except:', which also swallowed SystemExit/KeyboardInterrupt
        if verbose:
            sys.stderr.write(f'STORE FAILED: {feed_id}\n')
        # writing failed, so we store the feed in a file for later analysis.
        with open('/tmp/pull-subscriptions.err', 'ab') as f:
            f.write(f"<!-- {time.ctime()} ({int(time.time())}) -->\n"
                    .encode('ascii'))
            f.write(xmlfeed + b"\n")
        return False

    # update crawled_at timestamp (assumes the crawler table defaults
    # crawled_at to the current time on insert -- schema not visible here):
    c.execute("""
        INSERT OR REPLACE INTO crawler (channel_id)
        VALUES (?)
    """, (feed_id,))

    conn.commit()
    return True
81
82
def update_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Refreshes the websub (pubsubhubbub) subscription requests for youtube feeds.

    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: renew all channels, not only those expiring within 12 hours
    limit: limit number of processed feeds (-1: no limit)
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # Parentheses are required: in SQL, AND binds tighter than OR, so the
        # original un-parenthesized WHERE let force_all bypass the
        # type='channel' filter and request websub for playlists, which (per
        # the comment below) don't support websub at all.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN websub AS w
                ON s.channel_id = w.channel_id
            WHERE (? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours'))
                AND type = 'channel' -- playlists don't support websub
            ORDER BY subscribed_until
            LIMIT ?
        """, (force_all, limit))
        results = c.fetchall()

        if verbose >= 2 and not results:
            sys.stderr.write('no feeds to update.\n')

        for i, (feed_id, feed_type) in enumerate(results):
            if i:  # rate-limit: pause one minute between feeds
                time.sleep(60)
            update_feed(feed_id, feed_type, verbose)
108
def update_feed(feed_id, feed_type, verbose):
    """
    Sends one (re)subscription request to the pubsubhubbub hub for the given
    feed. The callback URL embeds a version tag, timestamp, nonce and HMAC
    signature so the websub endpoint can later authenticate the hub's calls.

    Returns True if the hub accepted the request, False otherwise.
    """
    webhook = cf['websub']['public_uri']
    lease = cf['websub']['lease']
    hmackey = cf['websub']['hmac_key']

    if verbose >= 2:
        sys.stderr.write(f'updating {feed_id}\n')

    version = "v1"
    timestamp = int(time.time())
    nonce = secrets.token_urlsafe(16)
    sig = websub_url_hmac(hmackey, feed_id, timestamp, nonce)

    callback = (f"{webhook}/websub/{version}/{timestamp}/"
                f"{nonce}/{feed_id}/{sig}")
    topic = ("https://www.youtube.com/xml/feeds/videos.xml"
             f"?{feed_param[feed_type]}={feed_id}")

    response = requests.post("https://pubsubhubbub.appspot.com/subscribe", {
        "hub.callback": callback,
        "hub.topic": topic,
        "hub.verify": "async",
        "hub.mode": "subscribe",
        "hub.lease_seconds": lease,
        "hub.secret": hmackey,
    })
    if response.ok:
        return True

    if verbose:
        sys.stderr.write(f'FAILED {feed_id}: {response.text}\n')
    return False
136
137
def refresh_cipher(verbose=1, force=False):
    """
    Checks whether youtube's player.js URL changed and, if so, re-extracts the
    signature timestamp (sts) and decryption algorithm via ytdown_guess() and
    stores them in the database.

    force: refresh even if the player URL is unchanged
    Returns True on success or nothing-to-do, False on failure.
    Note: relies on 're' being in scope (it was used but never imported in
    this file; presumably re-exported by 'from common import *').
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT url FROM cipher")
        (player_url,) = c.fetchone()

        new_url = find_player_url(verbose)
        if not new_url:
            if verbose:
                sys.stderr.write('FAILED to get player url!\n')
            return False

        if player_url == new_url:
            if verbose >= 2:
                sys.stderr.write('player url unchanged.\n')
            if not force:
                return True

        # Guard the match: the original called .groups() directly and raised
        # AttributeError if youtube ever changes the player URL format.
        match = re.match(r"/s/player/(.*?)\.js", new_url)
        if not match:
            if verbose:
                sys.stderr.write(f'FAILED to parse player url {new_url}!\n')
            return False
        (cipher_id,) = match.groups()

        sts, algo = ytdown_guess(cipher_id, verbose, force)
        if sts is None or algo is None:
            # extraction failed; keep the old cipher instead of storing NULLs
            return False

        c.execute("""
            INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
            VALUES (0,?,?,?)
        """, (new_url, sts, algo))
        return True
163
def find_player_url(verbose, id="jNQXAC9IVRw"):
    """
    Extract the player.js URL, which can be passed to youtubedown.
    Requests a random video (defaults to the first ever uploaded one, "Me at
    the zoo". It shouldn't go away any time soon) and parses the returned Tag
    Soup. Note that the URL we're looking for contains our locale, so specify
    it to avoid it changing.

    Returns the player.js path, or None if the page could not be fetched or
    the script tag was not found.
    """
    class PlayerJsExtractor(html.parser.HTMLParser):
        # Scans for <script name="player_ias/base" src="..."> and remembers
        # the src attribute; feeds the markup immediately on construction.
        def __init__(self, markup):
            super().__init__()
            self.player_js = None
            super().feed(markup)

        def handle_starttag(self, tag, attrs):
            if tag != "script":
                return
            attrs = dict(attrs)
            if attrs.get("name") == "player_ias/base":
                self.player_js = attrs.get("src")

    if verbose >= 2:
        sys.stderr.write(f'fetching embed page {id}\n')
    r = requests.get(f"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
    if not r.ok:
        if verbose:
            sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
        return None

    player_js = PlayerJsExtractor(r.text).player_js
    if verbose >= 2:
        sys.stderr.write(f'player.js is {player_js}\n')
    return player_js
194
def ytdown_guess(cipher_id, verbose, force):
    """
    Obtains the signature timestamp (sts) and signature decryption algorithm
    for a given player.js version by delegating to jwz's youtubedown perl
    script, which is downloaded to /tmp and refreshed weekly.

    Returns (sts, algo) as strings, or (None, None) on failure.
    """
    ytdown = "/tmp/youtubedown.pm"

    # update youtubedown once a week:
    if force or not os.path.isfile(ytdown) or \
            os.stat(ytdown).st_mtime < time.time()-7*24*60*60 or \
            os.stat(ytdown).st_size == 0:  # if previous write failed
        if verbose >= 2:
            sys.stderr.write('downloading youtubedown\n')
        r = requests.get("https://www.jwz.org/hacks/youtubedown")  # UA sniffing!
        if not r.ok:
            if verbose:
                sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
            return None, None
        elif verbose >= 2:
            sys.stderr.write(f'done (code {r.status_code})\n')

        # youtubedown unconditionally calls "main(); exit 0;", which breaks
        # using it as a module: strip those two lines and end with a true
        # value ("1;") as perl modules must.
        contents = "\n".join(r.text.splitlines()[:-2] + ["1;"])

        # youtubedown contains non-ascii characters, so write it as utf-8:
        with open(ytdown, 'w', encoding='utf-8') as f:
            f.write(contents)

    perl = subprocess.run(
        ["perl", "-wE", """
        require $ARGV[0];
        say guess_cipher($ARGV[1], 0, 0);
        """, "--", ytdown, cipher_id
        ],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL
    )
    output = perl.stdout.decode('ascii').strip()
    # The original unpacked output.split(" ", 1) unconditionally, which
    # raised ValueError on empty stdout whenever guess_cipher died
    # (e.g. unsupported player version). Fail soft instead:
    if perl.returncode != 0 or " " not in output:
        if verbose:
            sys.stderr.write(f'guess_cipher FAILED (exit:{perl.returncode})\n')
        return None, None
    sts, algo = output.split(" ", 1)
    if verbose >= 2:
        sys.stderr.write(f'sts, algo = {sts}, {algo} (exit:{perl.returncode})\n')
    return sts, algo
233
234
if __name__ == '__main__':
    # Crude argv handling: flags may appear anywhere on the command line;
    # the first recognized command word selects the action.
    verbosity = 2 if '-vv' in sys.argv else 1 if '-v' in sys.argv else 0
    limit = 1 if '-1' in sys.argv else -1  # '-1': process a single feed only
    force = '-f' in sys.argv

    if 'pull' in sys.argv:
        pull_subscriptions(verbosity, force, limit)
    elif 'websub' in sys.argv:
        update_subscriptions(verbosity, force, limit)
    elif 'cipher' in sys.argv:
        refresh_cipher(verbosity, force)
    else:
        sys.stderr.write(
            f'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
            f'-f: force even if still up-to-date-ish\n'
            f'-v: report errors\n'
            f'-vv: report accessed feeds\n'
            f'-1: limit to one feed (for testing it works)\n')
        sys.exit(1)
Imprint / Impressum