]> git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
don't need to patch youtubedown any more
[subscriptionfeed.git] / app / common / utils.py
1 #!/bin/sh
2 ''':'
3 . "`dirname "$0"`/../../venv/bin/activate"
4 exec python "$0" "$@"
5 ':'''
6
7 import os
8 import sys
9 import time
10 import secrets
11 import sqlite3
12 import requests
13 import subprocess
14 import html.parser
15
16 from common import *
17
# Maps a subscription's type to the query-string parameter that youtube's
# RSS endpoint (/xml/feeds/videos.xml) expects for that kind of feed;
# used both when fetching feeds and when building websub topic URLs.
feed_param = {
    'channel': 'channel_id',
    'playlist': 'playlist_id',
}
22
def pull_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Crawls youtube channels' RSS feeds and stores found videos in the database.
    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: fetch all known channels. otherwise only those not crawled in 24h
    limit: limit number of processed feeds
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        cur = conn.cursor()
        cur.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN crawler AS c
                ON s.channel_id = c.channel_id
            WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
            ORDER BY crawled_at
            LIMIT ?
        """, (force_all,limit))
        rows = cur.fetchall()

        if not rows and verbose >= 2:
            sys.stderr.write(f'no feeds to update.\n')

        first = True
        for feed_id, feed_type in rows:
            # sleep between (not before) fetches, to go easy on youtube:
            if not first:
                time.sleep(60)
            first = False
            pull_feed(feed_id, feed_type, conn, verbose)
48
def pull_feed(feed_id, feed_type, conn, verbose):
    """
    Fetch one channel's/playlist's RSS feed and store its videos.

    feed_id:   youtube channel or playlist id
    feed_type: 'channel' or 'playlist' (key into feed_param)
    conn:      open sqlite3 connection; committed on success
    verbose:   0: silent; 1: warn on errors; 2: log progress
    Returns True on success, False if fetching or storing failed.
    """
    c = conn.cursor()

    if verbose >= 2:
        sys.stderr.write(f'fetching {feed_id}\n')

    xmlfeed = fetch_xml(feed_param[feed_type], feed_id)
    if not xmlfeed:
        if verbose:
            sys.stderr.write(f'FETCH FAILED: {feed_id}\n')
        return False

    try:
        update_channel(conn, xmlfeed)
    # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
    # narrowed to Exception so Ctrl-C still aborts the crawler.
    except Exception:
        if verbose:
            sys.stderr.write(f'STORE FAILED: {feed_id}\n')
        # writing failed, so we store the feed in a file for later analysis.
        with open('/tmp/pull-subscriptions.err', 'ab') as f:
            f.write(f"<!-- {time.ctime()} ({int(time.time())}) -->\n"
                    .encode('ascii'))
            f.write(xmlfeed + b"\n")
        return False

    # update crawled_at timestamp (presumably filled in by a column default,
    # since the INSERT only names channel_id -- TODO confirm against schema):
    c.execute("""
        INSERT OR REPLACE INTO crawler (channel_id)
        VALUES (?)
    """, (feed_id,))

    conn.commit()
    return True
81
82
def update_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Refreshes the websub (pubsubhubhub) subscription requests for youtube feeds.
    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: renew all channel feeds, not only those expiring within 12 hours
    limit: limit number of processed feeds
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # BUGFIX: the force/expiry test is now parenthesized. SQL's AND binds
        # tighter than OR, so the previous unparenthesized form parsed as
        # `? OR (expired AND type='channel')` -- with force_all=True that also
        # selected playlist feeds, which don't support websub.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN websub AS w
                ON s.channel_id = w.channel_id
            WHERE (? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours'))
                AND type = 'channel' -- playlists don't support websub
            ORDER BY subscribed_until
            LIMIT ?
        """, (force_all,limit))
        results = c.fetchall()

        if verbose >= 2 and not len(results):
            sys.stderr.write(f'no feeds to update.\n')

        for i,(feed_id, feed_type) in enumerate(results):
            if i: time.sleep(60)  # go easy on the hub between requests
            update_feed(feed_id, feed_type, verbose)
108
def update_feed(feed_id, feed_type, verbose):
    """
    (Re-)subscribe a single feed at youtube's websub hub.

    Signs the callback URL with our HMAC key so incoming pings can be
    verified later. Returns True if the hub accepted the request.
    """
    webhook = cf['websub']['public_uri']
    lease = cf['websub']['lease']
    hmackey = cf['websub']['hmac_key']

    if verbose >= 2:
        sys.stderr.write(f'updating {feed_id}\n')

    version = "v1"
    timestamp = int(time.time())
    nonce = secrets.token_urlsafe(16)
    sig = websub_url_hmac(hmackey, feed_id, timestamp, nonce)

    callback_url = (f"{webhook}/websub/{version}/{timestamp}/"
                    f"{nonce}/{feed_id}/{sig}")
    topic_url = (f"https://www.youtube.com/xml/feeds/videos.xml"
                 f"?{feed_param[feed_type]}={feed_id}")

    r = requests.post("https://pubsubhubbub.appspot.com/subscribe", {
        "hub.callback": callback_url,
        "hub.topic": topic_url,
        "hub.verify": "async",
        "hub.mode": "subscribe",
        "hub.lease_seconds": lease,
        "hub.secret": hmackey,
    })
    if r.ok:
        return True

    if verbose:
        sys.stderr.write(f'FAILED {feed_id}: {r.text}\n')
    return False
136
137
def refresh_cipher(verbose=1, force=False):
    """
    Re-extract the signature cipher from youtube's current player.js and
    store it (url, sts, algorithm) in the single-row `cipher` table.

    verbose: 0: silent; 1: warn on errors; 2: log progress
    force: re-guess the cipher even if the player url is unchanged
    Returns True on success (or unchanged w/o force), False on failure.
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT url FROM cipher")
        # robustness: an empty cipher table used to crash the tuple-unpack;
        # treat it as "no previous url", forcing a refresh below.
        row = c.fetchone()
        player_url = row[0] if row else None

        new_url = find_player_url(verbose)
        if not new_url:
            if verbose:
                sys.stderr.write(f'FAILED to get player url!\n')
            return False

        if player_url == new_url:
            if verbose >= 2:
                sys.stderr.write(f'player url unchanged.\n')
            if not force:
                return True

        # robustness: an unexpected url format used to raise AttributeError
        # on the unguarded re.match(...).groups(); fail gracefully instead.
        m = re.match(r"/s/player/(.*?)\.js", new_url)
        if not m:
            if verbose:
                sys.stderr.write(f'FAILED to parse player url!\n')
            return False
        (cipher_id,) = m.groups()

        sts, algo = ytdown_guess(cipher_id, verbose, force)
        if not sts or not algo:
            return False

        c.execute("""
            INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
            VALUES (0,?,?,?)
        """, (new_url, sts, algo))

        return True
167
def find_player_url(verbose, id="jNQXAC9IVRw"):
    """
    Extract the player.js URL, which can be passed to youtubedown.

    Requests a random video (defaulting to the first one ever uploaded, "Me
    at the zoo" -- it shouldn't go away any time soon) and parses the
    returned tag soup for <script name="player_ias/base" src="...">.
    The URL we're looking for contains our locale, so hl/gl are pinned to
    keep it from changing. Returns the src attribute, or None.
    """
    class _PlayerJsFinder(html.parser.HTMLParser):
        # collects the src of the script tag named "player_ias/base"
        player_js = None

        def handle_starttag(self, tag, attrs):
            if tag != "script":
                return
            attr_map = dict(attrs)
            if attr_map.get("name") == "player_ias/base":
                self.player_js = attr_map.get("src")

    if verbose >= 2:
        sys.stderr.write(f'fetching embed page {id}\n')
    r = requests.get(f"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
    if not r.ok:
        if verbose:
            sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
        return None

    finder = _PlayerJsFinder()
    finder.feed(r.text)
    player_js = finder.player_js
    if verbose >= 2:
        sys.stderr.write(f'player.js is {player_js}\n')
    return player_js
198
def ytdown_guess(cipher_id, verbose, force):
    """
    Run youtubedown's guess_cipher() via perl to obtain the signature
    timestamp and descrambling algorithm for a given player version.

    cipher_id: player version id, extracted from the player.js url
    verbose:   0: silent; 1: warn on errors; 2: log progress
    force:     re-download youtubedown even if the cached copy is fresh
    Returns (sts, algo) on success, (None, None) on failure.
    """
    ytdown = "/tmp/youtubedown.pm"

    # update youtubedown once a week:
    if force or not os.path.isfile(ytdown) or \
            os.stat(ytdown).st_mtime < time.time()-7*24*60*60 or \
            os.stat(ytdown).st_size == 0:  # if previous write failed
        if verbose >= 2:
            sys.stderr.write('downloading youtubedown\n')
        # NOTE: we download and later execute third-party perl code; this is
        # a deliberate trust decision in the original design.
        r = requests.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
        if not r.ok:
            if verbose:
                sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
            return None, None
        elif verbose >= 2:
            sys.stderr.write(f'done (code {r.status_code})\n')

        with open(ytdown, 'wb') as f:
            f.write(r.content)

    perl = subprocess.run(
        ["perl", "-wE", """
            require $ARGV[0];
            say guess_cipher($ARGV[1], 0, 0);
        """, "--", ytdown, cipher_id
        ],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    # BUGFIX: was `returncode > 0`, which treated a signal-terminated perl
    # (negative returncode on POSIX) as success and then crashed parsing
    # the empty stdout below.
    if perl.returncode != 0:
        if verbose:
            sys.stderr.write(f'FAILED guess_cipher (exit:{perl.returncode}):\n')
            sys.stderr.write(perl.stderr.decode())
        return None, None
    try:
        sts, algo = perl.stdout.decode('ascii').strip().split(" ", 1)
    except ValueError:  # guess_cipher printed something unexpected
        if verbose:
            sys.stderr.write(f'FAILED guess_cipher (unparseable output):\n')
            sys.stderr.write(perl.stdout.decode(errors='replace'))
        return None, None
    if verbose >= 2:
        sys.stderr.write(f'sts, algo = {sts}, {algo}\n')
    return sts, algo
238
239
if __name__ == '__main__':
    # crude flag handling: flags may appear anywhere in argv, order-free.
    args = sys.argv
    verbosity = 2 if '-vv' in args else (1 if '-v' in args else 0)
    limit = 1 if '-1' in args else -1
    force = '-f' in args

    if 'pull' in args:
        pull_subscriptions(verbosity, force, limit)
    elif 'websub' in args:
        update_subscriptions(verbosity, force, limit)
    elif 'cipher' in args:
        refresh_cipher(verbosity, force)
    else:
        usage = (
            f'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
            f'-f: force even if still up-to-date-ish\n'
            f'-v: report errors\n'
            f'-vv: report accessed feeds\n'
            f'-1: limit to one feed (for testing it works)\n')
        sys.stderr.write(usage)
        sys.exit(1)
Imprint / Impressum