]>
git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
3 . "`dirname "$0"`/../../venv/bin/activate"
19 'channel': 'channel_id',
20 'playlist': 'playlist_id',
23 def pull_subscriptions(verbose
=1, force_all
=False, limit
=-1):
25 Crawls youtube channels' RSS feeds and stores found videos in the database.
26 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
27 force_all: fetch all known channels; otherwise only those not crawled in 24h
28 limit: limit number of processed feeds
30 with sqlite3
.connect(cf
['global']['database']) as conn
:
33 SELECT DISTINCT s.channel_id, type
34 FROM subscriptions AS s LEFT JOIN crawler AS c
35 ON s.channel_id = c.channel_id
36 WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
39 """, (force_all
,limit
))
40 results
= c
.fetchall()
42 if verbose
>= 2 and not len(results
):
43 sys
.stderr
.write(f
'no feeds to update.\n')
45 for i
,(feed_id
, feed_type
) in enumerate(results
):
47 pull_feed(feed_id
, feed_type
, conn
, verbose
)
49 def pull_feed(feed_id
, feed_type
, conn
, verbose
):
53 sys
.stderr
.write(f
'fetching {feed_id}\n')
55 xmlfeed
= fetch_xml(feed_param
[feed_type
], feed_id
)
58 sys
.stderr
.write(f
'FETCH FAILED: {feed_id}\n')
62 update_channel(conn
, xmlfeed
)
65 sys
.stderr
.write(f
'STORE FAILED: {feed_id}\n')
66 # writing failed, so we store the feed in a file for later analysis.
67 with
open('/tmp/pull-subscriptions.err', 'ab') as f
:
68 f
.write(f
"<!-- {time.ctime()} ({int(time.time())}) -->\n"
70 f
.write(xmlfeed
+ b
"\n")
73 # update crawled_at timestamp:
75 INSERT OR REPLACE INTO crawler (channel_id)
83 def update_subscriptions(verbose
=1, force_all
=False, limit
=-1):
85 Refreshes the websub (pubsubhubbub) subscription requests for youtube feeds.
86 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
87 limit: limit number of processed feeds
89 with sqlite3
.connect(cf
['global']['database']) as conn
:
92 SELECT DISTINCT s.channel_id, type
93 FROM subscriptions AS s LEFT JOIN websub AS w
94 ON s.channel_id = w.channel_id
95 WHERE ? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours')
96 AND type = 'channel' -- playlists don't support websub
97 ORDER BY subscribed_until
99 """, (force_all
,limit
))
100 results
= c
.fetchall()
102 if verbose
>= 2 and not len(results
):
103 sys
.stderr
.write(f
'no feeds to update.\n')
105 for i
,(feed_id
, feed_type
) in enumerate(results
):
107 update_feed(feed_id
, feed_type
, verbose
)
109 def update_feed(feed_id
, feed_type
, verbose
):
110 webhook
= cf
['webhooks']['public_uri']
111 lease
= cf
['websub']['lease']
112 hmackey
= cf
['websub']['hmac_key']
115 sys
.stderr
.write(f
'updating {feed_id}\n')
117 version
, timestamp
= "v1", int(time
.time())
118 nonce
= secrets
.token_urlsafe(16)
119 sig
= websub_url_hmac(hmackey
, feed_id
, timestamp
, nonce
)
120 import requests_cache
121 with requests_cache
.disabled():
122 r
= requests
.post("https://pubsubhubbub.appspot.com/subscribe", {
123 "hub.callback": f
"{webhook}/websub/{version}/{timestamp}/" + \
124 f
"{nonce}/{feed_id}/{sig}",
125 "hub.topic": f
"https://www.youtube.com/xml/feeds/videos.xml" + \
126 f
"?{feed_param[feed_type]}={feed_id}",
127 "hub.verify": "async",
128 "hub.mode": "subscribe",
129 "hub.lease_seconds": lease
,
130 "hub.secret": hmackey
,
134 sys
.stderr
.write(f
'FAILED {feed_id}: {r.text}\n')
140 def refresh_cipher(verbose
=1, force
=False):
141 with sqlite3
.connect(cf
['global']['database']) as conn
:
143 c
.execute("SELECT url FROM cipher")
144 (player_url
,) = c
.fetchone()
146 new_url
= find_player_url(verbose
)
149 sys
.stderr
.write(f
'FAILED to get player url!\n')
152 if player_url
== new_url
:
154 sys
.stderr
.write(f
'player url unchanged.\n')
158 (cipher_id
,) = re
.match(r
"/s/player/(.*?)\.js", new_url
).groups()
159 sts
, algo
= ytdown_guess(cipher_id
, verbose
, force
)
160 if not sts
or not algo
:
164 INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
166 """, (new_url
, sts
, algo
))
170 def find_player_url(verbose
, id="jNQXAC9IVRw"):
172 Extract the player.js URL, which can be passed to youtubedown.
173 Requests an arbitrary video (defaults to the first one ever uploaded, "Me at
174 the zoo", which shouldn't go away any time soon) and parses the returned Tag Soup.
175 Note that the URL we're looking for contains our locale, so specify it to
178 class FindPlayer(html
.parser
.HTMLParser
):
179 def __init__(self
, feed
):
181 self
.player_js
= None
183 def handle_starttag(self
, tag
, attrs
):
185 if tag
== "script" and attrs
.get("src", "").endswith("/en_US/base.js"):
186 self
.player_js
= attrs
.get("src")
189 sys
.stderr
.write(f
'fetching embed page {id}\n')
190 r
= requests
.get(f
"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
193 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
196 player_js
= FindPlayer(r
.text
).player_js
198 sys
.stderr
.write(f
'player.js is {player_js}\n')
201 def ytdown_guess(cipher_id
, verbose
, force
):
202 ytdown
= "/tmp/youtubedown.pm"
204 # update youtubedown once a week:
205 if force
or not os
.path
.isfile(ytdown
) or \
206 os
.stat(ytdown
).st_mtime
< time
.time()-7*24*60*60 or \
207 os
.stat(ytdown
).st_size
== 0: # if previous write failed
209 sys
.stderr
.write('downloading youtubedown\n')
210 r
= requests
.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
213 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
216 sys
.stderr
.write(f
'done (code {r.status_code})\n')
218 with
open(ytdown
, 'wb') as f
:
221 perl
= subprocess
.run(
224 say guess_cipher($ARGV[1], 0, 0);
225 """, "--", ytdown
, cipher_id
227 stdin
=subprocess
.DEVNULL
,
228 stdout
=subprocess
.PIPE
,
229 stderr
=subprocess
.PIPE
231 if perl
.returncode
> 0:
233 sys
.stderr
.write(f
'FAILED guess_cipher (exit:{perl.returncode}):\n')
234 sys
.stderr
.write(perl
.stderr
.decode())
236 sts
, algo
= perl
.stdout
.decode('ascii').strip().split(" ", 1)
238 sys
.stderr
.write(f
'sts, algo = {sts}, {algo}\n')
242 if __name__
== '__main__':
243 verbosity
= 2 if '-vv' in sys
.argv
else 1 if '-v' in sys
.argv
else 0
244 limit
= 1 if '-1' in sys
.argv
else -1
245 force
= '-f' in sys
.argv
247 if 'pull' in sys
.argv
:
248 pull_subscriptions(verbosity
, force
, limit
)
249 elif 'websub' in sys
.argv
:
250 update_subscriptions(verbosity
, force
, limit
)
251 elif 'cipher' in sys
.argv
:
252 refresh_cipher(verbosity
, force
)
255 f
'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
256 f
' YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
257 f
' YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
258 f
'-f: force even if still up-to-date-ish\n'
259 f
'-v: report errors\n'
260 f
'-vv: report accessed feeds\n'
261 f
'-1: limit to one feed (for testing it works)\n')