]>
git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
3 . "`dirname "$0"`/../../venv/bin/activate"
19 'channel': 'channel_id',
20 'playlist': 'playlist_id',
# pull_subscriptions(verbose=1, force_all=False, limit=-1)
#
# Purpose: open the sqlite database named by cf['global']['database'], select
# the DISTINCT (channel_id, type) pairs from `subscriptions` LEFT JOINed with
# `crawler`, and call pull_feed(feed_id, feed_type, conn, verbose) for every
# returned row, sharing the one connection.
#
# Row selection: `force_all` is bound to the first `?`; when it is false only
# rows whose IFNULL(crawled_at,0) is older than datetime('now', '-1 day')
# match (never-crawled rows count as 0 and therefore match).
#
# Logging: when verbose >= 2 and the query returned nothing,
# 'no feeds to update.' is written to stderr.
#
# NOTE(review): this text is a token-per-line extraction of a web source
# view. The leading numbers are the original file's line numbers; original
# lines 24, 29, 31-32, 37-38 and 46 are not present here. Presumably they hold
# the docstring delimiters, the cursor creation / `c.execute("""` opener, and
# the tail of the SQL statement -- confirm against the real file.
# NOTE(review): `limit` is bound as a second SQL parameter but only one `?`
# is visible; presumably a `LIMIT ?` clause sits on a missing line.
# NOTE(review): the enumerate() index `i` is not used on any visible line;
# whatever consumes it is on missing original line 46.
# NOTE(review): `not len(results)` is equivalent to `not results`, and the
# f-string on original line 43 has no placeholders.
23 def pull_subscriptions(verbose
=1, force_all
=False, limit
=-1):
25 Crawls youtube channels' RSS feeds and stores found videos in the database.
26 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
27 force_all: fetch all known channels. otherwise only those not crawled in 24h
28 limit: limit number of processed feeds
30 with sqlite3
.connect(cf
['global']['database']) as conn
:
33 SELECT DISTINCT s.channel_id, type
34 FROM subscriptions AS s LEFT JOIN crawler AS c
35 ON s.channel_id = c.channel_id
36 WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
39 """, (force_all
,limit
))
40 results
= c
.fetchall()
42 if verbose
>= 2 and not len(results
):
43 sys
.stderr
.write(f
'no feeds to update.\n')
45 for i
,(feed_id
, feed_type
) in enumerate(results
):
47 pull_feed(feed_id
, feed_type
, conn
, verbose
)
# pull_feed(feed_id, feed_type, conn, verbose)
#
# Purpose: fetch a single feed with fetch_xml(feed_param[feed_type], feed_id)
# and pass the result to update_channel(conn, xmlfeed). fetch_xml,
# update_channel and feed_param are defined outside the visible text.
#
# Visible stderr messages: 'fetching {feed_id}', 'FETCH FAILED: {feed_id}'
# and 'STORE FAILED: {feed_id}'.
#
# On the store-failure path the raw feed (xmlfeed + b"\n") is appended to
# /tmp/pull-subscriptions.err, preceded by an XML comment that carries
# time.ctime() and the integer epoch time, so the feed can be analysed later.
#
# Finally an `INSERT OR REPLACE INTO crawler (channel_id)` statement is issued;
# per the inline comment this refreshes the crawled_at timestamp.
#
# NOTE(review): the conditions guarding each message and the error handling
# around update_channel are on lines missing from this extraction (original
# 50-52, 54, 56-57, 59-61, 63-64, 69, 71-72, 74, 76-82). Presumably these are
# `if verbose` checks, an emptiness test on xmlfeed, a try/except and the
# return statements -- confirm against the real file.
# NOTE(review): the error file is opened in binary append mode ('ab') while
# original line 68 builds a str; original line 69 is missing and presumably
# encodes it. If it does not, f.write() raises TypeError.
# NOTE(review): the dump file lives at a fixed path under /tmp and grows
# without bound on repeated failures.
49 def pull_feed(feed_id
, feed_type
, conn
, verbose
):
53 sys
.stderr
.write(f
'fetching {feed_id}\n')
55 xmlfeed
= fetch_xml(feed_param
[feed_type
], feed_id
)
58 sys
.stderr
.write(f
'FETCH FAILED: {feed_id}\n')
62 update_channel(conn
, xmlfeed
)
65 sys
.stderr
.write(f
'STORE FAILED: {feed_id}\n')
66 # writing failed, so we store the feed in a file for later analysis.
67 with
open('/tmp/pull-subscriptions.err', 'ab') as f
:
68 f
.write(f
"<!-- {time.ctime()} ({int(time.time())}) -->\n"
70 f
.write(xmlfeed
+ b
"\n")
73 # update crawled_at timestamp:
75 INSERT OR REPLACE INTO crawler (channel_id)
# update_subscriptions(verbose=1, force_all=False, limit=-1)
#
# Purpose: open the sqlite database named by cf['global']['database'], select
# DISTINCT (channel_id, type) pairs from `subscriptions` LEFT JOINed with
# `websub`, ordered by subscribed_until, and call
# update_feed(feed_id, feed_type, verbose) for every returned row.
#
# Row selection: `force_all` is bound to the first `?`; otherwise rows match
# when IFNULL(subscribed_until,0) is earlier than datetime('now','+12 hours')
# and type = 'channel'.
#
# Logging: when verbose >= 2 and the query returned nothing,
# 'no feeds to update.' is written to stderr.
#
# NOTE(review): in SQL, AND binds tighter than OR, so the visible WHERE clause
# groups as `? OR (subscribed_until-test AND type = 'channel')`. With
# force_all true the `type = 'channel'` filter is bypassed and playlist rows
# are selected too, which contradicts the "playlists don't support websub"
# remark. Likely intended: `WHERE (? OR ...) AND type = 'channel'`.
# NOTE(review): the docstring spells the protocol "pubsubhubhub"; the hub URL
# used elsewhere in this file is pubsubhubbub. The docstring also omits the
# force_all parameter.
# NOTE(review): this text is a token-per-line extraction; original lines 84,
# 88, 90-91, 98, 101, 104 and 106 are missing. `limit` is bound as a second
# SQL parameter, so presumably a `LIMIT ?` clause sits on missing line 98.
# NOTE(review): the enumerate() index `i` is not used on any visible line.
83 def update_subscriptions(verbose
=1, force_all
=False, limit
=-1):
85 Refreshes the websub (pubsubhubhub) subscription requests for youtube feeds.
86 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
87 limit: limit number of processed feeds
89 with sqlite3
.connect(cf
['global']['database']) as conn
:
92 SELECT DISTINCT s.channel_id, type
93 FROM subscriptions AS s LEFT JOIN websub AS w
94 ON s.channel_id = w.channel_id
95 WHERE ? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours')
96 AND type = 'channel' -- playlists don't support websub
97 ORDER BY subscribed_until
99 """, (force_all
,limit
))
100 results
= c
.fetchall()
102 if verbose
>= 2 and not len(results
):
103 sys
.stderr
.write(f
'no feeds to update.\n')
105 for i
,(feed_id
, feed_type
) in enumerate(results
):
107 update_feed(feed_id
, feed_type
, verbose
)
# update_feed(feed_id, feed_type, verbose)
#
# Purpose: send one WebSub "subscribe" request for a feed to the hub at
# https://pubsubhubbub.appspot.com/subscribe.
#
# Configuration read from cf['websub']: public_uri (callback base URL),
# lease (sent as hub.lease_seconds) and hmac_key.
#
# The callback URL is built as
#   {webhook}/websub/{version}/{timestamp}/{nonce}/{feed_id}/{sig}
# with version "v1", timestamp int(time.time()), a fresh
# secrets.token_urlsafe(16) nonce, and sig computed by
# websub_url_hmac(hmackey, feed_id, timestamp, nonce), which is defined
# outside the visible text.
# The topic URL is the YouTube videos.xml feed, with the query key taken from
# feed_param[feed_type].
# The form fields are passed as the second positional argument of
# requests.post, i.e. as the request body data.
#
# Visible stderr messages: 'updating {feed_id}' and
# 'FAILED {feed_id}: {r.text}'.
#
# NOTE(review): original lines 113-114, 116, 129-131 and 133-137 are missing
# from this extraction. Presumably they hold the verbose checks, the end of
# the post() call, the response status test and the return values -- confirm.
# NOTE(review): no timeout argument is visible on the post() call. Unless one
# is passed on missing line 129, a stalled hub connection blocks indefinitely.
# NOTE(review): the same hmac_key signs the callback URL and is also sent to
# the hub as hub.secret; confirm this sharing is intended.
109 def update_feed(feed_id
, feed_type
, verbose
):
110 webhook
= cf
['websub']['public_uri']
111 lease
= cf
['websub']['lease']
112 hmackey
= cf
['websub']['hmac_key']
115 sys
.stderr
.write(f
'updating {feed_id}\n')
117 version
, timestamp
= "v1", int(time
.time())
118 nonce
= secrets
.token_urlsafe(16)
119 sig
= websub_url_hmac(hmackey
, feed_id
, timestamp
, nonce
)
120 r
= requests
.post("https://pubsubhubbub.appspot.com/subscribe", {
121 "hub.callback": f
"{webhook}/websub/{version}/{timestamp}/" + \
122 f
"{nonce}/{feed_id}/{sig}",
123 "hub.topic": f
"https://www.youtube.com/xml/feeds/videos.xml" + \
124 f
"?{feed_param[feed_type]}={feed_id}",
125 "hub.verify": "async",
126 "hub.mode": "subscribe",
127 "hub.lease_seconds": lease
,
128 "hub.secret": hmackey
,
132 sys
.stderr
.write(f
'FAILED {feed_id}: {r.text}\n')
# refresh_cipher(verbose=1, force=False)
#
# Purpose: compare the player URL stored in the `cipher` table with the one
# currently reported by find_player_url(verbose). When a new URL is processed,
# the cipher id is extracted from it, (sts, algo) is obtained from
# ytdown_guess(cipher_id, verbose, force), and the row is written back with
# INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm).
#
# Visible stderr messages: 'FAILED to get player url!' and
# 'player url unchanged.'.
#
# NOTE(review): original lines 140, 143, 145-146, 148-149, 151, 153-155,
# 158-159, 161 and 163 are missing from this extraction. Presumably they hold
# the cursor creation, the verbose checks, the early returns and the VALUES
# clause -- confirm against the real file.
# NOTE(review): `(player_url,) = c.fetchone()` raises TypeError when the
# cipher table is empty, because fetchone() then returns None.
# NOTE(review): re.match(...) returns None when new_url does not start with
# "/s/player/" and contain ".js"; the chained .groups() then raises
# AttributeError.
# NOTE(review): the INSERT names four columns but binds three parameters;
# presumably rowid is a literal in the missing VALUES line.
# NOTE(review): whether `force` also overrides the "unchanged" check cannot be
# seen here; on visible lines it is only forwarded to ytdown_guess.
138 def refresh_cipher(verbose
=1, force
=False):
139 with sqlite3
.connect(cf
['global']['database']) as conn
:
141 c
.execute("SELECT url FROM cipher")
142 (player_url
,) = c
.fetchone()
144 new_url
= find_player_url(verbose
)
147 sys
.stderr
.write(f
'FAILED to get player url!\n')
150 if player_url
== new_url
:
152 sys
.stderr
.write(f
'player url unchanged.\n')
156 (cipher_id
,) = re
.match(r
"/s/player/(.*?)\.js", new_url
).groups()
157 sts
, algo
= ytdown_guess(cipher_id
, verbose
, force
)
160 INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
162 """, (new_url
, sts
, algo
))
# find_player_url(verbose, id="jNQXAC9IVRw")
#
# Purpose: request the embed page
# https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US and scan its HTML
# for the player script URL.
#
# The nested FindPlayer class subclasses html.parser.HTMLParser. Its
# handle_starttag records the `src` attribute of a <script> tag whose `name`
# attribute equals "player_ias/base" in self.player_js (initialised to None).
# The outer function constructs FindPlayer(r.text) and reads .player_js.
#
# Visible stderr messages: 'fetching embed page {id}',
# 'FAILED {r.status_code}: {r.text[:128]}' and 'player.js is {player_js}'.
#
# NOTE(review): original lines 165, 170-171, 174, 176, 178, 181-182, 185-186,
# 188-189, 191 and 193-194 are missing from this extraction. Presumably they
# hold the rest of the docstring, the parser initialisation / feed() call,
# the verbose and status checks and the return statements -- confirm.
# NOTE(review): HTMLParser passes `attrs` to handle_starttag as a list of
# (name, value) pairs, which has no .get(); presumably missing line 178
# converts it with dict(attrs). If not, AttributeError is raised on every
# matching tag test.
# NOTE(review): the parameter name `id` shadows the builtin. It is part of the
# public signature, so renaming it would affect keyword callers.
# NOTE(review): no timeout argument is visible on the requests.get call.
164 def find_player_url(verbose
, id="jNQXAC9IVRw"):
166 Extract the player.js URL, which can be passed to youtubedown.
167 Requests a random video (defaults to the first ever uploaded one, "Me at the
168 zoo". It shouldn't go away any time soon) and parses the returned Tag Soup.
169 Note that the URL we're looking for contains our locale, so specify it to
172 class FindPlayer(html
.parser
.HTMLParser
):
173 def __init__(self
, feed
):
175 self
.player_js
= None
177 def handle_starttag(self
, tag
, attrs
):
179 if tag
== "script" and attrs
.get("name") == "player_ias/base":
180 self
.player_js
= attrs
.get("src")
183 sys
.stderr
.write(f
'fetching embed page {id}\n')
184 r
= requests
.get(f
"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
187 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
190 player_js
= FindPlayer(r
.text
).player_js
192 sys
.stderr
.write(f
'player.js is {player_js}\n')
# ytdown_guess(cipher_id, verbose, force)
#
# Purpose: obtain (sts, algo) for a player cipher id by running an external
# helper over a locally cached copy of "youtubedown".
#
# Cache refresh: the copy at /tmp/youtubedown.pm is re-downloaded from
# https://www.jwz.org/hacks/youtubedown when `force` is true, the file does
# not exist, or its mtime is more than 7*24*60*60 seconds old.
# The downloaded text has its last two lines dropped and "1;" appended,
# because (per the inline comment) the script otherwise ends with
# "main(); exit 0;" and cannot be loaded as a module.
#
# Helper invocation: subprocess.run is called with ytdown and cipher_id as
# trailing arguments after "--", stdin and stderr set to DEVNULL and stdout
# captured. The inline program calls `guess_cipher($ARGV[1], 0, 0)`.
# Its stdout is decoded as ASCII, stripped and split once on a space into
# sts and algo.
#
# Visible stderr messages: 'downloading youtubedown',
# 'FAILED {r.status_code}: {r.text[:128]}', 'done (code {r.status_code})' and
# 'sts, algo = {sts}, {algo} (exit:{perl.returncode})'.
#
# NOTE(review): original lines 197, 201, 204-205, 207-208, 210, 214, 216-217,
# 219-220, 223, 227, 229 and 231-233 are missing from this extraction.
# Presumably they hold the verbose and status checks, the f.write(contents)
# call, the interpreter command line (presumably perl, judging by the variable
# name and inline program) and the return statement -- confirm.
# NOTE(review): the two-name unpacking on original line 228 raises ValueError
# when the helper prints nothing or a single token. The return code is only
# logged on visible lines, not checked.
# NOTE(review): code downloaded from the network is stored at a fixed,
# predictable path under /tmp and then loaded by the helper; consider a
# private cache directory.
# NOTE(review): no timeout argument is visible on the requests.get call.
195 def ytdown_guess(cipher_id
, verbose
, force
):
196 ytdown
= "/tmp/youtubedown.pm"
198 # update youtubedown once a week:
199 if force
or not os
.path
.isfile(ytdown
) or \
200 os
.stat(ytdown
).st_mtime
< time
.time()-7*24*60*60:
202 sys
.stderr
.write('downloading youtubedown\n')
203 r
= requests
.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
206 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
209 sys
.stderr
.write(f
'done (code {r.status_code})\n')
211 # youtubedown unconditionally calls "main(); exit 0;", which breaks
212 # using it as a module:
213 contents
= "\n".join(r
.text
.splitlines()[:-2] + ["1;"])
215 with
open(ytdown
, 'w') as f
:
218 perl
= subprocess
.run(
221 say guess_cipher($ARGV[1], 0, 0);
222 """, "--", ytdown
, cipher_id
224 stdin
=subprocess
.DEVNULL
,
225 stdout
=subprocess
.PIPE
,
226 stderr
=subprocess
.DEVNULL
228 sts
, algo
= perl
.stdout
.decode('ascii').strip().split(" ", 1)
230 sys
.stderr
.write(f
'sts, algo = {sts}, {algo} (exit:{perl.returncode})\n')
# Script entry point: a minimal command-line dispatcher.
#
# Flags are detected by plain membership tests on sys.argv, so their position
# does not matter:
#   -vv -> verbosity 2, -v -> verbosity 1, neither -> 0
#   -1  -> limit 1 (otherwise -1)
#   -f  -> force True
# The first matching sub-command word runs, tested in this order:
#   'pull'   -> pull_subscriptions(verbosity, force, limit)
#   'websub' -> update_subscriptions(verbosity, force, limit)
#   'cipher' -> refresh_cipher(verbosity, force)
# The adjacent f-strings at the end concatenate into one usage message.
#
# NOTE(review): original lines 238 and 245-246 are missing from this
# extraction. Presumably they hold the `else:` branch and the call that
# receives the usage text (its closing parenthesis is on original line 253)
# -- confirm which function is called and whether a non-zero exit status is
# set.
# NOTE(review): the usage strings on original lines 250-253 are f-strings
# without placeholders.
234 if __name__
== '__main__':
235 verbosity
= 2 if '-vv' in sys
.argv
else 1 if '-v' in sys
.argv
else 0
236 limit
= 1 if '-1' in sys
.argv
else -1
237 force
= '-f' in sys
.argv
239 if 'pull' in sys
.argv
:
240 pull_subscriptions(verbosity
, force
, limit
)
241 elif 'websub' in sys
.argv
:
242 update_subscriptions(verbosity
, force
, limit
)
243 elif 'cipher' in sys
.argv
:
244 refresh_cipher(verbosity
, force
)
247 f
'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
248 f
' YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
249 f
' YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
250 f
'-f: force even if still up-to-date-ish\n'
251 f
'-v: report errors\n'
252 f
'-vv: report accessed feeds\n'
253 f
'-1: limit to one feed (for testing it works)\n')