]>
git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
3 . "`dirname "$0"`/../../venv/bin/activate"
19 'channel': 'channel_id',
20 'playlist': 'playlist_id',
23 def pull_subscriptions(verbose
=1, force_all
=False, limit
=-1):
25 Crawls youtube channels' RSS feeds and stores found videos in the database.
26 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
27 force_all: fetch all known channels. otherwise only those not crawled in 24h
28 limit: limit number of processed feeds
30 with sqlite3
.connect(cf
['global']['database']) as conn
:
33 SELECT DISTINCT s.channel_id, type
34 FROM subscriptions AS s LEFT JOIN crawler AS c
35 ON s.channel_id = c.channel_id
36 WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
39 """, (force_all
,limit
))
40 results
= c
.fetchall()
42 if verbose
>= 2 and not len(results
):
43 sys
.stderr
.write(f
'no feeds to update.\n')
45 for i
,(feed_id
, feed_type
) in enumerate(results
):
47 pull_feed(feed_id
, feed_type
, conn
, verbose
)
49 def pull_feed(feed_id
, feed_type
, conn
, verbose
):
53 sys
.stderr
.write(f
'fetching {feed_id}\n')
55 xmlfeed
= fetch_xml(feed_param
[feed_type
], feed_id
)
58 sys
.stderr
.write(f
'FETCH FAILED: {feed_id}\n')
62 update_channel(conn
, xmlfeed
)
65 sys
.stderr
.write(f
'STORE FAILED: {feed_id}\n')
66 # writing failed, so we store the feed in a file for later analysis.
67 with
open('/tmp/pull-subscriptions.err', 'ab') as f
:
68 f
.write(f
"<!-- {time.ctime()} ({int(time.time())}) -->\n"
70 f
.write(xmlfeed
+ b
"\n")
73 # update crawled_at timestamp:
75 INSERT OR REPLACE INTO crawler (channel_id)
83 def update_subscriptions(verbose
=1, force_all
=False, limit
=-1):
85 Refreshes the websub (pubsubhubbub) subscription requests for youtube feeds.
86 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
force_all: refresh all known feeds. otherwise only those expiring within 12h
87 limit: limit number of processed feeds
89 with sqlite3
.connect(cf
['global']['database']) as conn
:
92 SELECT DISTINCT s.channel_id, type
93 FROM subscriptions AS s LEFT JOIN websub AS w
94 ON s.channel_id = w.channel_id
95 WHERE ? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours')
96 AND type = 'channel' -- playlists don't support websub
97 ORDER BY subscribed_until
99 """, (force_all
,limit
))
100 results
= c
.fetchall()
102 if verbose
>= 2 and not len(results
):
103 sys
.stderr
.write(f
'no feeds to update.\n')
105 for i
,(feed_id
, feed_type
) in enumerate(results
):
107 update_feed(feed_id
, feed_type
, verbose
)
109 def update_feed(feed_id
, feed_type
, verbose
):
110 webhook
= cf
['websub']['public_uri']
111 lease
= cf
['websub']['lease']
112 hmackey
= cf
['websub']['hmac_key']
115 sys
.stderr
.write(f
'updating {feed_id}\n')
117 version
, timestamp
= "v1", int(time
.time())
118 nonce
= secrets
.token_urlsafe(16)
119 sig
= websub_url_hmac(hmackey
, feed_id
, timestamp
, nonce
)
120 r
= requests
.post("https://pubsubhubbub.appspot.com/subscribe", {
121 "hub.callback": f
"{webhook}/websub/{version}/{timestamp}/" + \
122 f
"{nonce}/{feed_id}/{sig}",
123 "hub.topic": f
"https://www.youtube.com/xml/feeds/videos.xml" + \
124 f
"?{feed_param[feed_type]}={feed_id}",
125 "hub.verify": "async",
126 "hub.mode": "subscribe",
127 "hub.lease_seconds": lease
,
128 "hub.secret": hmackey
,
132 sys
.stderr
.write(f
'FAILED {feed_id}: {r.text}\n')
138 def refresh_cipher(verbose
=1, force
=False):
139 with sqlite3
.connect(cf
['global']['database']) as conn
:
141 c
.execute("SELECT url FROM cipher")
142 (player_url
,) = c
.fetchone()
144 new_url
= find_player_url(verbose
)
147 sys
.stderr
.write(f
'FAILED to get player url!\n')
150 if player_url
== new_url
:
152 sys
.stderr
.write(f
'player url unchanged.\n')
156 (cipher_id
,) = re
.match(r
"/s/player/(.*?)\.js", new_url
).groups()
157 sts
, algo
= ytdown_guess(cipher_id
, verbose
, force
)
160 INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
162 """, (new_url
, sts
, algo
))
164 def find_player_url(verbose
, id="jNQXAC9IVRw"):
166 Extract the player.js URL, which can be passed to youtubedown.
167 Requests a random video (defaults to the first ever uploaded one, "Me at the
168 zoo". It shouldn't go away any time soon) and parses the returned Tag Soup.
169 Note that the URL we're looking for contains our locale, so specify it to
172 class FindPlayer(html
.parser
.HTMLParser
):
173 def __init__(self
, feed
):
175 self
.player_js
= None
177 def handle_starttag(self
, tag
, attrs
):
179 if tag
== "script" and attrs
.get("name") == "player_ias/base":
180 self
.player_js
= attrs
.get("src")
183 sys
.stderr
.write(f
'fetching embed page {id}\n')
184 r
= requests
.get(f
"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
187 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
190 player_js
= FindPlayer(r
.text
).player_js
192 sys
.stderr
.write(f
'player.js is {player_js}\n')
195 def ytdown_guess(cipher_id
, verbose
, force
):
196 ytdown
= "/tmp/youtubedown.pm"
198 # update youtubedown once a week:
199 if force
or not os
.path
.isfile(ytdown
) or \
200 os
.stat(ytdown
).st_mtime
< time
.time()-7*24*60*60 or \
201 os
.stat(ytdown
).st_size
== 0: # if previous write failed
203 sys
.stderr
.write('downloading youtubedown\n')
204 r
= requests
.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
207 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
210 sys
.stderr
.write(f
'done (code {r.status_code})\n')
212 # youtubedown unconditionally calls "main(); exit 0;", which breaks
213 # using it as a module:
214 contents
= "\n".join(r
.text
.splitlines()[:-2] + ["1;"])
216 with
open(ytdown
, 'w', encoding
='utf-8') as f
:
219 perl
= subprocess
.run(
222 say guess_cipher($ARGV[1], 0, 0);
223 """, "--", ytdown
, cipher_id
225 stdin
=subprocess
.DEVNULL
,
226 stdout
=subprocess
.PIPE
,
227 stderr
=subprocess
.DEVNULL
229 sts
, algo
= perl
.stdout
.decode('ascii').strip().split(" ", 1)
231 sys
.stderr
.write(f
'sts, algo = {sts}, {algo} (exit:{perl.returncode})\n')
235 if __name__
== '__main__':
236 verbosity
= 2 if '-vv' in sys
.argv
else 1 if '-v' in sys
.argv
else 0
237 limit
= 1 if '-1' in sys
.argv
else -1
238 force
= '-f' in sys
.argv
240 if 'pull' in sys
.argv
:
241 pull_subscriptions(verbosity
, force
, limit
)
242 elif 'websub' in sys
.argv
:
243 update_subscriptions(verbosity
, force
, limit
)
244 elif 'cipher' in sys
.argv
:
245 refresh_cipher(verbosity
, force
)
248 f
'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
249 f
' YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
250 f
' YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
251 f
'-f: force even if still up-to-date-ish\n'
252 f
'-v: report errors\n'
253 f
'-vv: report accessed feeds\n'
254 f
'-1: limit to one feed (for testing it works)\n')