]>
git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
3 . "`dirname "$0"`/../../venv/bin/activate"
19 'channel': 'channel_id',
20 'playlist': 'playlist_id',
# Select the subscribed feeds that are due for a crawl and hand each of them
# to pull_feed() together with the open database connection. The parameters
# are described in the docstring text below.
# NOTE(review): this listing has gaps (the embedded line numbers skip 24, 29,
# 31-32, 37-38, 41, 44, 46 and 48), so the cursor setup, the tail of the SQL
# statement and anything between the loop header and the pull_feed() call
# are not visible here.
23 def pull_subscriptions(verbose
=1, force_all
=False, limit
=-1):
25 Crawls youtube channels' RSS feeds and stores found videos in the database.
26 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
27 force_all: fetch all known channels. otherwise only those not crawled in 24h
28 limit: limit number of processed feeds
30 with sqlite3
.connect(cf
['global']['database']) as conn
:
33 SELECT DISTINCT s.channel_id, type
34 FROM subscriptions AS s LEFT JOIN crawler AS c
35 ON s.channel_id = c.channel_id
36 WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
39 """, (force_all
,limit
))
# force_all is bound to the "?" in the WHERE clause, where a true value makes
# the age check irrelevant. limit is bound to a second placeholder that is
# presumably in the part of the statement not shown -- TODO confirm.
40 results
= c
.fetchall()
# Only the most verbose level reports that there was nothing to do.
42 if verbose
>= 2 and not len(results
):
43 sys
.stderr
.write(f
'no feeds to update.\n')
# Each result row is a (channel_id, type) pair as selected above.
45 for i
,(feed_id
, feed_type
) in enumerate(results
):
47 pull_feed(feed_id
, feed_type
, conn
, verbose
)
# Fetch one feed with fetch_xml() and pass the result to update_channel()
# together with the database connection.
#   feed_id:   identifier handed to fetch_xml() and echoed in the log messages
#   feed_type: key into feed_param; selects the query parameter name
#   conn:      database connection forwarded to update_channel()
#   verbose:   verbosity level; presumably gates the stderr messages, but the
#              guarding conditions are not visible in this listing
# NOTE(review): the listing has gaps (embedded numbers skip 50-52, 54, 56-57,
# 59-61, 63-64, 69, 71-72, 74 and 76-82), so the conditions leading to the
# FETCH FAILED / STORE FAILED messages, the exception handling and the return
# values cannot be seen here.
49 def pull_feed(feed_id
, feed_type
, conn
, verbose
):
53 sys
.stderr
.write(f
'fetching {feed_id}\n')
# feed_param maps the feed type to the parameter name passed to fetch_xml().
55 xmlfeed
= fetch_xml(feed_param
[feed_type
], feed_id
)
58 sys
.stderr
.write(f
'FETCH FAILED: {feed_id}\n')
62 update_channel(conn
, xmlfeed
)
65 sys
.stderr
.write(f
'STORE FAILED: {feed_id}\n')
66 # writing failed, so we store the feed in a file for later analysis.
# The dump file is opened in binary append mode, so earlier dumps are kept.
67 with
open('/tmp/pull-subscriptions.err', 'ab') as f
:
68 f
.write(f
"<!-- {time.ctime()} ({int(time.time())}) -->\n"
# xmlfeed is concatenated with a bytes literal, so it is handled as bytes.
70 f
.write(xmlfeed
+ b
"\n")
73 # update crawled_at timestamp:
75 INSERT OR REPLACE INTO crawler (channel_id)
83 def update_subscriptions(verbose
=1, force_all
=False, limit
=-1):
85 Refreshes the websub (pubsubhubhub) subscription requests for youtube feeds.
86 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
87 limit: limit number of processed feeds
89 with sqlite3
.connect(cf
['global']['database']) as conn
:
92 SELECT DISTINCT s.channel_id, type
93 FROM subscriptions AS s LEFT JOIN websub AS w
94 ON s.channel_id = w.channel_id
95 WHERE ? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours')
96 AND type = 'channel' -- playlists don't support websub
97 ORDER BY subscribed_until
99 """, (force_all
,limit
))
100 results
= c
.fetchall()
102 if verbose
>= 2 and not len(results
):
103 sys
.stderr
.write(f
'no feeds to update.\n')
105 for i
,(feed_id
, feed_type
) in enumerate(results
):
107 update_feed(feed_id
, feed_type
, verbose
)
# Send a subscribe request for one feed to the pubsubhubbub.appspot.com hub.
#   feed_id:   feed identifier; embedded in the callback URL and the topic URL
#   feed_type: key into feed_param; selects the topic URL's query parameter
#   verbose:   verbosity level; the conditions guarding the log messages are
#              not visible in this listing
# NOTE(review): the listing has gaps (embedded numbers skip 113-114, 116,
# 129-131 and 133-137), so the end of the request call, the check that leads
# to the FAILED message and any return value cannot be seen here.
109 def update_feed(feed_id
, feed_type
, verbose
):
# Callback base URI, lease duration and HMAC key come from the 'websub'
# section of the configuration object cf.
110 webhook
= cf
['websub']['public_uri']
111 lease
= cf
['websub']['lease']
112 hmackey
= cf
['websub']['hmac_key']
115 sys
.stderr
.write(f
'updating {feed_id}\n')
# The callback URL built below carries the version tag, the current unix
# timestamp, a fresh random nonce, the feed id and the value returned by
# websub_url_hmac() for those inputs.
117 version
, timestamp
= "v1", int(time
.time())
118 nonce
= secrets
.token_urlsafe(16)
119 sig
= websub_url_hmac(hmackey
, feed_id
, timestamp
, nonce
)
120 r
= requests
.post("https://pubsubhubbub.appspot.com/subscribe", {
121 "hub.callback": f
"{webhook}/websub/{version}/{timestamp}/" + \
122 f
"{nonce}/{feed_id}/{sig}",
123 "hub.topic": f
"https://www.youtube.com/xml/feeds/videos.xml" + \
124 f
"?{feed_param[feed_type]}={feed_id}",
125 "hub.verify": "async",
126 "hub.mode": "subscribe",
127 "hub.lease_seconds": lease
,
# NOTE(review): the same hmac_key serves as the hub secret and as the key
# passed to websub_url_hmac() above -- confirm that sharing it is intended.
128 "hub.secret": hmackey
,
132 sys
.stderr
.write(f
'FAILED {feed_id}: {r.text}\n')
# Compare the player URL stored in the cipher table with the one reported by
# find_player_url(), ask ytdown_guess() for the sts/algorithm pair of the new
# player and write url, sts and algorithm to the cipher table.
#   verbose: verbosity level, forwarded to find_player_url() and ytdown_guess()
#   force:   forwarded to ytdown_guess()
# NOTE(review): the listing has gaps (embedded numbers skip 140, 143, 145-146,
# 148-149, 151, 153-155, 159-161, 163 and 165-167), so the early exits after
# the "FAILED" / "unchanged" messages, the body of the sts/algo check and the
# value bound to rowid are not visible here.
138 def refresh_cipher(verbose
=1, force
=False):
139 with sqlite3
.connect(cf
['global']['database']) as conn
:
141 c
.execute("SELECT url FROM cipher")
# NOTE(review): fetchone() returns None when the cipher table has no row; the
# one-element unpacking would then raise TypeError.
142 (player_url
,) = c
.fetchone()
144 new_url
= find_player_url(verbose
)
147 sys
.stderr
.write(f
'FAILED to get player url!\n')
150 if player_url
== new_url
:
152 sys
.stderr
.write(f
'player url unchanged.\n')
# The cipher id is whatever sits between "/s/player/" and the first ".js".
# NOTE(review): re.match() only matches at the start of new_url and returns
# None otherwise; .groups() would then raise AttributeError.
156 (cipher_id
,) = re
.match(r
"/s/player/(.*?)\.js", new_url
).groups()
157 sts
, algo
= ytdown_guess(cipher_id
, verbose
, force
)
158 if not sts
or not algo
:
162 INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
164 """, (new_url
, sts
, algo
))
# Fetch the youtube-nocookie.com embed page of a video and pull the player
# script URL out of it with a small HTMLParser subclass.
#   verbose: verbosity level; the conditions guarding the log messages are
#            not visible in this listing
#   id:      video id inserted into the embed URL (shadows the builtin id()
#            inside this function)
# NOTE(review): the listing has gaps (embedded numbers skip 169, 174-175, 178,
# 180, 182, 185-186, 189-190, 192-193, 195 and 197-198), so the end of the
# docstring, parts of the parser class, the response check and the return
# statement cannot be seen here.
168 def find_player_url(verbose
, id="jNQXAC9IVRw"):
170 Extract the player.js URL, which can be passed to youtubedown.
171 Requests a random video (defaults to the first ever uploaded one, "Me at the
172 zoo". It shouldn't go away any time soon) and parses the returned Tag Soup.
173 Note that the URL we're looking for contains our locale, so specify it to
# Parser that remembers the src attribute of the script tag whose name
# attribute is "player_ias/base".
176 class FindPlayer(html
.parser
.HTMLParser
):
177 def __init__(self
, feed
):
179 self
.player_js
= None
181 def handle_starttag(self
, tag
, attrs
):
# NOTE(review): HTMLParser passes attrs as a list of (name, value) pairs, so
# attrs.get() only works if the line not shown above (182) turns it into a
# dict -- TODO confirm.
183 if tag
== "script" and attrs
.get("name") == "player_ias/base":
184 self
.player_js
= attrs
.get("src")
187 sys
.stderr
.write(f
'fetching embed page {id}\n')
# hl/gl pin the language and region of the requested page.
188 r
= requests
.get(f
"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
191 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
# The page text goes in through the constructor; presumably __init__ feeds it
# to the parser in a line not shown, so player_js is already set when it is
# read here -- TODO confirm.
194 player_js
= FindPlayer(r
.text
).player_js
196 sys
.stderr
.write(f
'player.js is {player_js}\n')
# Determine the sts value and the cipher algorithm for a player version by
# calling guess_cipher() from a cached copy of youtubedown in a subprocess
# (presumably perl, judging by the variable name and the embedded code).
#   cipher_id: passed to the subprocess as the second script argument
#   verbose:   verbosity level; the conditions guarding the log messages are
#              not visible in this listing
#   force:     when true, youtubedown is downloaded again regardless of the
#              cached copy's age
# NOTE(review): the listing has gaps (embedded numbers skip 201, 206, 209-210,
# 212-213, 215, 217-218, 220-221, 224, 228, 230, 233, 235 and 237-239), so the
# response check, the file write, the start of the subprocess command and the
# return statements cannot be seen here.
199 def ytdown_guess(cipher_id
, verbose
, force
):
# NOTE(review): the cached copy lives at a fixed path under /tmp -- confirm
# that this is acceptable on hosts shared with other users.
200 ytdown
= "/tmp/youtubedown.pm"
202 # update youtubedown once a week:
203 if force
or not os
.path
.isfile(ytdown
) or \
204 os
.stat(ytdown
).st_mtime
< time
.time()-7*24*60*60 or \
205 os
.stat(ytdown
).st_size
== 0: # if previous write failed
207 sys
.stderr
.write('downloading youtubedown\n')
208 r
= requests
.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
211 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
214 sys
.stderr
.write(f
'done (code {r.status_code})\n')
# Opening with 'wb' truncates the cached copy before the new content is
# written; the st_size == 0 test above exists to recover from a failed write.
216 with
open(ytdown
, 'wb') as f
:
# The script path and the cipher id are passed after "--"; stdin is closed
# and both output streams are captured.
219 perl
= subprocess
.run(
222 say guess_cipher($ARGV[1], 0, 0);
223 """, "--", ytdown
, cipher_id
225 stdin
=subprocess
.DEVNULL
,
226 stdout
=subprocess
.PIPE
,
227 stderr
=subprocess
.PIPE
# NOTE(review): only positive exit codes are treated as failure here; a
# negative returncode (child killed by a signal) passes this check.
229 if perl
.returncode
> 0:
231 sys
.stderr
.write(f
'FAILED guess_cipher (exit:{perl.returncode}):\n')
232 sys
.stderr
.write(perl
.stderr
.decode())
# The output is split at the first space into sts and algo; the two-name
# unpacking raises ValueError when the output contains no space.
234 sts
, algo
= perl
.stdout
.decode('ascii').strip().split(" ", 1)
236 sys
.stderr
.write(f
'sts, algo = {sts}, {algo}\n')
# Command-line entry point: the sub-command (pull, websub, cipher) and the
# -v/-vv, -1 and -f flags are detected by plain membership tests on sys.argv,
# so their order on the command line does not matter.
# NOTE(review): the listing has gaps (embedded numbers skip 244 and 251-252),
# so the branch that emits the usage text is not visible here.
240 if __name__
== '__main__':
# -vv maps to level 2, -v to level 1, no flag to level 0.
241 verbosity
= 2 if '-vv' in sys
.argv
else 1 if '-v' in sys
.argv
else 0
# -1 restricts processing to a single feed; otherwise -1 is passed as the
# limit (presumably meaning "no limit" -- the LIMIT clause is not shown).
242 limit
= 1 if '-1' in sys
.argv
else -1
243 force
= '-f' in sys
.argv
245 if 'pull' in sys
.argv
:
246 pull_subscriptions(verbosity
, force
, limit
)
247 elif 'websub' in sys
.argv
:
248 update_subscriptions(verbosity
, force
, limit
)
249 elif 'cipher' in sys
.argv
:
250 refresh_cipher(verbosity
, force
)
# Usage text; presumably emitted when no sub-command matched (the call that
# receives these adjacent string literals is not visible).
253 f
'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
254 f
' YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
255 f
' YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
256 f
'-f: force even if still up-to-date-ish\n'
257 f
'-v: report errors\n'
258 f
'-vv: report accessed feeds\n'
259 f
'-1: limit to one feed (for testing it works)\n')