]>
git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
3 . "`dirname "$0"`/../../venv/bin/activate"
19 'channel': 'channel_id',
20 'playlist': 'playlist_id',
23 def pull_subscriptions(verbose
=1, force_all
=False, limit
=-1):
25 Crawls youtube channels' RSS feeds and stores found videos in the database.
26 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
27 force_all: fetch all known channels. otherwise only those not crawled in 24h
28 limit: limit number of processed feeds
30 with sqlite3
.connect(cf
['global']['database']) as conn
:
33 SELECT DISTINCT s.channel_id, type
34 FROM subscriptions AS s LEFT JOIN crawler AS c
35 ON s.channel_id = c.channel_id
36 WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
39 """, (force_all
,limit
))
40 results
= c
.fetchall()
42 if verbose
>= 2 and not len(results
):
43 sys
.stderr
.write(f
'no feeds to update.\n')
45 for i
,(feed_id
, feed_type
) in enumerate(results
):
47 pull_feed(feed_id
, feed_type
, conn
, verbose
)
49 def pull_feed(feed_id
, feed_type
, conn
, verbose
):
53 sys
.stderr
.write(f
'fetching {feed_id}\n')
55 xmlfeed
= fetch_xml(feed_param
[feed_type
], feed_id
)
58 sys
.stderr
.write(f
'FETCH FAILED: {feed_id}\n')
62 update_channel(conn
, xmlfeed
)
65 sys
.stderr
.write(f
'STORE FAILED: {feed_id}\n')
66 # writing failed, so we store the feed in a file for later analysis.
67 with
open('/tmp/pull-subscriptions.err', 'ab') as f
:
68 f
.write(f
"<!-- {time.ctime()} ({int(time.time())}) -->\n"
70 f
.write(xmlfeed
+ b
"\n")
73 # update crawled_at timestamp:
75 INSERT OR REPLACE INTO crawler (channel_id)
83 def update_subscriptions(verbose
=1, force_all
=False, limit
=-1):
85 Refreshes the websub (pubsubhubhub) subscription requests for youtube feeds.
86 verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
87 limit: limit number of processed feeds
89 with sqlite3
.connect(cf
['global']['database']) as conn
:
92 SELECT DISTINCT s.channel_id, type
93 FROM subscriptions AS s LEFT JOIN websub AS w
94 ON s.channel_id = w.channel_id
95 WHERE ? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours')
96 AND type = 'channel' -- playlists don't support websub
97 ORDER BY subscribed_until
99 """, (force_all
,limit
))
100 results
= c
.fetchall()
102 if verbose
>= 2 and not len(results
):
103 sys
.stderr
.write(f
'no feeds to update.\n')
105 for i
,(feed_id
, feed_type
) in enumerate(results
):
107 update_feed(feed_id
, feed_type
, verbose
)
109 def update_feed(feed_id
, feed_type
, verbose
):
110 webhook
= cf
['websub']['public_uri']
111 lease
= cf
['websub']['lease']
112 hmackey
= cf
['websub']['hmac_key']
115 sys
.stderr
.write(f
'updating {feed_id}\n')
117 version
, timestamp
= "v1", int(time
.time())
118 nonce
= secrets
.token_urlsafe(16)
119 sig
= websub_url_hmac(hmackey
, feed_id
, timestamp
, nonce
)
120 r
= requests
.post("https://pubsubhubbub.appspot.com/subscribe", {
121 "hub.callback": f
"{webhook}/websub/{version}/{timestamp}/" + \
122 f
"{nonce}/{feed_id}/{sig}",
123 "hub.topic": f
"https://www.youtube.com/xml/feeds/videos.xml" + \
124 f
"?{feed_param[feed_type]}={feed_id}",
125 "hub.verify": "async",
126 "hub.mode": "subscribe",
127 "hub.lease_seconds": lease
,
128 "hub.secret": hmackey
,
132 sys
.stderr
.write(f
'FAILED {feed_id}: {r.text}\n')
138 def refresh_cipher(verbose
=1, force
=False):
139 with sqlite3
.connect(cf
['global']['database']) as conn
:
141 c
.execute("SELECT url FROM cipher")
142 (player_url
,) = c
.fetchone()
144 new_url
= find_player_url(verbose
)
147 sys
.stderr
.write(f
'FAILED to get player url!\n')
150 if player_url
== new_url
:
152 sys
.stderr
.write(f
'player url unchanged.\n')
156 (cipher_id
,) = re
.match(r
"/s/player/(.*?)\.js", new_url
).groups()
157 sts
, algo
= ytdown_guess(cipher_id
, verbose
, force
)
158 if not sts
or not algo
:
162 INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
164 """, (new_url
, sts
, algo
))
168 def find_player_url(verbose
, id="jNQXAC9IVRw"):
170 Extract the player.js URL, which can be passed to youtubedown.
171 Requests a random video (defaults to the first ever uploaded one, "Me at the
172 zoo". It shouldn't go away any time soon) and parses the returned Tag Soup.
173 Note that the URL we're looking for contains our locale, so specify it to
176 class FindPlayer(html
.parser
.HTMLParser
):
177 def __init__(self
, feed
):
179 self
.player_js
= None
181 def handle_starttag(self
, tag
, attrs
):
183 if tag
== "script" and attrs
.get("name") == "player_ias/base":
184 self
.player_js
= attrs
.get("src")
187 sys
.stderr
.write(f
'fetching embed page {id}\n')
188 r
= requests
.get(f
"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
191 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
194 player_js
= FindPlayer(r
.text
).player_js
196 sys
.stderr
.write(f
'player.js is {player_js}\n')
199 def ytdown_guess(cipher_id
, verbose
, force
):
200 ytdown
= "/tmp/youtubedown.pm"
202 # update youtubedown once a week:
203 if force
or not os
.path
.isfile(ytdown
) or \
204 os
.stat(ytdown
).st_mtime
< time
.time()-7*24*60*60 or \
205 os
.stat(ytdown
).st_size
== 0: # if previous write failed
207 sys
.stderr
.write('downloading youtubedown\n')
208 r
= requests
.get("https://www.jwz.org/hacks/youtubedown") # UA sniffing!
211 sys
.stderr
.write(f
'FAILED {r.status_code}: {r.text[:128]}\n')
214 sys
.stderr
.write(f
'done (code {r.status_code})\n')
216 # youtubedown unconditionally calls "main(); exit 0;", which breaks
217 # using it as a module:
218 contents
= "\n".join(r
.text
.splitlines()[:-2] + ["1;"])
220 with
open(ytdown
, 'w', encoding
='utf-8') as f
:
223 perl
= subprocess
.run(
226 say guess_cipher($ARGV[1], 0, 0);
227 """, "--", ytdown
, cipher_id
229 stdin
=subprocess
.DEVNULL
,
230 stdout
=subprocess
.PIPE
,
231 stderr
=subprocess
.PIPE
233 if perl
.returncode
> 0:
235 sys
.stderr
.write(f
'FAILED guess_cipher (exit:{perl.returncode}):\n')
236 sys
.stderr
.write(perl
.stderr
.decode())
238 sts
, algo
= perl
.stdout
.decode('ascii').strip().split(" ", 1)
240 sys
.stderr
.write(f
'sts, algo = {sts}, {algo}\n')
244 if __name__
== '__main__':
245 verbosity
= 2 if '-vv' in sys
.argv
else 1 if '-v' in sys
.argv
else 0
246 limit
= 1 if '-1' in sys
.argv
else -1
247 force
= '-f' in sys
.argv
249 if 'pull' in sys
.argv
:
250 pull_subscriptions(verbosity
, force
, limit
)
251 elif 'websub' in sys
.argv
:
252 update_subscriptions(verbosity
, force
, limit
)
253 elif 'cipher' in sys
.argv
:
254 refresh_cipher(verbosity
, force
)
257 f
'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
258 f
' YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
259 f
' YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
260 f
'-f: force even if still up-to-date-ish\n'
261 f
'-v: report errors\n'
262 f
'-vv: report accessed feeds\n'
263 f
'-1: limit to one feed (for testing it works)\n')