]> git.gir.st - subscriptionfeed.git/blob - app/common/utils.py
fix writing youtubedown (unicode chars)
[subscriptionfeed.git] / app / common / utils.py
1 #!/bin/sh
2 ''':'
3 . "`dirname "$0"`/../../venv/bin/activate"
4 exec python "$0" "$@"
5 ':'''
6
import os
import re
import sys
import time
import secrets
import sqlite3
import subprocess
import html.parser

import requests
15
16 from common import *
17
# Maps a subscription type to the query-string parameter name that youtube's
# RSS endpoint (/xml/feeds/videos.xml?<param>=<id>) expects for that type.
feed_param = {
    'channel': 'channel_id',
    'playlist': 'playlist_id',
}
22
def pull_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Crawls youtube channels' RSS feeds and stores found videos in the database.

    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: fetch all known channels. otherwise only those not crawled in 24h
    limit: limit number of processed feeds (-1: no limit)
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # IFNULL(...,0): never-crawled feeds sort first; in SQLite an integer
        # always compares less than a text value, so 0 < datetime(...) holds.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN crawler AS c
                ON s.channel_id = c.channel_id
            WHERE ? OR IFNULL(crawled_at,0) < datetime('now', '-1 day')
            ORDER BY crawled_at
            LIMIT ?
        """, (force_all, limit))
        results = c.fetchall()

        if verbose >= 2 and not results:
            sys.stderr.write('no feeds to update.\n')  # was a placeholder-less f-string

        for i, (feed_id, feed_type) in enumerate(results):
            if i:  # rate-limit: pause one minute between feeds
                time.sleep(60)
            pull_feed(feed_id, feed_type, conn, verbose)
48
def pull_feed(feed_id, feed_type, conn, verbose):
    """
    Fetches a single channel/playlist RSS feed and stores its videos in the
    database via update_channel().

    Returns True on success, False if fetching or storing failed.
    On store failure, the raw feed is appended to /tmp/pull-subscriptions.err
    for later analysis.
    """
    c = conn.cursor()

    if verbose >= 2:
        sys.stderr.write(f'fetching {feed_id}\n')

    xmlfeed = fetch_xml(feed_param[feed_type], feed_id)
    if not xmlfeed:
        if verbose:
            sys.stderr.write(f'FETCH FAILED: {feed_id}\n')
        return False

    try:
        update_channel(conn, xmlfeed)
    except Exception:  # was bare 'except:', which also swallowed SystemExit/KeyboardInterrupt
        if verbose:
            sys.stderr.write(f'STORE FAILED: {feed_id}\n')
        # writing failed, so we store the feed in a file for later analysis.
        with open('/tmp/pull-subscriptions.err', 'ab') as f:
            f.write(f"<!-- {time.ctime()} ({int(time.time())}) -->\n"
                    .encode('ascii'))
            f.write(xmlfeed + b"\n")
        return False

    # update crawled_at timestamp (assumes the crawler table defaults
    # crawled_at to the current time on insert -- schema not visible here):
    c.execute("""
        INSERT OR REPLACE INTO crawler (channel_id)
        VALUES (?)
    """, (feed_id,))

    conn.commit()
    return True
81
82
def update_subscriptions(verbose=1, force_all=False, limit=-1):
    """
    Refreshes the websub (pubsubhubbub) subscription requests for youtube feeds.

    verbose: 0: completely silent; 1: warn on errors; 2: log all accessed feeds
    force_all: renew all channels, not only those expiring within 12 hours
    limit: limit number of processed feeds (-1: no limit)
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # Parentheses are required: in SQL, AND binds tighter than OR, so the
        # original un-parenthesized WHERE let force_all bypass the
        # type='channel' filter and request websub for playlists, which (per
        # the comment below) don't support websub at all.
        c.execute("""
            SELECT DISTINCT s.channel_id, type
            FROM subscriptions AS s LEFT JOIN websub AS w
                ON s.channel_id = w.channel_id
            WHERE (? OR IFNULL(subscribed_until,0) < datetime('now','+12 hours'))
                AND type = 'channel' -- playlists don't support websub
            ORDER BY subscribed_until
            LIMIT ?
        """, (force_all, limit))
        results = c.fetchall()

        if verbose >= 2 and not results:
            sys.stderr.write('no feeds to update.\n')

        for i, (feed_id, feed_type) in enumerate(results):
            if i:  # rate-limit: pause one minute between feeds
                time.sleep(60)
            update_feed(feed_id, feed_type, verbose)
108
def update_feed(feed_id, feed_type, verbose):
    """
    Sends one (re)subscription request to the pubsubhubbub hub for the given
    feed. The callback URL embeds a version tag, timestamp, nonce and HMAC
    signature so the websub endpoint can later authenticate the hub's calls.

    Returns True if the hub accepted the request, False otherwise.
    """
    webhook = cf['websub']['public_uri']
    lease = cf['websub']['lease']
    hmackey = cf['websub']['hmac_key']

    if verbose >= 2:
        sys.stderr.write(f'updating {feed_id}\n')

    version = "v1"
    timestamp = int(time.time())
    nonce = secrets.token_urlsafe(16)
    sig = websub_url_hmac(hmackey, feed_id, timestamp, nonce)

    callback = (f"{webhook}/websub/{version}/{timestamp}/"
                f"{nonce}/{feed_id}/{sig}")
    topic = ("https://www.youtube.com/xml/feeds/videos.xml"
             f"?{feed_param[feed_type]}={feed_id}")

    response = requests.post("https://pubsubhubbub.appspot.com/subscribe", {
        "hub.callback": callback,
        "hub.topic": topic,
        "hub.verify": "async",
        "hub.mode": "subscribe",
        "hub.lease_seconds": lease,
        "hub.secret": hmackey,
    })
    if response.ok:
        return True

    if verbose:
        sys.stderr.write(f'FAILED {feed_id}: {response.text}\n')
    return False
136
137
def refresh_cipher(verbose=1, force=False):
    """
    Checks whether youtube's player.js URL changed and, if so, re-extracts the
    signature timestamp (sts) and decryption algorithm via ytdown_guess() and
    stores them in the database.

    force: refresh even if the player URL is unchanged
    Returns True on success or nothing-to-do, False on failure.
    Note: relies on 're' being in scope (it was used but never imported in
    this file; presumably re-exported by 'from common import *').
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT url FROM cipher")
        (player_url,) = c.fetchone()

        new_url = find_player_url(verbose)
        if not new_url:
            if verbose:
                sys.stderr.write('FAILED to get player url!\n')
            return False

        if player_url == new_url:
            if verbose >= 2:
                sys.stderr.write('player url unchanged.\n')
            if not force:
                return True

        # Guard the match: the original called .groups() directly and raised
        # AttributeError if youtube ever changes the player URL format.
        match = re.match(r"/s/player/(.*?)\.js", new_url)
        if not match:
            if verbose:
                sys.stderr.write(f'FAILED to parse player url {new_url}!\n')
            return False
        (cipher_id,) = match.groups()

        sts, algo = ytdown_guess(cipher_id, verbose, force)
        if sts is None or algo is None:
            # extraction failed; keep the old cipher instead of storing NULLs
            return False

        c.execute("""
            INSERT OR REPLACE INTO cipher (rowid, url, sts, algorithm)
            VALUES (0,?,?,?)
        """, (new_url, sts, algo))
        return True
163
def find_player_url(verbose, id="jNQXAC9IVRw"):
    """
    Extract the player.js URL, which can be passed to youtubedown.
    Requests a random video (defaults to the first ever uploaded one, "Me at
    the zoo". It shouldn't go away any time soon) and parses the returned Tag
    Soup. Note that the URL we're looking for contains our locale, so specify
    it to avoid it changing.

    Returns the player.js path, or None if the page could not be fetched or
    the script tag was not found.
    """
    class PlayerJsExtractor(html.parser.HTMLParser):
        # Scans for <script name="player_ias/base" src="..."> and remembers
        # the src attribute; feeds the markup immediately on construction.
        def __init__(self, markup):
            super().__init__()
            self.player_js = None
            super().feed(markup)

        def handle_starttag(self, tag, attrs):
            if tag != "script":
                return
            attrs = dict(attrs)
            if attrs.get("name") == "player_ias/base":
                self.player_js = attrs.get("src")

    if verbose >= 2:
        sys.stderr.write(f'fetching embed page {id}\n')
    r = requests.get(f"https://www.youtube-nocookie.com/embed/{id}?hl=en&gl=US")
    if not r.ok:
        if verbose:
            sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
        return None

    player_js = PlayerJsExtractor(r.text).player_js
    if verbose >= 2:
        sys.stderr.write(f'player.js is {player_js}\n')
    return player_js
194
def ytdown_guess(cipher_id, verbose, force):
    """
    Obtains the signature timestamp (sts) and signature decryption algorithm
    for a given player.js version by delegating to jwz's youtubedown perl
    script, which is downloaded to /tmp and refreshed weekly.

    Returns (sts, algo) as strings, or (None, None) on failure.
    """
    ytdown = "/tmp/youtubedown.pm"

    # update youtubedown once a week:
    if force or not os.path.isfile(ytdown) or \
            os.stat(ytdown).st_mtime < time.time()-7*24*60*60 or \
            os.stat(ytdown).st_size == 0:  # if previous write failed
        if verbose >= 2:
            sys.stderr.write('downloading youtubedown\n')
        r = requests.get("https://www.jwz.org/hacks/youtubedown")  # UA sniffing!
        if not r.ok:
            if verbose:
                sys.stderr.write(f'FAILED {r.status_code}: {r.text[:128]}\n')
            return None, None
        elif verbose >= 2:
            sys.stderr.write(f'done (code {r.status_code})\n')

        # youtubedown unconditionally calls "main(); exit 0;", which breaks
        # using it as a module: strip those two lines and end with a true
        # value ("1;") as perl modules must.
        contents = "\n".join(r.text.splitlines()[:-2] + ["1;"])

        # youtubedown contains non-ascii characters, so write it as utf-8:
        with open(ytdown, 'w', encoding='utf-8') as f:
            f.write(contents)

    perl = subprocess.run(
        ["perl", "-wE", """
        require $ARGV[0];
        say guess_cipher($ARGV[1], 0, 0);
        """, "--", ytdown, cipher_id
        ],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL
    )
    output = perl.stdout.decode('ascii').strip()
    # The original unpacked output.split(" ", 1) unconditionally, which
    # raised ValueError on empty stdout whenever guess_cipher died
    # (e.g. unsupported player version). Fail soft instead:
    if perl.returncode != 0 or " " not in output:
        if verbose:
            sys.stderr.write(f'guess_cipher FAILED (exit:{perl.returncode})\n')
        return None, None
    sts, algo = output.split(" ", 1)
    if verbose >= 2:
        sys.stderr.write(f'sts, algo = {sts}, {algo} (exit:{perl.returncode})\n')
    return sts, algo
233
234
if __name__ == '__main__':
    # Crude argv handling: flags may appear anywhere on the command line;
    # the first recognized command word selects the action.
    verbosity = 2 if '-vv' in sys.argv else 1 if '-v' in sys.argv else 0
    limit = 1 if '-1' in sys.argv else -1  # '-1': process a single feed only
    force = '-f' in sys.argv

    if 'pull' in sys.argv:
        pull_subscriptions(verbosity, force, limit)
    elif 'websub' in sys.argv:
        update_subscriptions(verbosity, force, limit)
    elif 'cipher' in sys.argv:
        refresh_cipher(verbosity, force)
    else:
        sys.stderr.write(
            f'Usage: YT_CONFIG=... {sys.argv[0]} pull [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} websub [-f] [-1] [-v|-vv]\n'
            f'       YT_CONFIG=... {sys.argv[0]} cipher [-f] [-v|-vv]\n'
            f'-f: force even if still up-to-date-ish\n'
            f'-v: report errors\n'
            f'-vv: report accessed feeds\n'
            f'-1: limit to one feed (for testing it works)\n')
        sys.exit(1)
Imprint / Impressum