import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). We can
# then use a flask error_handler to include the request data in the error log.
# Since this is also called from outside the flask appcontext, the append is
# wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
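
# A minimal sketch (not part of the original module) of the error_handler idea
# described above; 'app' and the 500 status code are assumptions for illustration:
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"API request: {url} {params} -> {response_text[:200]}")
#       return "internal server error", 500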

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: find() results must be compared against None; an Element without
    # children is falsy, which is also why a plain ternary on find() fails.
    if feed.find('at:deleted-entry', ns) is not None:
        (_, _, vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId', ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId', ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

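# Return-shape sketch: parse_xml yields (title, author, videos, channel_id,
# playlist_id). For a websub tombstone, only 'videos' carries data, e.g.:
#
#   title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
#   if videos and videos[0].get('deleted'):
#       ...  # the video was removed upstream; the other fields are None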
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, so we take it from the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # If the updated and published times are near-identical, we assume the
        # video is new. Checking that it was posted this week is necessary
        # during xmlfeed pulling. Note: total_seconds(), not .seconds, which
        # only holds the sub-day remainder and would misclassify older videos.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # it might just be an update to an older video, or a previously unlisted one.
            # First, assume it's an older video (correct when pulling xmlfeeds).
            timestamp = published
            # Then, check if we don't know about it yet, and if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = video_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

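# Usage sketch: pulling a feed and upserting it, assuming 'db' is an sqlite3
# connection with the videos/channels/playlists schema used above:
#
#   xmldata = fetch_xml("channel_id", channel_id)  # or ("playlist_id", ...)
#   if xmldata:
#       update_channel(db, xmldata)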
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params['player_response'][0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

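# Caller sketch for the 5-tuple; note that 'geolocked' still comes with a
# usable url (recoverable through the proxy module), unlike the other types:
#
#   url, stream_map, metadata, error, detail = get_video_info(video_id)
#   if error == 'geolocked' and proxy_enabled:  # 'proxy_enabled' is hypothetical
#       error = None
#   if error:
#       ...  # 'malformed'/'player'/'exhausted': detail holds the reason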
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        match = re.match(r"([rsw])(\d+)?", c)
        if not match: continue # skip unknown operations instead of crashing
        op, ix = match.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

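# Worked example (made-up values): with algo "r w2 s1", the signature "abcdef"
# is reversed to "fedcba", chars 0 and 2 are swapped ("defcba"), and the first
# char is sliced off ("efcba"):
#
#   unscramble({'s': ['abcdef'], 'url': ['https://host/videoplayback?id=x']},
#              "r w2 s1")
#   # -> 'https://host/videoplayback?id=x&signature=efcba'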
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta1['lengthSeconds']),
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # track how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

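# Usage sketch (hypothetical view functions): two handlers registered on the
# same rule; the first defers to the second when it can't serve the request.
#
#   @bp_a.route('/channel/<cid>')
#   def channel_a(cid):
#       if not have_data(cid):  # 'have_data' is a hypothetical check
#           return fallback_route(cid)
#       ...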
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw request body (bytes) """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

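# Verification sketch for an incoming websub POST, assuming the subscribe step
# embedded timestamp/nonce in the callback url; compare digests in constant
# time ('sig_from_header' would come from X-Hub-Signature, sans 'sha1='):
#
#   expected = websub_body_hmac(key, request.get_data())
#   if not hmac.compare_digest(expected, sig_from_header):
#       return "", 403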
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))