import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session so that each
# requests-request (url, params, response body) is stored in the flask
# request's g object. A flask error_handler can then include the request data
# in the error log (a sketch of such a handler follows below the class).
# Since this module is also used outside the flask appcontext, the access to
# g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

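# A minimal sketch of the error handler mentioned above (illustrative only;
# nothing registers it automatically, and the function/handler names are not
# part of this module's interface). The frontend would attach it to its app.
def _register_api_request_logging(app):
    @app.errorhandler(Exception)
    def log_api_requests(exc):
        # g.get() returns [] if no upstream request was recorded for this request
        for url, params, body in g.get('api_requests', []):
            app.logger.error(f"upstream request: {url} {params}\n{body[:1000]}")
        return "Internal Server Error", 500
    return log_api_requests
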
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy even when they were found, so
    # compare find() results against None instead of relying on truthiness.
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None:
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id.
    # for playlists: this is who created the playlist.
    channel_id_el = feed.find('yt:channelId', ns)
    channel_id = channel_id_el.text if channel_id_el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_id_el = feed.find('yt:playlistId', ns)
    playlist_id = playlist_id_el.text if playlist_id_el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

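# Usage sketch (illustrative only; never called from this module): fetching
# and parsing a channel feed. The placeholder stands in for a real UC channel id.
def _example_list_channel_videos(channel_id="UCxxxxxxxxxxxxxxxxxxxxxx"):
    xmldata = fetch_xml("channel_id", channel_id)
    if xmldata is None:
        return []
    title, author, videos, channel, playlist = parse_xml(xmldata)
    return [v['video_id'] for v in videos if not v.get('deleted')]
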
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return a global author, hence it is taken from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        c.execute("SELECT 1 FROM videos WHERE id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            if from_webhook: current_app.logger.warning(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # get_video_info() gives us the date the video was published to viewers,
            # so we prefer that. But since it only returns the date without a time,
            # we still use the xmlfeed's timestamp when both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if from_webhook: current_app.logger.warning(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # We pretend that all videos uploaded within the last week were
            # uploaded just now, so the user sees them at the top of the feed
            # instead of having them inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next 'el' value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for i, v in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for i, v in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

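# Usage sketch (illustrative only; never called from this module): unpacking
# the 5-tuple returned by get_video_info(). Error handling is simplified.
def _example_best_muxed_url(video_id):
    url, stream_map, metadata, error, errdetail = get_video_info(video_id)
    if error in ('malformed', 'player', 'exhausted'):
        return None # nothing playable; errdetail holds the reason, if any
    if error == 'livestream':
        return None # livestreams return metadata, but no direct muxed url
    return url # best-quality muxed stream ('geolocked' may still need the proxy)
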
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if not m: continue # skip tokens that aren't valid operations
        op, ix = m.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

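# Worked example (values are made up for illustration; real ciphers come from
# the player response and real algos from the scraped player code): the algo
# string is a space separated list of operations -- r = reverse, sN = drop the
# first N characters, wN = swap positions 0 and N.
def _example_unscramble():
    cipher = { # parse_qs-style dict: every value is a list
        'url': ["https://example.invalid/videoplayback?id=x"],
        'sp': ["sig"],
        's': ["abcdef"],
    }
    # "abcdef" -> reverse -> "fedcba" -> swap 0,2 -> "defcba" -> drop 1 -> "efcba"
    return unscramble(cipher, "r w2 s1") # ...&sig=efcba
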
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and 'published' will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        # sometimes videoDetails.lengthSeconds == "0"; fall back to the microformat value:
        'length': int(meta1['lengthSeconds']) or int(meta2['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we already know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 FROM videos WHERE id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    Finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass along all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

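# Usage sketch (illustrative only; route and helper names are made up): two
# view functions registered for the same rule. When the first cannot handle
# the request, it calls fallback_route(), which dispatches to the next
# endpoint registered for that rule (or raises NoFallbackException -> 404).
def _example_register_fallback_routes(app):
    @app.route('/watch', endpoint='watch_primary')
    def watch_primary():
        want_fallback = True # stand-in for a real condition
        if want_fallback:
            return fallback_route()
        return "handled by primary"

    @app.route('/watch', endpoint='watch_secondary')
    def watch_secondary():
        return "handled by fallback"
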
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

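# Verification sketch (illustrative only; the parameter handling is an
# assumption, not this project's websub handler): a callback endpoint would
# recompute both hmacs and compare them against the signature embedded in the
# callback url and the hub's "X-Hub-Signature: sha1=<hexdigest>" header.
def _example_verify_websub(key, feed_id, timestamp, nonce, url_sig, body, header_sig):
    ok_url = hmac.compare_digest(websub_url_hmac(key, feed_id, timestamp, nonce), url_sig)
    ok_body = hmac.compare_digest(websub_body_hmac(key, body), header_sig.split('=', 1)[-1])
    return ok_url and ok_body
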
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))