import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this module is also used outside the flask appcontext, the access to g
# is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

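# Example (sketch): an error handler that surfaces the requests recorded
# above; registering it (and the status code / log message) is left to the
# frontend and is an assumption here, not part of this module.
def _example_log_api_requests(e):
    from flask import g, current_app
    for url, params, response_text in g.get('api_requests', []):
        current_app.logger.error("upstream request: %s %r -> %.200s", url, params, response_text)
    return "Internal Server Error", 500
# in the frontend, e.g.: app.register_error_handler(500, _example_log_api_requests)
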
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist.
    # Note: a plain ternary on the Element doesn't work, because an Element
    # without children is falsy; we must compare against None explicitly.
    channel_id = feed.find('yt:channelId',ns).text \
        if feed.find('yt:channelId',ns) is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_id = feed.find('yt:playlistId',ns).text \
        if feed.find('yt:playlistId',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

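# Example (sketch): fetching and parsing a feed in one go. The feed_type values
# mirror the query parameters of youtube.com/feeds/videos.xml ('channel_id',
# 'playlist_id' or 'user'); the channel id below is a placeholder.
def _example_fetch_and_parse():
    xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
    if xmldata is None:
        return None # fetch failed (non-2xx response)
    title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
    return videos
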
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            if from_webhook: current_app.logger.warning(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # get_video_info gives us the date the video was published to viewers,
            # so we prefer that. But since it only returns the date without time,
            # we still use the xmlfeed's date if both fall on the same day.
            published = dateutil.parser.parse(video['published'])
            length = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if from_webhook: current_app.logger.warning(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed, and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, published, crawled)
                VALUES (?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData']['adaptiveFormats']
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error

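# Example (sketch): consuming get_video_info's 5-tuple; the video id is the
# test id mentioned at unscramble() below.
def _example_get_video_info():
    url, stream_map, metadata, error, errdetail = get_video_info('UxxajLWwzqY')
    if error in ('malformed', 'player', 'livestream', 'exhausted'):
        return None # nothing directly playable; errdetail may say why
    return url # error is None or 'geolocked' here; url is the best muxed stream
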
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if not m: continue # skip tokens that aren't r/s/w operations
        op, ix = m.groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

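# Worked example (sketch): with algo "r s2 w3", the signature "ABCDEF" is
# reversed ("FEDCBA"), sliced from index 2 ("DCBA"), and then has positions
# 0 and 3 swapped ("ACBD"). The cipher dict mimics the parse_qs() output that
# get_video_info() passes in; the url and s values are placeholders.
def _example_unscramble():
    cipher = {'url': ['https://example.invalid/videoplayback?a=b'],
              's': ['ABCDEF'], 'sp': ['sig']}
    return unscramble(cipher, "r s2 w3")
    # -> 'https://example.invalid/videoplayback?a=b&sig=ACBD'
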
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta1['lengthSeconds']),
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

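# Example (sketch): a minimal demo app with two views on the same rule; all
# names here are hypothetical. When the first view cannot serve the request,
# it delegates to the next route registered for the same rule:
def _example_fallback_app():
    from flask import Flask
    app = Flask(__name__)

    @app.route('/watch')
    def watch_primary():
        # pretend this backend is unavailable and fall through:
        return fallback_route()

    @app.route('/watch')
    def watch_fallback():
        return "served by the fallback view"

    return app
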
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw (bytes) request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

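# Example (sketch): verifying an incoming websub notification against its
# X-Hub-Signature header ("sha1=<hexdigest>"); how the secret is stored and
# looked up is an assumption of the caller.
def _example_verify_websub_body(request, secret):
    expected = 'sha1=' + websub_body_hmac(secret, request.get_data())
    return hmac.compare_digest(request.headers.get('X-Hub-Signature', ''), expected)
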
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))