# app/common/common.py
10 from xml
. etree
import ElementTree
11 from configparser
import ConfigParser
12 from datetime
import datetime
, timezone
13 from urllib
. parse
import parse_qs
, urlparse
# Load the configuration; the YT_CONFIG environment variable overrides the
# default path. Fails hard at import time so misconfiguration is caught early.
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf = ConfigParser()
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(
    backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    """ drop expired cache entries, then re-arm a timer to do it again. """
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # NOTE(review): reconstructed; don't block interpreter exit
    t.start()
purge_cache(10*60)
# for debugging purposes, monkey patch requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, the g access
# is wrapped in a try-catch block.
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        # perform the real HTTP request first:
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            # record (url, params, body) on flask's per-request g object:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
def fetch_xml(feed_type, feed_id):
    """
    fetch the atom feed for a channel/playlist from youtube.
    feed_type: 'channel_id' or 'playlist_id' (used as the query key).
    Returns the raw XML text, or None on a non-2xx response.
    """
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    """
    Parse a youtube atom feed (or websub POST body) into
    (title, author, videos, channel_id, playlist_id).
    For tombstone feeds (deleted/unlisted videos) a single pseudo-video
    dict with {'deleted': True} is returned and the other fields are None.
    """
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Fix: truth-testing an Element is deprecated and wrong for childless
    # elements (a childless <at:deleted-entry/> is falsy) -- compare to None.
    deleted = feed.find('at:deleted-entry', ns)
    if deleted is not None:
        (_, _, vid) = deleted.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    # Fix: only swallow AttributeError (find() returned None -> no .text),
    # instead of a bare except that hides real errors.
    try: channel_id = feed.find('yt:channelId', ns).text
    except AttributeError: channel_id = None
    # for pullsub: if this exists, we're looking at a playlist:
    try: playlist_id = feed.find('yt:playlistId', ns).text
    except AttributeError: playlist_id = None

    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id
100 def update_channel ( db
, xmldata
, from_webhook
= False ):
101 if not xmldata
: return False
103 # Note: websub does not return global author, hence taking from first video
104 title
, author
, videos
, channel
, playlist
= parse_xml ( xmldata
)
107 for i
, video
in enumerate ( videos
):
108 if video
. get ( 'deleted' ):
109 # Note: Deletion events are not just fired for actual deletions,
110 # but also for unlisting videos and livestreams that just ended
111 # (even postLiveDVR ones). Hence, we don't follow it.
112 flask_logger ( f
"ignoring deleted/unlisted/ended video/stream {video['video_id']}" )
115 c
. execute ( "SELECT 1 FROM videos WHERE id=?" ,( video
[ 'video_id' ],))
116 new_video
= len ( c
. fetchall ()) < 1
118 flask_logger ( f
"new video {video['video_id']}" )
119 _
, _
, meta
, _
, _
= get_video_info ( video
[ 'video_id' ])
120 # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
121 # video gets uploaded as unlisted on day A and set to public on day B;
122 # the webhook is sent on day B, but 'published' says A. The video
123 # therefore looks like it's just an update to an older video).
124 # g_v_i gives is the date the video was published to viewers, so we
125 # prefer that. But since g_v_i only returns the date without time,
126 # we still use xmlfeed's date if it's the same date.
127 published
= dateutil
. parser
. parse ( video
[ 'published' ])
131 meta
= video_metadata ( meta
)
132 published2
= dateutil
. parser
. parse ( meta
[ 'published' ])
133 flask_logger ( f
"published {published} / {published2} " )
134 if published
< published2
: # g_v_i date is more accurate:
135 published
= published2
136 length
= meta
[ 'length' ]
137 livestream
= meta
[ 'livestream' ]
139 now
= datetime
. now ( timezone
. utc
)
141 # we pretend that all videos uploaded this week were uploaded just
142 # now, so the user sees it at the top of the feed, and it doesn't
143 # get inserted somewhere further down.
144 if ( now
- published
). days
< 7 :
146 else : #, it's just an update to an older video.
147 timestamp
= published
150 INSERT OR IGNORE INTO videos
151 (id, channel_id, title, length, livestream, published, crawled)
152 VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
163 # update video title (everything else can't change)
165 UPDATE OR IGNORE videos
173 # for channels, this is obviously always the same, but playlists can
174 # consist of videos from different channels:
175 if i
== 0 or playlist
:
177 INSERT OR REPLACE INTO channels (id, name)
179 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
181 # keep track of which videos are in a playlist, so we can show the user
182 # why a video is in their feed:
185 INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
187 """ , ( video
[ 'video_id' ], playlist
))
189 if playlist
and not from_webhook
: # Note: playlists can't get updated via websub
191 INSERT OR REPLACE INTO playlists (id, name, author)
193 """ , ( playlist
, title
, channel
))
195 INSERT OR REPLACE INTO channels (id, name)
197 """ , ( channel
, author
))
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response,
    error-type/message.
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",  # NOTE(review): request params partially reconstructed
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no muxed urls with this el value; retry

        # descramble the signature of every stream that carries one:
        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None

    # every el value failed with UNPLAYABLE (or no formats):
    return None, None, metadata, 'exhausted', player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """
    Undo youtube's signature scrambling and rebuild the stream url.
    `algo` is a space-separated op list: 'r' reverses the signature,
    'sN' drops the first N chars, 'wN' swaps positions 0 and N.
    If the cipher already carries a plain 'sig', it is used unchanged.
    """
    sig_chars = list(cipher['s'][0])
    for step in algo.split():
        op, num = re.match(r"([rsw])(\d+)?", step).groups()
        idx = int(num) % len(sig_chars) if num else 0
        if op == 'r':
            sig_chars = sig_chars[::-1]
        elif op == 's':
            sig_chars = sig_chars[idx:]
        elif op == 'w':
            sig_chars[0], sig_chars[idx] = sig_chars[idx], sig_chars[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(sig_chars)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def video_metadata(metadata):
    """
    Flatten the interesting fields of a player_response object
    (videoDetails + microformat) into a single dict.
    """
    details = metadata['videoDetails']
    micro = metadata['microformat']['playerMicroformatRenderer']

    # livestreams/premieres carry an exact start time in liveBroadcastDetails;
    # plain uploads only expose a date, so midnight (Z) is assumed.
    published_at = micro.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{micro['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': details['title'],
        'author': details['author'],
        'channel_id': details['channelId'],
        'published': published_at,
        'views': int(details['viewCount']),
        'length': int(micro['lengthSeconds']) or int(details['lengthSeconds']),
        'livestream': details['isLiveContent'],
    }
301 def store_video_metadata ( video_id
):
302 # check if we know about it, and if not, fetch and store video metadata
303 with sqlite3
. connect ( cf
[ 'global' ][ 'database' ]) as conn
:
305 c
. execute ( "SELECT 1 from videos where id = ?" , ( video_id
,))
306 new_video
= len ( c
. fetchall ()) < 1
308 _
, _
, meta
, _
, _
= get_video_info ( video_id
)
310 meta
= video_metadata ( meta
)
312 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
313 VALUES (?, ?, ?, datetime(?), datetime(?))
322 INSERT OR REPLACE INTO channels (id, name)
324 """ , ( meta
[ 'channel_id' ], meta
[ 'author' ]))
from werkzeug.exceptions import NotFound

class NoFallbackException(NotFound):
    """404 raised by fallback_route() when no further matching route exists."""
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    message = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    mac = hmac.new(key.encode('ascii'), message, hashlib.sha1)
    return mac.hexdigest()
def websub_body_hmac(key, body):
    """ sha1 hmac over the raw (bytes) request body, for websub verification """
    mac = hmac.new(key.encode('ascii'), msg=body, digestmod=hashlib.sha1)
    return mac.hexdigest()
def flask_logger(msg, level="warning"):
    """
    Log `msg` via the flask app logger; silently a no-op outside an
    app context. `level` is a logger method name: 'debug', 'info',
    'warning', 'error' or 'critical'.
    """
    try:
        from flask import current_app
        # Fix: Logger.log() requires a *numeric* level and raises TypeError
        # for strings like the "warning" default -- dispatch to the named
        # convenience method instead.
        getattr(current_app.logger, level)(msg)
    except RuntimeError:
        pass # not within a flask app context (e.g. utils.py)
def pp(*args):
    """ debugging helper: pretty-print all args to stderr (utf-8 safe). """
    from pprint import pprint
    import sys, codecs
    # wrap stderr's raw buffer so non-ascii values can't raise on print:
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))