app/common/common.py

   1 import os
   2 import re
   3 import json
   4 import base64
   5 import sqlite3
   6 import requests
   7 import hmac, hashlib
   8 import requests_cache
   9 import dateutil.parser
  10 from xml.etree import ElementTree
  11 from configparser import ConfigParser
  12 from datetime import datetime, timezone
  13 from urllib.parse import parse_qs, urlparse
  14
  15 cf = ConfigParser()
  16 config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
  17 cf.read(config_filename)
  18 if not 'global' in cf: # todo: full config check
  19     raise Exception("Configuration file not found or empty")
  20
  21 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start.  TODO: exipre when video is livestream/premiere/etc
  22 requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))
  23
  24 # Note: requests-cache doesn't use redis expiry, so we need this in all backends:
  25 # https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
  26 # TODO: only run for long-running processes, i.e. the frontend
  27 from threading import Timer
  28 def purge_cache(sec):
  29     requests_cache.remove_expired_responses()
  30     t = Timer(sec, purge_cache, args=(sec,))
  31     t.setDaemon(True)
  32     t.start()
  33 purge_cache(10*60)
  34
  35 # for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
  36 # since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
  37 from flask import g
  38 import requests
  39 from requests import Session as OriginalSession
  40 class _NSASession(OriginalSession):
  41     def request(self, method, url, params=None, data=None, **kwargs):
  42         response = super(_NSASession, self).request(
  43             method, url, params, data, **kwargs
  44             )
  45         try:
  46             if 'api_requests' not in g:
  47                 g.api_requests = []
  48             g.api_requests.append((url, params, response.text))
  49         except RuntimeError: pass # not within flask (e.g. utils.py)
  50         return response
  51 requests.Session = requests.sessions.Session = _NSASession
  52
  53 def fetch_xml(feed_type, feed_id):
  54     # TODO: handle requests.exceptions.ConnectionError
  55     r = requests.get("https://www.youtube.com/feeds/videos.xml", {
  56         feed_type: feed_id,
  57     })
  58     if not r.ok:
  59         return None
  60
  61     return r.content
  62
  63 def parse_xml(xmldata):
  64     ns = {
  65         'atom':"http://www.w3.org/2005/Atom",
  66         'yt': "http://www.youtube.com/xml/schemas/2015",
  67         'media':"http://search.yahoo.com/mrss/",
  68         'at': "http://purl.org/atompub/tombstones/1.0",
  69     }
  70
  71     feed = ElementTree.fromstring(xmldata)
  72
  73     if feed.find('at:deleted-entry',ns):
  74         (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
  75         return None, None, [{'deleted': True, 'video_id': vid}], None, None
  76
  77     title = feed.find('atom:title',ns).text
  78     author = feed.find('atom:author/atom:name',ns).text \
  79         if feed.find('atom:author',ns) else None
  80     # for /user/<> endpoint: find out UC-id:
  81     # for playlists: this is who created the playlist:
  82     try:   channel_id = feed.find('yt:channelId',ns).text
  83     except:channel_id=None # XXX: why does ternary not work!?
  84     # for pullsub: if this exists, we're looking at a playlist:
  85     try:   playlist_id = feed.find('yt:playlistId',ns).text
  86     except:playlist_id=None # XXX: why does ternary not work!?
  87     videos = []
  88     for entry in feed.findall('atom:entry',ns):
  89         videos.append({
  90             'video_id': entry.find('yt:videoId',ns).text,
  91             'title': entry.find('atom:title',ns).text,
  92             'published': entry.find('atom:published',ns).text,
  93             'channel_id': entry.find('yt:channelId',ns).text,
  94             'author': entry.find('atom:author',ns).find('atom:name',ns).text,
  95             # extra fields for pull_subs/webhook:
  96             'updated': entry.find('atom:updated',ns).text,
  97         })
  98
  99     return title, author, videos, channel_id, playlist_id
 100
def update_channel(db, xmldata, from_webhook=False):
    """
    stores the videos (and their channels/playlists) found in a feed in the
    database.
    db: database connection; needs to provide cursor() and commit()
        (presumably a sqlite3 connection -- the queries use sqlite syntax).
    xmldata: raw feed/notification xml, as understood by parse_xml().
    from_webhook: if true, the playlist row and the feed-level author are not
        written.
    returns: False if xmldata is empty, True otherwise.
    """
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow it.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        # a video is new if we don't have a row for it yet:
        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # only the player_response (third element) is of interest here:
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
            # video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # g_v_i gives us the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees it at the top of the feed, and it doesn't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else:#, it's just an update to an older video.
                timestamp = published

            # Note: the 'published' column receives the feed's own timestamp;
            # the corrected date from above only influences 'crawled'.
            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                    SET title = ?
                    WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                               VALUES (?, ?)
            """, (video['video_id'], playlist))

    # store the playlist itself and the channel of whoever created it:
    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
                            VALUES (?, ?, ?)
            """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
                            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
 201
 202 def get_video_info(video_id, sts=0, algo="", _embed=True):
 203     """
 204     returns: best-quality muxed video stream, stream map, player_response, error-type/mesage
 205     error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
 206     """
 207     player_error, metadata = None, None # for 'exhausted'
 208     with sqlite3.connect(cf['global']['database']) as conn:
 209         c = conn.cursor()
 210         c.execute("SELECT * FROM captcha_cookies")
 211         cookies = dict(c.fetchall())
 212     today = datetime.now(timezone.utc).strftime("%Y%m%d")
 213     # XXX: anticaptcha hasn't been adapted
 214     # XXX: this is not cached any more!
 215     r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
 216         'videoId': video_id,
 217         'context': {
 218             'client': {
 219                 'gl': 'US',
 220                 'hl': 'en',
 221                 'clientName': 'WEB',
 222                 'clientVersion': f'2.{today}.01.01',
 223                 **({'clientScreen': 'EMBED'} if _embed else {}),
 224             },
 225             'thirdParty': {'embedUrl': 'http://example.com/'}
 226         },
 227         'playbackContext': {'contentPlaybackContext': {'signatureTimestamp': sts}}
 228     }, cookies=cookies)
 229
 230     if not r or r.status_code == 429:
 231         return None, None, None, 'banned', 'possible IP ban'
 232
 233     metadata = r.json()
 234     if "error" in metadata:
 235         return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
 236     playabilityStatus = metadata['playabilityStatus']['status']
 237     if playabilityStatus != "OK":
 238         playabilityReason = metadata['playabilityStatus'].get('reason',
 239                 '//'.join(metadata['playabilityStatus'].get('messages',[])))
 240         player_error = f"{playabilityStatus}: {playabilityReason}"
 241         # "Watch on YouTube" button is visible => "Playback on other websites
 242         # has been disabled by the video owner." => retry detailpage API
 243         if (playabilityStatus == "UNPLAYABLE" and
 244             'proceedButton' in metadata['playabilityStatus'] \
 245                 .get('errorScreen',{}).get('playerErrorMessageRenderer',{})
 246             and sts != 0 # only need metadata when no sts (via pubsubhubbub)
 247             and _embed
 248
 249         ):
 250             _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, sts, algo, _embed=False)
 251             if not error_embed:
 252                 metadata = metadata_embed
 253             elif errormsg_embed == "LOGIN_REQUIRED: Sign in to confirm your age":
 254                 return None, None, metadata, 'agegated', player_error
 255             else:
 256                 return None, None, metadata, error_embed, errormsg_embed
 257         else:
 258             # without videoDetails, there's only the error message
 259             maybe_metadata = metadata if 'videoDetails' in metadata else None
 260             return None, None, maybe_metadata, 'player', player_error
 261
 262     # livestreams have no adaptive/muxed formats:
 263     is_live = metadata['videoDetails'].get('isLive', False)
 264
 265     if not 'formats' in metadata['streamingData'] and not is_live:
 266         return None, None, metadata, 'no-url', player_error
 267
 268     formats = metadata['streamingData'].get('formats',[])
 269     for (i,v) in enumerate(formats):
 270         if not ('cipher' in v or 'signatureCipher' in v): continue
 271         cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
 272         formats[i]['url'] = unscramble(cipher, algo)
 273
 274     adaptive = metadata['streamingData'].get('adaptiveFormats',[])
 275     for (i,v) in enumerate(adaptive):
 276         if not ('cipher' in v or 'signatureCipher' in v): continue
 277         cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
 278         adaptive[i]['url'] = unscramble(cipher, algo)
 279
 280     stream_map = {
 281         'adaptive': adaptive, 'muxed': formats,
 282         'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
 283     }
 284
 285     url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url'] \
 286         if not is_live else None
 287
 288     # ip-locked videos can be recovered if the proxy module is loaded:
 289     is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
 290
 291     nonfatal = 'livestream' if is_live \
 292         else 'geolocked' if is_geolocked \
 293         else None
 294
 295     return url, stream_map, metadata, nonfatal, None
 296
 297 def unscramble(cipher, algo):
 298     signature = list(cipher['s'][0])
 299     for c in algo.split():
 300         op, ix = re.match(r"([rsw])(\d+)?", c).groups()
 301         ix = int(ix) % len(signature) if ix else 0
 302         if op == 'r': signature = list(reversed(signature))
 303         if op == 's': signature = signature[ix:]
 304         if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
 305     sp = cipher.get('sp', ['signature'])[0]
 306     sig = cipher.get('sig', [''.join(signature)])[0]
 307     return f"{cipher['url'][0]}&{sp}={sig}"
 308
 309 def video_metadata(metadata):
 310     if not metadata:
 311         return {}
 312
 313     meta1 = metadata['videoDetails']
 314     meta2 = metadata['microformat']['playerMicroformatRenderer']
 315
 316     # sometimes, we receive the notification so early that the length is not
 317     # yet populated. Nothing we can do about it.
 318     length = int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']) or None
 319
 320     published_at = meta2.get('liveBroadcastDetails',{}) \
 321         .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")
 322
 323     # Note: 'premiere' videos have livestream=False and published= will be the
 324     # start of the premiere.
 325     return {
 326         'title': meta1['title'],
 327         'author': meta1['author'],
 328         'channel_id': meta1['channelId'],
 329         'published': published_at,
 330         'views': int(meta1['viewCount']),
 331         'length': length,
 332         'livestream': meta1['isLiveContent'],
 333     }
 334
 335 def store_video_metadata(video_id):
 336     # check if we know about it, and if not, fetch and store video metadata
 337     with sqlite3.connect(cf['global']['database']) as conn:
 338         c = conn.cursor()
 339         c.execute("SELECT 1 from videos where id = ?", (video_id,))
 340         new_video = len(c.fetchall()) < 1
 341         if new_video:
 342             _, _, meta, _, _ = get_video_info(video_id)
 343             if meta:
 344                 meta = video_metadata(meta)
 345                 c.execute("""
 346                     INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
 347                                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
 348                 """, (
 349                     video_id,
 350                     meta['channel_id'],
 351                     meta['title'],
 352                     meta['length'],
 353                     meta['published'],
 354                     meta['published'],
 355                 ))
 356                 c.execute("""
 357                     INSERT OR REPLACE INTO channels (id, name)
 358                                     VALUES (?, ?)
 359                 """, (meta['channel_id'], meta['author']))
 360
 361 def fetch_video_flags(token, video_ids):
 362     with sqlite3.connect(cf['global']['database']) as conn:
 363         c = conn.cursor()
 364         c.execute("""
 365             SELECT video_id,display
 366               FROM flags
 367              WHERE user = ?
 368                AND display IS NOT NULL
 369                AND video_id IN ({})
 370                -- AND display = 'pinned'
 371         """.format(",".join(["?"]*len(video_ids))), (token,*video_ids))
 372         flags = c.fetchall()
 373         pinned = [video for video,disp in flags if disp == 'pinned']
 374         hidden = [video for video,disp in flags if disp == 'hidden']
 375
 376         return pinned, hidden
 377
 378 from werkzeug.exceptions import NotFound
 379 class NoFallbackException(NotFound): pass
 380 def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
 381     """
 382     finds the next route that matches the current url rule, and executes it.
 383     args, kwargs: pass all arguments of the current route
 384     """
 385     from flask import current_app, request, g
 386
 387     # build a list of endpoints that match the current request's url rule:
 388     matching = [
 389         rule.endpoint
 390         for rule in current_app.url_map.iter_rules()
 391         if rule.rule == request.url_rule.rule
 392     ]
 393     current = matching.index(request.endpoint)
 394
 395     # since we can't change request.endpoint, we always get the original
 396     # endpoint back. so for repeated fall throughs, we use the g object to
 397     # increment how often we want to fall through.
 398     if not '_fallback_next' in g:
 399         g._fallback_next = 0
 400     g._fallback_next += 1
 401
 402     next_ep = current + g._fallback_next
 403
 404     if next_ep < len(matching):
 405         return current_app.view_functions[matching[next_ep]](*args, **kwargs)
 406     else:
 407         raise NoFallbackException
 408
 409 def websub_url_hmac(key, feed_id, timestamp, nonce):
 410     """ generate sha1 hmac, as required by websub/pubsubhubbub """
 411     sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
 412     return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()
 413
 414 def websub_body_hmac(key, body):
 415     return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
 416
 417 def flask_logger(msg, level="warning"):
 418     level = dict(
 419         CRITICAL=50,
 420         ERROR=40,
 421         WARNING=30,
 422         INFO=20,
 423         DEBUG=10,
 424         NOTSET=0,
 425     ).get(level.upper(), 0)
 426     try:
 427         from flask import current_app
 428         current_app.logger.log(level, msg)
 429     except:
 430         pass
 431
 432 def pp(*args):
 433     from pprint import pprint
 434     import sys, codecs
 435     pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))