import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Timer.setDaemon() is deprecated; set the attribute instead
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since requests are also made from outside the flask appcontext (e.g. from
# utils.py), the g access is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super().request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
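
# a minimal sketch (commented out, since this module has no Flask app object)
# of an error_handler that surfaces the collected request data; `app` is a
# hypothetical Flask instance in the frontend:
#
#   @app.errorhandler(Exception)
#   def dump_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"{url} ({params}): {response_text[:200]}")
#       return "internal server error", 500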

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
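
# hedged usage note: feed_type is the query parameter sent to the videos.xml
# endpoint; this app uses 'channel_id' and 'playlist_id' (see parse_xml below).
# the id below is a placeholder:
#   fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")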
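# for reference, a heavily abridged sketch of the Atom feed that parse_xml
# below consumes; element values are placeholders, and only the elements the
# parser actually reads are shown:
#
#   <feed xmlns="http://www.w3.org/2005/Atom"
#         xmlns:yt="http://www.youtube.com/xml/schemas/2015">
#     <title>Channel Name</title>
#     <author><name>Channel Name</name></author>
#     <yt:channelId>UCxxxx</yt:channelId>
#     <entry>
#       <yt:videoId>XxXxXxXxXxX</yt:videoId>
#       <yt:channelId>UCxxxx</yt:channelId>
#       <title>Video Title</title>
#       <published>2021-01-01T00:00:00+00:00</published>
#       <updated>2021-01-01T00:00:00+00:00</updated>
#       <author><name>Channel Name</name></author>
#     </entry>
#   </feed>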
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: an Element's truthiness reflects whether it has children, not
    # whether it was found; hence the explicit `is not None` checks (this is
    # also why a plain ternary on find() did not work here).
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId',ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId',ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not only fired for actual deletions,
            # but also when videos are unlisted and when livestreams end
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day
            # B; the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # g_v_i gives us the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def get_video_info(video_id, sts=0, algo="", _embed=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        # XXX: this is not cached any more!
        # note: age-gated works as long as it's embeddable (HtVdAasjOgU ok, XgnwCQzjau8 bad, SkRSXFQerZs tvhtml5-only)
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    'clientName': 'WEB_EMBEDDED_PLAYER' if _embed else 'WEB',
                    'clientVersion': f'2.{today}.01.01',
                    #"clientName": "ANDROID",
                    #"clientVersion": "16.02",
                }
            },
            'playbackContext': {'contentPlaybackContext': {'signatureTimestamp': sts}}
        }, cookies=cookies)

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            #if playabilityStatus == "UNPLAYABLE": XXX: do we still need that?
            if (playabilityStatus == "LOGIN_REQUIRED"
                    and "confirm your age" in metadata['playabilityStatus'].get('reason','')
                    and sts != 0 # only need metadata when no sts (via pubsubhubbub)
                    and not _embed
            ):
                _, _, metadata_embed, error_embed, _ = get_video_info(video_id, sts, algo, _embed=True)
                if not error_embed:
                    metadata['streamingData'] = metadata_embed['streamingData']
                    metadata['playabilityStatus'] = metadata_embed['playabilityStatus']
                else:
                    try:
                        r2 = requests.get("https://www.youtube.com/get_video_info", dict(
                            video_id=video_id,
                            html5="1",
                            c="ANDROID", # XXX: randomly 404's as well
                            cver="16.02",
                            el="embedded",
                            eurl=f"https://youtube.googleapis.com/v/{video_id}",
                        ))
                        metadata_android = json.loads(parse_qs(r2.text).get('player_response',['{}'])[0])
                        # has 'playabilityStatus', 'streamingData', 'videoDetails', but no 'microformat' key
                        metadata['streamingData'] = metadata_android['streamingData']
                        metadata['playabilityStatus'] = metadata_android['playabilityStatus']
                    except Exception:
                        return None, None, metadata, 'agegated', player_error
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive, 'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url'] \
            if not is_live else None

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else None

        return url, stream_map, metadata, nonfatal, None
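
# hedged usage sketch; the video id is the embeddable age-gate example from
# the note above, and sts/algo (from the scraped player JS) can stay at their
# defaults for videos with unscrambled signatures:
#   url, stream_map, meta, error, errdetail = get_video_info("HtVdAasjOgU")
#   if error not in (None, 'livestream', 'geolocked'):
#       ...  # fatal: 'banned', 'malformed', 'player', 'agegated' or 'no-url'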

def unscramble(cipher, algo):
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it.
    length = int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']) or None

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 FROM videos WHERE id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token,*video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

        return pinned, hidden

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
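
# hedged usage sketch with hypothetical view functions: when two modules
# register the same rule, the first one can hand the request to the next:
#   @frontend.route('/watch')
#   def watch():
#       if not can_serve(request.args):  # hypothetical predicate
#           return fallback_route()
#       ...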

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw (bytes) request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
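
# hedged sketch of verifying an incoming websub POST with the helper above;
# per the pubsubhubbub spec, the signature header carries a 'sha1=' prefix,
# and unverifiable content should be acknowledged but ignored:
#   theirs = request.headers.get('X-Hub-Signature', '').replace('sha1=', '', 1)
#   ours = websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(ours, theirs):
#       return '', 200  # ack receipt, but discard the unauthenticated payload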

def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except Exception:
        pass # not within flask appcontext

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))