import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

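# A minimal config sketch (assumption: 'database' is the only key this module
# itself reads; the frontend may require more):
#
#   [global]
#   database = /var/lib/yt/database.sqlite
#
# The path can be overridden with the YT_CONFIG environment variable.
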
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests Session to store each
# outgoing request (url, params and response text) in the flask request's g
# object. a flask error_handler can then include the request data in the
# error log. since this module is also used outside a flask appcontext, the
# access to g is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
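
# A hedged sketch of the error handler mentioned above (hypothetical; a real
# handler would be registered on the flask app in the frontend):
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {body[:200]}")
#       raise e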

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

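# Usage sketch: the feed endpoint accepts a channel_id, playlist_id or user
# parameter (placeholder ids below):
#
#   xmldata = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   xmldata = fetch_xml("playlist_id", "PLxxxxxxxxxxxxxxxxxxxxxx")
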
def parse_xml(xmldata):
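    """
    parse a youtube atom feed (or websub push) into a
    (title, author, videos, channel_id, playlist_id) tuple.
    tombstone (deletion) pushes only populate the videos list.
    """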
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Element truthiness tests for child elements, so we must compare
    # against None explicitly:
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None:
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId', ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId', ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, so we take it from the
    # first video's entry instead.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not only fired for actual deletions,
            # but also for unlisted videos and for livestreams that just
            # ended (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g. a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A, so the video looks like just an update to an older
            # one). get_video_info gives us the date the video was published
            # to viewers, so we prefer that. but since it only returns the
            # date without a time, we still use the xmlfeed's timestamp when
            # both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed instead
            # of having them inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the
        # user why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

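# Usage sketch (hypothetical db handle; mirrors the pull-subscriptions and
# websub callers):
#
#   db = sqlite3.connect(cf['global']['database'])
#   update_channel(db, fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx"))
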
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        # XXX: this is not cached any more!
        # XXX: age-gated now broken: HtVdAasjOgU (embed ok), XgnwCQzjau8 (no embed)
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    'clientName': 'WEB',
                    'clientVersion': f'2.{today}.01.01',
                }
            },
            'playbackContext': {'contentPlaybackContext': {'signatureTimestamp': sts}}
        }, cookies=cookies)

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            #if playabilityStatus == "UNPLAYABLE": XXX: do we need that still?
            if (playabilityStatus == "LOGIN_REQUIRED"
                    and metadata['playabilityStatus'].get('reason') == "Sign in to confirm your age"
                    and sts != 0):
                # age-gate bypass: retry via the embedded player endpoint:
                r = requests.get("https://www.youtube.com/get_video_info", {
                    "html5": "1",
                    "video_id": video_id,
                    "eurl": f"https://youtube.googleapis.com/v/{video_id}",
                    "el": "embedded",
                    "sts": sts,
                    "hl": "en_US",
                })
                params = parse_qs(r.text)
                if 'errorcode' in params: # status=fail
                    return None, None, None, 'malformed', params['reason'][0]
                from flask import current_app
                current_app.logger.error(r.text)
                metadata = json.loads(params.get('player_response')[0])

            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive, 'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url'] \
            if not is_live else None

        # ip-locked videos can be recovered if the proxy module is loaded;
        # note: url is None for livestreams:
        is_geolocked = url is not None and 'gcr' in parse_qs(urlparse(url).query)

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else None

        return url, stream_map, metadata, nonfatal, None

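# Usage sketch: with the default sts=0/algo="", ciphered stream urls are not
# descrambled, but the metadata is still usable:
#
#   url, stream_map, meta, error, errdetail = get_video_info("dQw4w9WgXcQ")
#   if error in (None, 'livestream', 'geolocked'):
#       pass # playable (livestreams via stream_map['hlsManifestUrl'])
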
def unscramble(cipher, algo):
    # apply the descrambling algorithm (whitespace-separated 'r'everse,
    # 's'plice and s'w'ap operations, e.g. "r s2 w3") to the signature:
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

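# Worked example (hypothetical algo string; the real one is extracted from
# the player JS): with s = "abcdef" and algo = "r w2 s1",
#   r  -> "fedcba"   (reverse)
#   w2 -> "defcba"   (swap positions 0 and 2)
#   s1 -> "efcba"    (splice off the first element)
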
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    # sometimes we receive the notification so early that the length is not
    # yet populated. nothing we can do about that.
    length = int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']) or None

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
    }

def store_video_metadata(video_id):
    # check if we already know the video; if not, fetch and store its metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
                AND display IS NOT NULL
                AND video_id IN ({})
                -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

        return pinned, hidden

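# Usage sketch (token identifies the user; the ids are examples):
#
#   pinned, hidden = fetch_video_flags(token, ["dQw4w9WgXcQ", "HtVdAasjOgU"])
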
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

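# Usage sketch (hypothetical views; flask dispatches to the first endpoint
# registered on a rule, and fallback_route() advances to the next one):
#
#   @app.route('/channel/<cid>')
#   def channel_primary(cid):
#       if backend_unavailable: return fallback_route(cid)
#       ...
#
#   @app.route('/channel/<cid>')
#   def channel_fallback(cid):
#       ...
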
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

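# Verification sketch (assumption: the hub sends "X-Hub-Signature: sha1=<hex>"
# as per the pubsubhubbub spec):
#
#   given = request.headers.get('X-Hub-Signature', '').partition('=')[2]
#   expected = websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, given):
#       abort(403)
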
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError: pass # not within flask appcontext (e.g. utils.py)

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))