# app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Note: setDaemon() is deprecated since python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey-patch the requests Session to store each
# outgoing request (url, params and response text) in the flask request's g
# object. A flask error_handler can then include the request data in the
# error log. Since this code also runs outside the flask app context (e.g.
# from utils.py), the access to g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
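
# A sketch (hypothetical; any real handler lives in the frontend app, not
# here) of the error_handler mentioned above, showing how the tuples
# collected by _NSASession could end up in the error log:
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error(f"api request: {url} {params} -> {text[:200]}")
#       raise e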

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
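
# Usage (feed ids elided; these are the feed_type values the youtube feed
# endpoint accepts, as used by the callers of this function):
#   fetch_xml("channel_id", "UC...")   # channel uploads feed
#   fetch_xml("playlist_id", "PL...")  # playlist feed
#   fetch_xml("user", "someusername")  # legacy /user/<> urls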

def parse_xml(xmldata):
    """
    parses a youtube videos.xml feed and returns a
    (title, author, videos, channel_id, playlist_id) tuple.
    """
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: ElementTree elements are falsy when they have no children, even
    # if they exist; hence all find() results are compared against None.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    # (a plain `elem.text if elem else None` ternary does not work here: a
    # childless element is falsy even when it exists and carries text.)
    elem = feed.find('yt:channelId',ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId',ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to
            # an older video). get_video_info gives us the date the video was
            # published to viewers, so we prefer that. But since it only
            # returns the date without a time, we still use the xmlfeed's
            # timestamp when both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed, and
            # they don't get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published
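            # Worked example (hypothetical dates): published Mar 1, webhook
            # arrives Mar 3 -> crawled=now, the video surfaces at the top of
            # the feed; published Jan 1, made public Mar 3 -> crawled=Jan 1,
            # the video sorts into the feed's history instead.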

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
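
# Usage sketch (hypothetical; the webhook and pull_subs callers do something
# along these lines):
#   with sqlite3.connect(cf['global']['database']) as db:
#       update_channel(db, fetch_xml("channel_id", "UC..."))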

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error type, error message
    error types: banned, malformed, player, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message:
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData'].get('formats',[])
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        for (i,v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive,
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
            'dashManifestUrl': metadata['streamingData'].get('dashManifestUrl'),
        }

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else:
        return None, None, metadata, 'exhausted', player_error
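
# Illustrative call (video id is just a well-known example):
#   url, stream_map, meta, error, errdetails = get_video_info("dQw4w9WgXcQ")
# On success, error is None (or 'geolocked' for ip-locked videos, whose url
# may still work through the proxy module); on failure, url is None and
# error/errdetails describe what went wrong.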

def unscramble(cipher, algo):
    # applies the descrambling operations in `algo` to the scrambled
    # signature: r=reverse, s<n>=slice off the first <n> chars,
    # w<n>=swap positions 0 and <n>.
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
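
# Worked example (made-up inputs): with cipher parsed from
# "s=abcdef&url=https%3A%2F%2Fexample.com%2Fv" and algo "r s2 w2":
#   r  -> fedcba   (reverse)
#   s2 -> dcba     (slice off the first two chars)
#   w2 -> bcda     (swap positions 0 and 2)
# giving "https://example.com/v&signature=bcda".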

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails',{}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False, and 'published' will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }
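
# e.g. video_metadata(meta) for a regular upload returns something like
# (values invented): {'title': '...', 'author': '...', 'channel_id': 'UC...',
# 'published': '2021-03-01T00:00:00Z', 'views': 1234, 'length': 300,
# 'livestream': False}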

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # Note: sqlite can't bind a list, so we generate one '?' placeholder
        # per video_id for the IN clause (e.g. "?,?,?" for three ids):
        c.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
            AND display IS NOT NULL
            AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token,*video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

        return pinned, hidden

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # keep track of how often we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
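
# Usage sketch (hypothetical routes and helper): register two views on the
# same rule and let the first delegate to the second when it can't answer:
#
#   @app.route('/watch')
#   def local_watch():
#       if not can_play_locally():  # hypothetical helper
#           return fallback_route()
#       ...
#
#   @app.route('/watch')
#   def redirect_watch():
#       return redirect(f"https://www.youtube.com/watch?v={request.args['v']}")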

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate the sha1 hmac over the raw request body """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
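
# Verification sketch (assumed subscriber behavior, per the websub spec): the
# url hmac is embedded in the callback url at subscribe time and re-checked
# on every delivery; the body hmac is compared against the hub's
# "X-Hub-Signature: sha1=<hexdigest>" header, e.g.:
#   expected = websub_body_hmac(key, request.data)
#   ok = hmac.compare_digest(f"sha1={expected}", request.headers['X-Hub-Signature'])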

def flask_logger(msg, level="warning"):
    try:
        from flask import current_app
        # Note: logger.log() expects a numeric level, so with a level *name*
        # we call the corresponding method (logger.warning(msg), etc.):
        getattr(current_app.logger, level)(msg)
    except Exception:
        pass

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))