subscriptionfeed.git / app / common / common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a
# livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Thread.setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response).
# We can then use a flask error_handler to include the request data in the
# error log. Since this module is also used from outside the flask
# appcontext, the append is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
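
# A minimal sketch of the error_handler mentioned above (hypothetical, not
# part of this module; assumes a flask `app` object wherever it is
# registered):
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {body[:200]}")
#       return "internal server error", 500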

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content
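
# Usage sketch: the feed endpoint takes e.g. 'channel_id' or 'playlist_id'
# as the feed_type (id values below are illustrative):
#   fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxxxx")   # channel feed
#   fetch_xml("playlist_id", "PLxxxxxxxxxxxxxxxxxxxxxxxx")  # playlist feed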

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry', ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # Note: a bare truthiness check on an Element is unreliable (an element
    # without children evaluates as falsy), so we compare against None.
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    elem = feed.find('yt:channelId', ns)
    channel_id = elem.text if elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    elem = feed.find('yt:playlistId', ns)
    playlist_id = elem.text if elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id
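
# The returned 5-tuple looks like (illustrative values):
#   ("Title", "Author", [<video dicts>], "UC…", None)               # channel feed
#   (None, None, [{'deleted': True, 'video_id': "…"}], None, None)  # tombstone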

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from
    # the first video.
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: deletion events are not just fired for actual deletions,
            # but also for unlisted videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            flask_logger(f"new video {video['video_id']}")
            _, _, meta, _, _ = get_video_info(video['video_id'])
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to
            # an older video). get_video_info gives us the date the video
            # was published to viewers, so we prefer that. But since it only
            # returns the date without a time, we still use the xmlfeed's
            # timestamp if both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                flask_logger(f"published {published} / {published2}")
                if published < published2: # get_video_info's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']

            now = datetime.now(timezone.utc)

            # We pretend that all videos uploaded this week were uploaded
            # just now, so the user sees them at the top of the feed, and
            # they don't get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, published, crawled)
                VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                video['published'],
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the
        # user why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
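
# Usage sketch (assumes the configured sqlite database; feed id illustrative):
#   with sqlite3.connect(cf['global']['database']) as db:
#       update_channel(db, fetch_xml("channel_id", "UC…"))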

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: banned, malformed, player, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {
            'adaptive': adaptive,
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
            'dashManifestUrl': metadata['streamingData'].get('dashManifestUrl'),
        }

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None
    else: # all 'el' values were tried without success:
        return None, None, metadata, 'exhausted', player_error
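
# Usage sketch (UxxajLWwzqY is the test video id noted below):
#   url, stream_map, meta, error, errmsg = get_video_info("UxxajLWwzqY")
#   if error: ... # one of the error types listed in the docstring
#   else: ...     # url is the best-quality muxed stream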

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
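
# Worked example: with cipher['s'] == ['abcdef'] and algo == "r s2 w1",
# the signature is transformed step by step:
#   r  -> reverse:              "fedcba"
#   s2 -> slice off first two:  "dcba"
#   w1 -> swap chars 0 and 1:   "cdba"
# (the algo string itself must be extracted from the player javascript
# elsewhere; it is passed in by the caller)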

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']

    published_at = meta2.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be
    # the start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': int(meta2['lengthSeconds']) or int(meta1['lengthSeconds']),
        'livestream': meta1['isLiveContent'],
    }
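
# Returns e.g. (illustrative values):
#   {'title': 'a video', 'author': 'a channel', 'channel_id': 'UC…',
#    'published': '2021-01-01T00:00:00Z', 'views': 1234, 'length': 300,
#    'livestream': False}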

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, published, crawled)
                    VALUES (?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

        return pinned, hidden
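
# Usage sketch (this is the dedup'd flag retrieval shared by the feed and
# the /c/* pages; `user_token` and `videos` are illustrative names):
#   pinned, hidden = fetch_video_flags(user_token, [v['video_id'] for v in videos])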

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    Finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # Since we can't change request.endpoint, we always get the original
    # endpoint back. So for repeated fall-throughs, we use the g object to
    # track how many times we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
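
# Usage sketch (hypothetical routes): register two views on the same rule;
# the first calls fallback_route() to defer to the next matching endpoint:
#
#   @app.route('/watch')
#   def watch():
#       if not can_handle(request.args): # hypothetical check
#           return fallback_route()
#       ...
#
#   @app.route('/watch', endpoint='watch_fallback')
#   def watch_fallback():
#       ...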

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac over the raw request body (bytes) """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
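
# Verification sketch (assumes a `secret` from the config and a flask
# request object): websub hubs send 'sha1=<hexdigest>' in X-Hub-Signature:
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   valid = hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', ''))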

def flask_logger(msg, level="warning"):
    try:
        from flask import current_app
        # Note: Logger.log() expects a numeric level, so passing the level
        # name as a string would raise; pick the warning()/error()/...
        # method by name instead.
        getattr(current_app.logger, level)(msg)
    except Exception:
        pass # not within flask (e.g. utils.py)

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))