[subscriptionfeed.git] / app / common / common.py
1 import os
2 import re
3 import json
4 import base64
5 import sqlite3
6 import requests
7 import hmac, hashlib
8 import requests_cache
9 import dateutil.parser
10 from xml.etree import ElementTree
11 from configparser import ConfigParser
12 from datetime import datetime, timezone
13 from urllib.parse import parse_qs, urlparse
14
15 from .innertube import parse_infocard, parse_endcard
16
17 cf = ConfigParser()
18 config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
19 cf.read(config_filename)
20 if 'global' not in cf: # todo: full config check
21 raise Exception("Configuration file not found or empty")
22
23 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but caching that long makes the reddit feed very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
24 requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
25
26 # Note: this should only be required for the 'memory' backed cache.
27 # TODO: only run for long-running processes, i.e. the frontend
28 from threading import Timer
29 def purge_cache(sec):
30 requests_cache.remove_expired_responses()
31 t = Timer(sec, purge_cache, args=(sec,))
32 t.daemon = True # setDaemon() is deprecated in favour of the daemon attribute
33 t.start()
34 purge_cache(10*60)
35
36 # for debugging purposes, monkey-patch the requests session to store each requests-request (url, params and response) in the flask request's g object; a flask error_handler can then include the request data in the error log.
37 # since this code also runs outside the flask appcontext (e.g. from utils.py), the g access is wrapped in a try-except block.
38 from flask import g
39 import requests
40 from requests import Session as OriginalSession
41 class _NSASession(OriginalSession):
42 def request(self, method, url, params=None, data=None, **kwargs):
43 response = super(_NSASession, self).request(
44 method, url, params, data, **kwargs
45 )
46 try:
47 if 'api_requests' not in g:
48 g.api_requests = []
49 g.api_requests.append((url, params, response.text))
50 except RuntimeError: pass # not within flask (e.g. utils.py)
51 return response
52 requests.Session = requests.sessions.Session = _NSASession
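# (illustration only, not part of this file: the error_handler referenced above
#  lives in the frontend app; the handler below is a hypothetical sketch of how
#  the collected g.api_requests could be dumped into the error log.)
#
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           current_app.logger.error(f"{url} {params}: {response_text[:200]}")
#       return "internal server error", 500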
53
54 def fetch_xml(feed_type, feed_id):
55 # TODO: handle requests.exceptions.ConnectionError
56 r = requests.get("https://www.youtube.com/feeds/videos.xml", {
57 feed_type: feed_id,
58 })
59 if not r.ok:
60 return None
61
62 return r.content
63
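# (for reference, a heavily trimmed sketch of the Atom feed that parse_xml below
#  consumes; the element names match the namespaces declared in the function,
#  the ids and values are placeholders, not real data:)
#
#   <feed xmlns="http://www.w3.org/2005/Atom"
#         xmlns:yt="http://www.youtube.com/xml/schemas/2015">
#     <title>Some Channel</title>
#     <yt:channelId>UCxxxxxxxxxxxxxxxxxxxxxx</yt:channelId>
#     <entry>
#       <yt:videoId>XXXXXXXXXXX</yt:videoId>
#       <yt:channelId>UCxxxxxxxxxxxxxxxxxxxxxx</yt:channelId>
#       <title>Some Video</title>
#       <published>2021-01-01T00:00:00+00:00</published>
#       <updated>2021-01-01T00:00:05+00:00</updated>
#       <author><name>Some Channel</name></author>
#     </entry>
#   </feed>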
64 def parse_xml(xmldata):
65 ns = {
66 'atom':"http://www.w3.org/2005/Atom",
67 'yt': "http://www.youtube.com/xml/schemas/2015",
68 'media':"http://search.yahoo.com/mrss/",
69 'at': "http://purl.org/atompub/tombstones/1.0",
70 }
71
72 feed = ElementTree.fromstring(xmldata)
73
74 if feed.find('at:deleted-entry',ns) is not None: # note: Element truthiness tests children, not presence
75 (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
76 return None, None, [{'deleted': True, 'video_id': vid}], None, None
77
78 title = feed.find('atom:title',ns).text
79 author = feed.find('atom:author/atom:name',ns).text \
80 if feed.find('atom:author',ns) is not None else None
81 # for /user/<> endpoint: find out UC-id:
82 # for playlists: this is who created the playlist:
83 try: channel_id = feed.find('yt:channelId',ns).text
84 except AttributeError: channel_id = None # element missing; a plain ternary fails because Element truthiness tests children, not presence
85 # for pullsub: if this exists, we're looking at a playlist:
86 try: playlist_id = feed.find('yt:playlistId',ns).text
87 except AttributeError: playlist_id = None # as above
88 videos = []
89 for entry in feed.findall('atom:entry',ns):
90 videos.append({
91 'video_id': entry.find('yt:videoId',ns).text,
92 'title': entry.find('atom:title',ns).text,
93 'published': entry.find('atom:published',ns).text,
94 'channel_id': entry.find('yt:channelId',ns).text,
95 'author': entry.find('atom:author',ns).find('atom:name',ns).text,
96 # extra fields for pull_subs/webhook:
97 'updated': entry.find('atom:updated',ns).text,
98 })
99
100 return title, author, videos, channel_id, playlist_id
101
102 def update_channel(db, xmldata, from_webhook=False):
103 if not xmldata: return False
104
105 # Note: websub does not return the global author, so we take it from the first video
106 title, author, videos, channel, playlist = parse_xml(xmldata)
107
108 c = db.cursor()
109 from flask import current_app # XXX: remove
110 for i, video in enumerate(videos):
111 if video.get('deleted'):
112 if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
113 # TODO: enable once we enforce hmac validation:
114 #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
115 break
116
117 now = datetime.now(timezone.utc)
118 updated = dateutil.parser.parse(video['updated'])
119 published = dateutil.parser.parse(video['published'])
120 # if the updated and published times are near-identical, we assume it's new.
121 # checking that it was posted this week is necessary during xmlfeed pulling.
122 if (updated - published).total_seconds() < 60 and (now - published).days < 7:
123 timestamp = now
124 if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
125 else: # it might just be an update to an older video, or a previously unlisted one.
126 # first, assume it's an older video (correct when pulling xmlfeeds)
127 timestamp = published
128 # then, check if we don't know about it and if so, look up the real date.
129
130 # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
131 # a video gets uploaded as unlisted on day A and set to public on day B;
132 # the webhook is sent on day B, but 'published' says A. The video
133 # therefore looks like it's just an update to an older video). If
134 # that's the case, we call get_video_info and double-check.
135 # We only need to do this for videos not yet in the database.
136 c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
137 new_video = len(c.fetchall()) < 1
138 if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
139 if from_webhook and new_video:
140 if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
141 _, _, meta, _, _ = get_video_info(video['video_id'])
142 if meta:
143 meta = video_metadata(meta)
144 published = dateutil.parser.parse(meta['published'])
145 if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
146 if (now - published).days < 7:
147 timestamp = now
148 else: # it's just an update to an older video.
149 timestamp = published
150
151 c.execute("""
152 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
153 VALUES (?, ?, ?, datetime(?), datetime(?))
154 """, (
155 video['video_id'],
156 video['channel_id'],
157 video['title'],
158 video['published'],
159 timestamp
160 ))
161
162 # for channels, this is obviously always the same, but playlists can
163 # consist of videos from different channels:
164 if i == 0 or playlist:
165 c.execute("""
166 INSERT OR REPLACE INTO channels (id, name)
167 VALUES (?, ?)
168 """, (video['channel_id'], video['author']))
169
170 # keep track of which videos are in a playlist, so we can show the user
171 # why a video is in their feed:
172 if playlist:
173 c.execute("""
174 INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
175 VALUES (?, ?)
176 """, (video['video_id'], playlist))
177
178 if playlist and not from_webhook: # Note: playlists can't get updated via websub
179 c.execute("""
180 INSERT OR REPLACE INTO playlists (id, name, author)
181 VALUES (?, ?, ?)
182 """, (playlist, title, channel))
183 c.execute("""
184 INSERT OR REPLACE INTO channels (id, name)
185 VALUES (?, ?)
186 """, (channel, author))
187
188 db.commit()
189
190 return True
191
192 def get_video_info(video_id, sts=0, algo=""):
193 """
194 returns: best-quality muxed video stream, stream map, player_response, error-type/message
195 error types: player, malformed, livestream, geolocked, exhausted
196 """
197 player_error = None # for 'exhausted'
198 for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
199 r = requests.get("https://www.youtube.com/get_video_info", {
200 "video_id": video_id,
201 "eurl": f"https://youtube.googleapis.com/v/{video_id}",
202 "el": el,
203 "sts": sts,
204 "hl": "en_US",
205 })
206 params = parse_qs(r.text)
207 if 'errorcode' in params: # status=fail
208 return None, None, None, 'malformed', params['reason'][0]
209
210 metadata = json.loads(params.get('player_response')[0])
211 playabilityStatus = metadata['playabilityStatus']['status']
212 if playabilityStatus != "OK":
213 playabilityReason = metadata['playabilityStatus'].get('reason',
214 '//'.join(metadata['playabilityStatus'].get('messages',[])))
215 player_error = f"{playabilityStatus}: {playabilityReason}"
216 if playabilityStatus == "UNPLAYABLE":
217 continue # try again with next el value (or fail as exhausted)
218 # without videoDetails, there's only the error message
219 maybe_metadata = metadata if 'videoDetails' in metadata else None
220 return None, None, maybe_metadata, 'player', player_error
221 if metadata['videoDetails'].get('isLive', False):
222 return None, None, metadata, 'livestream', None
223
224 if 'formats' not in metadata['streamingData']:
225 continue # no urls
226
227 formats = metadata['streamingData']['formats']
228 for (i,v) in enumerate(formats):
229 if not ('cipher' in v or 'signatureCipher' in v): continue
230 cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
231 formats[i]['url'] = unscramble(cipher, algo)
232
233 adaptive = metadata['streamingData']['adaptiveFormats']
234 for (i,v) in enumerate(adaptive):
235 if not ('cipher' in v or 'signatureCipher' in v): continue
236 cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
237 adaptive[i]['url'] = unscramble(cipher, algo)
238
239 stream_map = {'adaptive': adaptive, 'muxed': formats}
240
241 # todo: check if we have urls or try again
242 url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']
243
244 # ip-locked videos can be recovered if the proxy module is loaded:
245 is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None
246
247 return url, stream_map, metadata, is_geolocked, None
248 else:
249 return None, None, metadata, 'exhausted', player_error
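# (illustrative call, not from this file; callers unpack the five return values
#  and branch on the error types listed in the docstring:)
#
#   url, stream_map, metadata, error, errdetail = get_video_info(video_id)
#   if error == 'livestream': ...   # no muxed url, but metadata is usable
#   elif error in ('malformed', 'player', 'exhausted'): ...  # show errdetail
#   elif error == 'geolocked': ...  # url present, may need the proxy module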
250
251 def unscramble(cipher, algo): # test video id: UxxajLWwzqY
252 signature = list(cipher['s'][0])
253 for c in algo.split():
254 m = re.match(r"([rsw])(\d+)?", c)
255 if not m: continue # skip unknown tokens (previously a failed match raised AttributeError before the 'if not op' guard)
256 op, ix = m.group(1), int(m.group(2) or 0) % len(signature)
257 if op == 'r': signature = list(reversed(signature))
258 if op == 's': signature = signature[ix:]
259 if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
260 sp = cipher.get('sp', ['signature'])[0]
261 sig = cipher.get('sig', [''.join(signature)])[0]
262 return f"{cipher['url'][0]}&{sp}={sig}"
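# (worked example with a made-up algo string, presumably such strings are derived
#  from the player javascript by the caller; this only illustrates the r/s/w ops:)
#
#   unscramble({'s': ['abcdefg'], 'url': ['https://example.com/v']}, "r w2 s1")
#   r:  'abcdefg' -> 'gfedcba'   (reverse)
#   w2: 'gfedcba' -> 'efgdcba'   (swap positions 0 and 2)
#   s1: 'efgdcba' -> 'fgdcba'    (drop the first character)
#   => 'https://example.com/v&signature=fgdcba'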
263
264 def video_metadata(metadata):
265 if not metadata:
266 return {}
267
268 meta1 = metadata['videoDetails']
269 meta2 = metadata['microformat']['playerMicroformatRenderer']
270
271 published_at = meta2.get('liveBroadcastDetails',{}) \
272 .get('startTimestamp', f"{meta2['publishDate']}T00:00:00Z")
273
274 return {
275 'title': meta1['title'],
276 'author': meta1['author'],
277 'channel_id': meta1['channelId'],
278 'published': published_at,
279 'views': int(meta1['viewCount']),
280 'length': int(meta1['lengthSeconds']),
281 }
282
283 def prepare_metadata(metadata):
284 meta1 = metadata['videoDetails']
285 meta2 = metadata['microformat']['playerMicroformatRenderer']
286 cards = metadata['cards']['cardCollectionRenderer']['cards'] \
287 if 'cards' in metadata else []
288 endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
289 if 'endscreen' in metadata else []
290
291 # the actual video streams have exact information:
292 try:
293 sd = metadata['streamingData']
294 some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
295 aspect_ratio = some_stream['width'] / some_stream['height']
296 # if that's unavailable (e.g. on livestreams), fall back to
297 # thumbnails (only either 4:3 or 16:9).
298 except (KeyError, IndexError):
299 some_img = meta2['thumbnail']['thumbnails'][0]
300 aspect_ratio = some_img['width'] / some_img['height']
301
302 # Note: we could get subtitles in multiple formats directly by querying
303 # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
304 # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
305 # but that won't give us autogenerated subtitles (and is an extra request).
306 # we can still add &fmt= to the extracted URLs below (first one takes precedence).
307 try: # find the native language captions (assuming there is only 1 audioTrack) (any level might not exist):
308 default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
309 main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
310 except (KeyError, IndexError):
311 main_subtitle = -1
312 subtitles = sorted([
313 {'url':cc['baseUrl'],
314 'code':cc['languageCode'],
315 'autogenerated':cc.get('kind')=="asr",
316 'name':cc['name']['simpleText'],
317 'default':i==main_subtitle,
318 'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
319 for i,cc in enumerate(metadata.get('captions',{})
320 .get('playerCaptionsTracklistRenderer',{})
321 .get('captionTracks',[]))
322 # sort order: default lang gets weight 0 (first), other manually translated weight 1, autogenerated weight 2:
323 ], key=lambda cc: (not cc['default']) + cc['autogenerated'])
324
325 infocards = [parse_infocard(card) for card in cards]
326 endcards = [parse_endcard(card) for card in endsc]
327 # combine cards to weed out duplicates. for videos and playlists prefer
328 # infocards, for channels and websites prefer endcards, as those carry
329 # the more detailed information.
330 # if the card type is not in ident, we use the whole card for comparison
331 # (otherwise they'd all replace each other)
332 ident = { # ctype -> ident
333 'VIDEO': 'video_id',
334 'PLAYLIST': 'playlist_id',
335 'CHANNEL': 'channel_id',
336 'WEBSITE': 'url',
337 'POLL': 'question',
338 }
339 getident = lambda c: c['content'].get(ident.get(c['type']), c)
340 mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
341 exclude = lambda cards, without: [c for c in cards if getident(c) not in without]
342
343 allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
344 exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
345
346 all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
347 BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
348 CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
349 ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
350 GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
351 KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
352 ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
353 NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
354 RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
355 SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
356 VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
357 whitelisted = sorted(meta2.get('availableCountries',[]))
358 blacklisted = sorted(set(all_countries) - set(whitelisted))
359
360 return {
361 **video_metadata(metadata),
362 'description': meta1['shortDescription'],
363 'rating': meta1['averageRating'],
364 'category': meta2['category'],
365 'aspectr': aspect_ratio,
366 'unlisted': meta2['isUnlisted'],
367 'whitelisted': whitelisted,
368 'blacklisted': blacklisted,
369 'poster': meta2['thumbnail']['thumbnails'][0]['url'],
370 'infocards': infocards,
371 'endcards': endcards,
372 'all_cards': allcards,
373 'subtitles': subtitles,
374 }
375
376 def store_video_metadata(video_id):
377 # check if we know about it, and if not, fetch and store video metadata
378 with sqlite3.connect(cf['global']['database']) as conn:
379 c = conn.cursor()
380 c.execute("SELECT 1 from videos where id = ?", (video_id,))
381 new_video = len(c.fetchall()) < 1
382 if new_video:
383 _, _, meta, _, _ = get_video_info(video_id)
384 if meta:
385 meta = video_metadata(meta)
386 c.execute("""
387 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
388 VALUES (?, ?, ?, datetime(?), datetime(?))
389 """, (
390 video_id,
391 meta['channel_id'],
392 meta['title'],
393 meta['published'],
394 meta['published'],
395 ))
396 c.execute("""
397 INSERT OR REPLACE INTO channels (id, name)
398 VALUES (?, ?)
399 """, (meta['channel_id'], meta['author']))
400
401 from werkzeug.exceptions import NotFound
402 class NoFallbackException(NotFound): pass
403 def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
404 """
405 finds the next route that matches the current url rule, and executes it.
406 args, kwargs: pass all arguments of the current route
407 """
408 from flask import current_app, request, g
409
410 # build a list of endpoints that match the current request's url rule:
411 matching = [
412 rule.endpoint
413 for rule in current_app.url_map.iter_rules()
414 if rule.rule == request.url_rule.rule
415 ]
416 current = matching.index(request.endpoint)
417
418 # since we can't change request.endpoint, we always get the original
419 # endpoint back. so for repeated fall-throughs, we use the g object to
420 # count how many levels we want to fall through.
421 if '_fallback_next' not in g:
422 g._fallback_next = 0
423 g._fallback_next += 1
424
425 next_ep = current + g._fallback_next
426
427 if next_ep < len(matching):
428 return current_app.view_functions[matching[next_ep]](*args, **kwargs)
429 else:
430 raise NoFallbackException
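# (usage sketch; the decorated views below are hypothetical — the real consumers
#  are the frontend routes. two views are registered on the same rule, and the
#  first defers to the second by calling fallback_route:)
#
#   @app.route('/watch')
#   def watch_from_cache():
#       return cached_page() or fallback_route()
#
#   @app.route('/watch')
#   def watch_live():
#       ...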
431
432 def websub_url_hmac(key, feed_id, timestamp, nonce):
433 """ generate sha1 hmac, as required by websub/pubsubhubbub """
434 sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
435 return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()
436
437 def websub_body_hmac(key, body):
438 return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
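# (sketch of the intended verification of incoming websub POSTs, assuming the hub
#  sends the usual "X-Hub-Signature: sha1=<hexdigest>" header; the view context
#  and 'secret' below are illustrative only:)
#
#   sent = request.headers.get('X-Hub-Signature', '')
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, sent):
#       abort(403)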
439
440 def pp(*args): # debugging helper: pretty-print args to stderr (utf-8 safe)
441 from pprint import pprint
442 import sys, codecs
443 pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))