import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since we also make requests from outside the flask appcontext, it is
# wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
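
# A hedged sketch of the error_handler mentioned above (not part of this
# file; 'app' and the log format are illustrative):
#
#   @app.errorhandler(500)
#   def internal_error(e):
#       for url, params, json_body, response in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {response[:200]}")
#       return "internal error", 500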

class G:
    """
    null-coalescing version of dict.get() that also works on lists.

    the | operator is overloaded to achieve similar looking code to jq(1)
    filters. the first found key is used: dict(foo=1)|G('bar','foo') returns 1.
    """
    def __init__(self, *keys):
        self.keys = keys
    def __ror__(self, other):
        for key in self.keys:
            try: return other[key]
            except (TypeError, LookupError): continue # None, or key/index missing
        return None
class _Text:
    """ parses youtube's .runs[].text and .simpleText variants """
    def __ror__(self, other): # Note: only returning runs[0], not concat'ing all!
        return other|G('simpleText') or other|G('runs')|G(0)|G('text')
text = _Text()
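
# Chaining sketch on an innertube-style dict (the 'videoRenderer' value below
# is made up, not real API output):
#
#   item = {'videoRenderer': {'title': {'runs': [{'text': 'a video'}]}}}
#   item|G('videoRenderer')|G('title')|text       # -> 'a video'
#   item|G('gridVideoRenderer')|G('title')|text   # -> None (key missing)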

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy, so find() results must be
    # compared against None explicitly; a plain truthiness check or ternary
    # silently misbehaves on childless elements.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for /user/<> endpoint: find out UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId',ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId',ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id
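
# Minimal usage sketch (the feed id below is a placeholder UC-id):
#
#   xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
#   if xmldata:
#       title, author, videos, channel_id, playlist_id = parse_xml(xmldata)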

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow it.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong
            # (e.g.: a video gets uploaded as unlisted on day A and set to
            # public on day B; the webhook is sent on day B, but 'published'
            # says A. The video therefore looks like it's just an update to an
            # older video). g_v_i gives us the date the video was published to
            # viewers, so we prefer that. But since g_v_i only returns the date
            # without time, we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
    today = datetime.now(timezone.utc).strftime("%Y%m%d")
    # XXX: anticaptcha hasn't been adapted
    key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
    # ANDROID returns streams that are not throttled or cipher-scrambled, but
    # less metadata than WEB. TVHTML5* returns throttled and possibly ciphered
    # streams, but bypasses the age-gate. atm, we don't decipher them.
    # TODO: unscramble TVHTML5* streams (especially &n= throttling)
    client = {
        (False, False): {'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30},
        (False, True):  {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0'},
        (True, False):  {'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01'},
    }[(metaOnly, _agegate_bypass)]
    r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
        'videoId': video_id,
        'context': {
            'client': {
                'gl': 'US',
                'hl': 'en',
                **client,
            },
            'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
        },
        "racyCheckOk": True, # seems to do nothing, cargo-culted
        "contentCheckOk": True, # fix "This video may be inappropriate for some users."
    }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

    if not r or r.status_code == 429:
        return None, None, None, 'banned', 'possible IP ban'

    metadata = r.json()
    if "error" in metadata:
        return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
    playabilityStatus = metadata['playabilityStatus']['status']
    if playabilityStatus != "OK":
        playabilityReason = metadata['playabilityStatus'].get('reason',
            '//'.join(metadata['playabilityStatus'].get('messages',[])))
        player_error = f"{playabilityStatus}: {playabilityReason}"
        if (is_agegated(metadata)
            and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
            and not _agegate_bypass
        ):
            _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
            if error_embed == "player": # agegate bypass failed?
                return None, None, metadata, 'agegated', player_error
            elif not error_embed or error_embed in ('livestream','geolocked','scrambled'):
                metadata = metadata_embed
            else:
                return None, None, metadata, error_embed, errormsg_embed
        else:
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

    # livestreams have no adaptive/muxed formats:
    is_live = metadata['videoDetails'].get('isLive', False)

    if 'formats' not in metadata['streamingData'] and not is_live:
        return None, None, metadata, 'no-url', player_error

    formats = metadata['streamingData'].get('formats',[])
    adaptive = metadata['streamingData'].get('adaptiveFormats',[])
    stream_map = {
        'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
        'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
        'muxed': formats,
        'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
    }

    try:
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
    except (IndexError, KeyError): # no muxed formats, or no direct 'url'
        url = None
        is_geolocked = False

    is_drm = formats and 'signatureCipher' in formats[0]

    nonfatal = 'livestream' if is_live \
        else 'geolocked' if is_geolocked \
        else 'scrambled' if is_drm \
        else None

    return url, stream_map, metadata, nonfatal, None
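
# Caller-side sketch ('some_video_id' is a placeholder):
#
#   url, stream_map, meta, error, errdetails = get_video_info(some_video_id)
#   if error in (None, 'livestream', 'geolocked', 'scrambled'): # non-fatal
#       ... play url / stream_map ...
#   else: # 'banned', 'malformed', 'agegated', 'player', 'no-url', ...
#       ... report error and errdetails ...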

def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With the ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use different
    # rounding strategies; meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        # interpret the unix timestamp as UTC, to match the 'Z' suffix:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later):
    except (IndexError, KeyError):
        aspect_ratio = None

    # shorts are <= 60 seconds and vertical or square. if we could determine
    # neither length nor aspect ratio, it's None.
    is_short = (
        None if length is None and aspect_ratio is None else
        True if ((length or 61) <= 60) and ((aspect_ratio or 2) <= 1) else
        False # length > 60 or aspect_ratio > 1
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
        'shorts': is_short,
    }
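
# Worked examples for the shorts heuristic above (made-up values): a 30s
# 608x1080 stream gives aspect_ratio ~0.56 -> shorts=True; a 30s 1920x1080
# stream gives ~1.78 -> shorts=False; a scheduled livestream with no length
# and no streams gives shorts=None.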

def mkthumbs(thumbs):
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = next(iter(sorted(output.keys(), reverse=True, key=int)), None)
    return {**output, 'largest': largest}
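
# e.g. (hypothetical thumbnail list):
#   mkthumbs([{'height': 90, 'url': 'u1'}, {'height': 360, 'url': 'u2'}])
#   # -> {'90': 'u1', '360': 'u2', 'largest': '360'}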

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

    return pinned, hidden

def apply_video_flags(token, rows):
    video_ids = [card['content']['video_id'] for card in rows if 'video_id' in card['content']]
    pinned, hidden = fetch_video_flags(token, video_ids)
    return sorted([
        {'type': v['type'], 'content': {
            **v['content'],
            'pinned': (v['content']['video_id'] in pinned
                       if 'video_id' in v['content'] else False),
        }}
        for v in rows
        if 'video_id' not in v['content'] or v['content']['video_id'] not in hidden
    ], key=lambda v: v['content']['pinned'], reverse=True)
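
# e.g. (hypothetical flag state): with pinned=['b'] and hidden=['c'], rows for
# videos a,b,c come back as [b (pinned=True), a (pinned=False)]; c is dropped
# and b is sorted to the front.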

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
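
# Usage sketch with two views registered on the same rule (e.g. by different
# blueprints; names here are hypothetical):
#
#   @app.route('/watch')
#   def local_watch():
#       if not can_play_locally(): return fallback_route()
#       ...
#
#   @app.route('/watch')
#   def redirect_watch(): # tried when local_watch() falls through
#       ...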

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
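
# Verification sketch for a websub notification ('secret' is whatever was
# sent with the subscription request; the hub signs the raw body and sends
# 'sha1=<hexdigest>' in the X-Hub-Signature header):
#
#   expected = "sha1=" + websub_body_hmac(secret, request.get_data())
#   if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#       ... reject with 403 ...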

def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError: # not within the flask appcontext
        pass

def log_unknown_card(data):
    try:
        from flask import request
        source = request.url
    except RuntimeError: # not within a flask request
        source = "unknown"
    with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f:
        f.write(f"\n/***** {source} *****/\n")
        json.dump(data, f, indent=2)