import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # don't block process exit
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in the flask request's g object (url and response). a flask
# error_handler can then include the request data in the error log (a sketch
# follows below). since we also call this code from outside the flask
# appcontext, the access to g is wrapped in a try-except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

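# a minimal sketch of an error handler consuming g.api_requests (the real
# handler lives in the flask app, not in this module; names are illustrative):
#
#   @app.errorhandler(Exception)
#   def internal_error(e):
#       for url, params, json_body, response_text in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {response_text[:200]}")
#       return "internal server error", 500
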
class G:
    """
    null-coalescing version of dict.get() that also works on lists.

    the | operator is overloaded to achieve similar looking code to jq(1)
    filters. the first found key is used: dict(foo=1)|G('bar','foo') returns 1.
    """
    def __init__(self, *keys):
        self.keys = keys
    def __ror__(self, other):
        for key in self.keys:
            try: return other[key]
            except (KeyError, IndexError, TypeError): continue # missing key/index, or other is None
        return None
class _Text:
    """ parses youtube's .runs[].text and .simpleText variants """
    def __ror__(self, other): # Note: only returning runs[0], not concat'ing all!
        return other|G('simpleText') or other|G('runs')|G(0)|G('text')
text = _Text()

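# illustrative example (not part of the module): G and text traverse youtube's
# deeply nested JSON without raising on missing keys:
#
#   >>> j = {'title': {'runs': [{'text': 'cool video'}]}, 'viewCount': {}}
#   >>> j|G('title')|text
#   'cool video'
#   >>> j|G('viewCount')|G('simpleText') # missing key => None, not KeyError
#   >>> j|G('doesNotExist')|G(0)|text    # chains stay None-safe all the way
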
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

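# e.g. fetch_xml("channel_id", "UC...") or fetch_xml("playlist_id", "PL...")
# (illustrative ids); returns the raw atom feed bytes, or None on http errors.
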
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    del_entry = feed.find('at:deleted-entry',ns)
    if del_entry is not None: # Note: elements without children are falsy, so compare against None!
        del_author = del_entry.find('at:by',ns)
        _, _, vid = del_entry.get('ref').rpartition(':')
        _, _, channel_id = del_author.find('atom:uri',ns).text.rpartition('/')
        author = del_author.find('atom:name',ns).text
        entry = [{
            'deleted': True,
            'video_id': vid,
            'channel_id': channel_id,
            'author': author,
        }]
        return None, None, entry, None, None

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    try: channel_id = feed.find('yt:channelId',ns).text
    except AttributeError: channel_id = None # element not present
    # for pullsub: if this exists, we're looking at a playlist:
    try: playlist_id = feed.find('yt:playlistId',ns).text
    except AttributeError: playlist_id = None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id

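# parse_xml returns (title, author, videos, channel_id, playlist_id); for
# deletion events, only the videos list is populated, e.g. (illustrative values):
#   (None, None, [{'deleted': True, 'video_id': 'dQw4w9WgXcQ',
#                  'channel_id': 'UC...', 'author': '...'}], None, None)
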
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow them.
            flask_logger(f"ignoring deleted/unlisted video or ended livestream {video['video_id']} by {video['channel_id']} ({video['author']})")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # the video gets uploaded as unlisted on day A and set to public on
            # day B; the webhook is sent on day B, but 'published' says A. The
            # video therefore looks like it's just an update to an older video).
            # g_v_i gives us the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed, and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

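# e.g. is_agegated({'playabilityStatus': {'status': 'CONTENT_CHECK_REQUIRED'}})
# => True (trimmed-down payload for illustration; real responses carry more fields)
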
def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" if metaOnly or _agegate_bypass else "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w"
        # ANDROID returns streams that are not throttled or cipher-scrambled, but less metadata than WEB.
        # TVHTML5* returns throttled and possibly ciphered streams, but bypasses the age-gate. atm, we don't decipher them.
        # TODO: unscramble TVHTML5* streams (especially &n= throttling)
        client = {
            (False, False): { 'clientName': 'ANDROID', 'clientVersion': '18.11.34', 'androidSdkVersion': 30 },
            (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
            (True, False):  { 'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01' },
        }[(metaOnly, _agegate_bypass)]
        r = requests.post("https://youtubei.googleapis.com/youtubei/v1/player", params={'key': key}, json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    **client,
                },
                'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
            },
            "racyCheckOk": True,      # seems to do nothing, cargo-culted
            "contentCheckOk": True,   # fix "This video may be inappropriate for some users."
            "params": "CgIQBg%3D%3D", # otherwise googlevideo URLs become 403/Forbidden after a few accesses (breaks buffering/scrubbing)
        }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/18.11.34 (Linux; U; Android 11) gzip"})

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        real_vid = metadata.get("videoDetails", {}).get("videoId")
        if video_id != real_vid and real_vid in ("M5t4UHllkUM", "aQvGIIdgFDM"):
            # youtube redirected us to a clip called "Video Not Available". this indicates a long-term ip ban.
            return None, None, {}, "banned", "instance is probably ip banned"
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if (is_agegated(metadata)
                and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
                and not _agegate_bypass
            ):
                _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
                if error_embed == "player": # agegate bypass failed?
                    return None, None, metadata, 'agegated', player_error
                elif not error_embed or error_embed in ('livestream','geolocked','scrambled','throttled'):
                    metadata = metadata_embed
                else:
                    return None, None, metadata, error_embed, errormsg_embed
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        stream_map = {
            'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
            'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        try:
            url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

            query = parse_qs(urlparse(url).query)
            # ip-locked videos can be recovered if the proxy module is loaded:
            is_geolocked = 'gcr' in query
            # "n-signature" requires javascript descrambling (not implemented):
            is_throttled = 'ns' in query
        except (IndexError, KeyError): # no muxed formats, or formats without height/url
            url = None
            is_geolocked = False
            is_throttled = False

        is_drm = formats and 'signatureCipher' in formats[0]

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else 'scrambled' if is_drm \
            else 'throttled' if is_throttled \
            else None

        return url, stream_map, metadata, nonfatal, None

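# illustrative use of the returned tuple (the real callers live elsewhere):
#
#   url, stream_map, meta, error, errdetails = get_video_info('dQw4w9WgXcQ')
#   if error in (None, 'livestream', 'geolocked', 'scrambled', 'throttled'):
#       pass # non-fatal: metadata (and possibly url) are usable
#   else:
#       pass # 'banned', 'malformed', 'player', 'agegated', 'no-url': no stream
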
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With the ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use a
    # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    views = int(meta1['viewCount']) if 'viewCount' in meta1 else None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ") # UTC, to match the 'Z' suffix
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        meta2.get('publishDate','1970-01-01T00:00:00Z')
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later)
    except (KeyError, IndexError):
        aspect_ratio = None

    is_livestream = meta1['isLiveContent']
    is_premiere = meta1.get('isUpcoming', False) and not is_livestream
    # shorts are <= 60 seconds and vertical or square. they can't be premieres
    # or livestreams. if we were unable to determine it, we set it to None.
    is_short = (
        True if (length or 61) <= 60 and (aspect_ratio or 2) <= 1 else
        False if (length or 0) > 60 or (aspect_ratio or 0) > 1 else
        None if not is_premiere and not is_livestream else False
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': views,
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': is_livestream,
        'premiere': is_premiere,
        'shorts': is_short,
    }

def mkthumbs(thumbs):
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = next(iter(sorted(output.keys(), reverse=True, key=int)), None)
    return {**output, 'largest': largest}

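# e.g. mkthumbs([{'height': 90, 'url': 'a.jpg'}, {'height': 360, 'url': 'b.jpg'}])
# => {'90': 'a.jpg', '360': 'b.jpg', 'largest': '360'}
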
def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video,disp in flags if disp == 'pinned']
        hidden = [video for video,disp in flags if disp == 'hidden']

    return pinned, hidden

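# e.g. pinned, hidden = fetch_video_flags(user_token, ['abc123xyz00'])
# (illustrative token/ids; 'pinned' and 'hidden' are the two display values
# this code cares about)
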
def apply_video_flags(token, rows, settings={}):
    video_ids = [card['content']['video_id'] for card in rows if 'video_id' in card['content']]
    pinned, hidden = fetch_video_flags(token, video_ids)
    noshorts = settings.get('noshorts') or False
    return sorted([
        {'type': v['type'], 'content': {**v['content'], 'pinned': v['content']['video_id'] in pinned if 'video_id' in v['content'] else False}}
        for v in rows
        if (
            'video_id' not in v['content'] or v['content']['video_id'] not in hidden
        ) and (
            not (noshorts and v['content'].get('shorts'))
        )
    ], key=lambda v: v['content']['pinned'], reverse=True)

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

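# illustrative use, assuming two blueprints register the same rule (names are
# hypothetical; the real routes live in the frontend modules):
#
#   @bp1.route('/watch')
#   def watch():
#       if not have_data: return fallback_route()
#       ...
#   @bp2.route('/watch') # tried next, in url_map order
#   def watch_fallback():
#       ...
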
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

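# illustrative verification of a websub callback body (header name and
# "sha1=" prefix per the websub spec; compare_digest avoids timing side
# channels; removeprefix needs python >= 3.9):
#
#   theirs = request.headers.get('X-Hub-Signature', '').removeprefix('sha1=')
#   ours = websub_body_hmac(secret_key, request.get_data())
#   ok = hmac.compare_digest(ours, theirs)
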
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError:
        pass # not within a flask app context

def log_unknown_card(data):
    import json
    try:
        from flask import request
        source = request.url
    except RuntimeError: source = "unknown" # not within a flask request context
    with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f:
        f.write(f"\n/***** {source} *****/\n")
        json.dump(data, f, indent=2)