]> git.gir.st - subscriptionfeed.git/blob - app/common/common.py
subscription feed: filter shorts if the user enabled the 'noshorts' setting
[subscriptionfeed.git] / app / common / common.py
1 import os
2 import re
3 import json
4 import base64
5 import sqlite3
6 import requests
7 import hmac, hashlib
8 import requests_cache
9 import dateutil.parser
10 from xml.etree import ElementTree
11 from configparser import ConfigParser
12 from datetime import datetime, timezone
13 from urllib.parse import parse_qs, urlparse
14
# load the configuration; the YT_CONFIG environment variable overrides the
# default path. Read at import time, so every module sees the same config.
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    # ConfigParser.read() silently ignores missing files, so the only
    # indication of a bad path is the missing [global] section.
    raise Exception("Configuration file not found or empty")
20
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))
23
# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    """Drop expired responses from the requests cache, then re-arm a timer
    to run again in `sec` seconds (a self-rescheduling periodic task)."""
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    # daemonize so the timer never blocks interpreter shutdown.
    # (Thread.setDaemon() is deprecated since python 3.10; assign the
    # .daemon attribute instead.)
    t.daemon = True
    t.start()
purge_cache(10*60)
34
# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    # records (url, params, json-body, response-text) of every request made
    # through this session into flask.g.api_requests for later inspection.
    def request(self, method, url, params=None, data=None, json=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params=params, data=data, json=json, **kwargs
        )
        try:
            # g is only usable inside a flask application context; the
            # first request of a flask-request creates the list.
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, json, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
# make every consumer of requests (incl. requests_cache) use our subclass:
requests.Session = requests.sessions.Session = _NSASession
52
def fetch_xml(feed_type, feed_id):
    """Download the atom feed for one channel/playlist/user.

    feed_type: the query parameter name ('channel_id', 'playlist_id', ...)
    feed_id:   its value
    Returns the raw response body (bytes), or None on any HTTP error status.
    """
    # TODO: handle requests.exceptions.ConnectionError
    response = requests.get(
        "https://www.youtube.com/feeds/videos.xml",
        {feed_type: feed_id},
    )
    if response.ok:
        return response.content
    return None
62
def parse_xml(xmldata):
    """Parse a youtube atom feed (or websub tombstone) into plain values.

    Returns a 5-tuple (title, author, videos, channel_id, playlist_id).
    For tombstones (deleted/unlisted videos), videos is a single-element
    list [{'deleted': True, 'video_id': ...}] and all other fields are None.
    """
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: an ElementTree Element with no children evaluates as falsy, so
    # presence checks must use 'is not None'. (That is also why the old
    # ternaries "did not work" and a childless tombstone could be missed.)
    deleted = feed.find('at:deleted-entry',ns)
    if deleted is not None:
        # ref is of the form "yt:video:<id>"; keep only the id part:
        (_,_,vid) = deleted.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title',ns).text
    author_name = feed.find('atom:author/atom:name',ns)
    author = author_name.text if author_name is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    channel_elem = feed.find('yt:channelId',ns)
    channel_id = channel_elem.text if channel_elem is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    playlist_elem = feed.find('yt:playlistId',ns)
    playlist_id = playlist_elem.text if playlist_elem is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos, channel_id, playlist_id
100
def update_channel(db, xmldata, from_webhook=False):
    """Store all videos of a feed (and their channel/playlist rows) in the db.

    db: an open sqlite3 connection (committed at the end).
    xmldata: raw atom feed as returned by fetch_xml(); falsy -> returns False.
    from_webhook: True when called from the websub push endpoint.
    Returns True once the feed has been processed.
    """
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: Deletion events are not just fired for actual deletions,
            # but also for unlisting videos and livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't follow it.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        # only fetch extra metadata for videos we have not seen before:
        c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
            # video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video).
            # g_v_i gives is the date the video was published to viewers, so we
            # prefer that. But since g_v_i only returns the date without time,
            # we still use xmlfeed's date if it's the same date.
            published = dateutil.parser.parse(video['published'])
            # fallback values if the metadata fetch failed:
            length = None
            livestream = None
            premiere = None
            shorts = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']
                shorts = meta['shorts']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees it at the top of the feed, and it doesn't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else:#, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                shorts,
                published,
                timestamp
            ))
        else:
            # update video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True
208
def is_agegated(metadata):
    """Return True if the player response says the video is age-restricted."""
    status = metadata['playabilityStatus']
    if status.get("status") == "CONTENT_CHECK_REQUIRED":
        return True
    return bool(status.get("desktopLegacyAgeGateReason"))
215
def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        # user-supplied captcha cookies, sent along with the player request:
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
    today = datetime.now(timezone.utc).strftime("%Y%m%d")
    # XXX: anticaptcha hasn't been adapted
    key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
    # ANDROID returns streams that are not throttled or cipher-scambled, but less metadata than WEB.
    # TVHTML5* returns throttled and possibly ciphered streams, but bypasses age-gate. atm, we don't decipher them.
    # TODO: unscramble TVHTML5* streams (especially &n= throttling)
    # pick the innertube client by (metaOnly, _agegate_bypass):
    client = {
        (False, False): { 'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30},
        (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
        (True, False):  { 'clientName': 'WEB', 'clientVersion':f'2.{today}.01.01' },
    }[(metaOnly, _agegate_bypass)]
    r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
        'videoId': video_id,
        'context': {
            'client': {
                'gl': 'US',
                'hl': 'en',
                **client,
            },
            'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
        },
        "racyCheckOk": True, # seems to do nothing, cargo-culted
        "contentCheckOk": True, # fix "This video may be inappropriate for some users."
    }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

    if not r or r.status_code == 429:
        return None, None, None, 'banned', 'possible IP ban'

    metadata = r.json()
    if "error" in metadata:
        return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
    playabilityStatus = metadata['playabilityStatus']['status']
    if playabilityStatus != "OK":
        playabilityReason = metadata['playabilityStatus'].get('reason',
            '//'.join(metadata['playabilityStatus'].get('messages',[])))
        player_error = f"{playabilityStatus}: {playabilityReason}"
        if (is_agegated(metadata)
            and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
            and not _agegate_bypass
        ):
            # retry once with the age-gate-bypassing TVHTML5 client:
            _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
            if error_embed == "player": # agegate bypass failed?
                return None, None, metadata, 'agegated', player_error
            elif not error_embed or error_embed in ('livestream','geolocked','scrambled'):
                metadata = metadata_embed
            else:
                return None, None, metadata, error_embed, errormsg_embed
        else:
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error

    # livestreams have no adaptive/muxed formats:
    is_live = metadata['videoDetails'].get('isLive', False)

    if not 'formats' in metadata['streamingData'] and not is_live:
        return None, None, metadata, 'no-url', player_error

    formats = metadata['streamingData'].get('formats',[])
    adaptive = metadata['streamingData'].get('adaptiveFormats',[])
    stream_map = {
        'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
        'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
        'muxed': formats,
        'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
    }

    try:
        # best quality = greatest height:
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
    except:
        url = None
        is_geolocked = False

    is_drm = formats and 'signatureCipher' in formats[0]

    # nonfatal errors: the caller may still be able to do something useful
    # with the metadata / stream map.
    nonfatal = 'livestream' if is_live \
        else 'geolocked' if is_geolocked \
        else 'scrambled' if is_drm \
        else None

    return url, stream_map, metadata, nonfatal, None
310
def video_metadata(metadata):
    """Extract the fields we store from a raw player-API response.

    metadata: the player_response dict from get_video_info(); falsy -> {}.
    Returns a dict with title, author, channel_id, published (ISO-8601, UTC),
    views, length (seconds or None), aspect (w/h ratio), livestream,
    premiere, and shorts (True/False/None if undeterminable).
    """
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # With ANDROID player API, we don't get microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it. meta1 and meta2 use a
    # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer.
    length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        # scheduledStartTime is a unix timestamp; format it in UTC, since we
        # append 'Z'. (previously used naive local time with a 'Z' suffix.)
        scheduled_time = datetime.fromtimestamp(int(scheduled_time), timezone.utc) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}) .get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # the actual video streams have exact information:
    # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later)
    except (KeyError, IndexError, TypeError, ZeroDivisionError):
        aspect_ratio = None

    # shorts are <= 60 seconds and vertical or square. if we were unable to
    # determine it, we set it to None.
    is_short = (
        None if length is None and aspect_ratio is None else
        True if ((length or 61) <= 60) and ((aspect_ratio or 2) <= 1) else
        False # length > 60 or aspect_ratio > 1
    )

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'aspect': aspect_ratio or 16/9,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
        'shorts': is_short,
    }
369
def store_video_metadata(video_id):
    """Fetch and store metadata for video_id, unless it is already known.
    Opens its own db connection (committed by the 'with' block on success)."""
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # metaOnly=True: we only need the metadata, not stream urls
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                # 'crawled' is set to the publish date, since we have no
                # better first-seen timestamp here:
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['shorts'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))
398
def fetch_video_flags(token, video_ids):
    """Split video_ids into the lists the user has pinned and hidden.

    token: the user's identifier in the flags table.
    Returns (pinned, hidden) — two lists of video ids.
    """
    with sqlite3.connect(cf['global']['database']) as conn:
        cur = conn.cursor()
        # one placeholder per requested video id:
        placeholders = ",".join(["?"]*len(video_ids))
        cur.execute("""
            SELECT video_id,display
            FROM flags
            WHERE user = ?
              AND display IS NOT NULL
              AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(placeholders), (token,*video_ids))
        rows = cur.fetchall()

        pinned = [vid for vid, disp in rows if disp == 'pinned']
        hidden = [vid for vid, disp in rows if disp == 'hidden']

        return pinned, hidden
415
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    raises NoFallbackException (a 404) when no further matching route exists.
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
446
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    message = f"{feed_id}:{timestamp}:{nonce}"
    digest = hmac.new(key.encode('ascii'), message.encode('ascii'), hashlib.sha1)
    return digest.hexdigest()
451
def websub_body_hmac(key, body):
    """sha1 hmac over the raw POST body (websub X-Hub-Signature check)."""
    digest = hmac.new(key.encode('ascii'), body, hashlib.sha1)
    return digest.hexdigest()
454
def flask_logger(msg, level="warning"):
    """Log msg through the current flask app's logger; silently a no-op
    outside a flask app context, so it is safe to call from plain scripts.

    level: case-insensitive level name; unknown names fall back to NOTSET.
    """
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    # was a bare 'except:', which also swallowed SystemExit/KeyboardInterrupt:
    except Exception:
        pass # not within a flask app context (e.g. utils.py)
469
def pp(*args):
    """debugging helper: pretty-print all arguments to stderr (utf-8 safe)."""
    import sys, codecs
    from pprint import pprint
    # wrap stderr's raw buffer so non-ascii output can't raise on exotic locales:
    writer = codecs.getwriter("utf-8")(sys.stderr.buffer)
    pprint(args, stream=writer)
Imprint / Impressum