app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but a longer expiry makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,), allowable_methods=('GET', 'HEAD', 'POST'))

# Note: requests-cache doesn't use redis expiry, so we need this in all backends:
# https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # Timer.setDaemon() is deprecated since Python 3.10
    t.start()
purge_cache(10*60)

# For debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response
# body). A flask error_handler can then include the request data in the
# error log. Since requests are also issued from outside the flask app
# context (e.g. from utils.py), the access to g is wrapped in a try/except
# block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

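# Illustrative sketch (not part of this module): how a frontend's flask
# error_handler might consume g.api_requests as collected by _NSASession
# above. The handler and 'app' are assumptions; only g.api_requests and its
# (url, params, response_text) tuples come from this file.
#
# @app.errorhandler(Exception)
# def internal_server_error(e):
#     for url, params, response_text in g.get('api_requests', []):
#         app.logger.error(f"request: {url} params={params}\n{response_text[:500]}")
#     return "internal server error", 500
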
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: Elements without children are falsy, so find() results must be
    # compared against None explicitly (a plain truthiness test, or a ternary
    # built on one, silently fails):
    if feed.find('at:deleted-entry', ns) is not None:
        (_, _, vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for the /user/<> endpoint: find out the UC-id;
    # for playlists: this is who created the playlist:
    el = feed.find('yt:channelId', ns)
    channel_id = el.text if el is not None else None
    # for pullsub: if this exists, we're looking at a playlist:
    el = feed.find('yt:playlistId', ns)
    playlist_id = el.text if el is not None else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author/atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id

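# Illustrative usage (a sketch, not in the original file; the channel id is a
# placeholder): fetch a channel's feed and parse it.
#
# xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
# if xmldata:
#     title, author, videos, channel_id, playlist_id = parse_xml(xmldata)
#     for v in videos:
#         print(v['video_id'], v['published'], v['title'])
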
def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, author, videos, channel, playlist = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            # Note: deletion events are not only fired for actual deletions,
            # but also for unlisted videos and for livestreams that just ended
            # (even postLiveDVR ones). Hence, we don't act on them.
            flask_logger(f"ignoring deleted/unlisted/ended video/stream {video['video_id']}")
            break

        c.execute("SELECT 1 FROM videos WHERE id=?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if new_video:
            # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
            _, _, meta, _, _ = get_video_info(video['video_id'], metaOnly=True)
            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day
            # B; the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). g_v_i
            # gives us the date the video was published to viewers, so we prefer
            # that. But since g_v_i only returns the date without a time, we
            # still use the xmlfeed's timestamp if both fall on the same date.
            published = dateutil.parser.parse(video['published'])
            length = None
            livestream = None
            premiere = None
            if meta:
                meta = video_metadata(meta)
                published2 = dateutil.parser.parse(meta['published'])
                if published < published2: # g_v_i's date is more accurate:
                    published = published2
                length = meta['length']
                livestream = meta['livestream']
                premiere = meta['premiere']

            now = datetime.now(timezone.utc)

            # we pretend that all videos uploaded this week were uploaded just
            # now, so the user sees them at the top of the feed and they don't
            # get inserted somewhere further down.
            if (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

            c.execute("""
                INSERT OR IGNORE INTO videos
                    (id, channel_id, title, length, livestream, premiere, published, crawled)
                VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
            """, (
                video['video_id'],
                video['channel_id'],
                video['title'],
                length,
                livestream,
                premiere,
                published,
                timestamp
            ))
        else:
            # update the video title (everything else can't change)
            c.execute("""
                UPDATE OR IGNORE videos
                SET title = ?
                WHERE id = ?
            """, (
                video['title'],
                video['video_id'],
            ))

        # for channels, this is obviously always the same, but playlists can
        # consist of videos from different channels:
        if i == 0 or playlist:
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

        # keep track of which videos are in a playlist, so we can show the user
        # why a video is in their feed:
        if playlist:
            c.execute("""
                INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
                VALUES (?, ?)
            """, (video['video_id'], playlist))

    if playlist and not from_webhook: # Note: playlists can't get updated via websub
        c.execute("""
            INSERT OR REPLACE INTO playlists (id, name, author)
            VALUES (?, ?, ?)
        """, (playlist, title, channel))
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel, author))

    db.commit()

    return True

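# Illustrative usage (a sketch; the feed id is a placeholder): a pull-style
# refresh that feeds fetch_xml's result into update_channel. The database
# path comes from the config parsed at module load.
#
# with sqlite3.connect(cf['global']['database']) as db:
#     if not update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')):
#         flask_logger("failed to update channel", "error")
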
def is_agegated(metadata):
    playabilityStatus = metadata['playabilityStatus']
    return bool(
        playabilityStatus.get("status") == "CONTENT_CHECK_REQUIRED"
        or playabilityStatus.get("desktopLegacyAgeGateReason")
    )

def get_video_info(video_id, *, metaOnly=False, _agegate_bypass=False):
    """
    returns: best-quality muxed video stream, stream map, player_response, error-type, error-message
    error types: player, malformed, agegated, no-url, banned, exhausted;
    nonfatal: livestream, geolocked, scrambled
    """
    player_error, metadata = None, None # for 'exhausted'
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT * FROM captcha_cookies")
        cookies = dict(c.fetchall())
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        # XXX: anticaptcha hasn't been adapted
        key = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"
        # ANDROID returns streams that are neither throttled nor cipher-scrambled, but less metadata than WEB.
        # TVHTML5* returns throttled and possibly ciphered streams, but bypasses the age-gate. atm, we don't decipher them.
        # TODO: unscramble TVHTML5* streams (especially &n= throttling)
        client = {
            (False, False): { 'clientName': 'ANDROID', 'clientVersion': '17.31.35', 'androidSdkVersion': 30 },
            (False, True):  { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0' },
            (True, False):  { 'clientName': 'WEB', 'clientVersion': f'2.{today}.01.01' },
        }[(metaOnly, _agegate_bypass)]
        r = requests.post("https://www.youtube-nocookie.com/youtubei/v1/player", params={'key': key}, json={
            'videoId': video_id,
            'context': {
                'client': {
                    'gl': 'US',
                    'hl': 'en',
                    **client,
                },
                'thirdParty': {'embedUrl': 'https://www.youtube.com/'}
            },
            "racyCheckOk": True, # seems to do nothing, cargo-culted
            "contentCheckOk": True, # fixes "This video may be inappropriate for some users."
        }, cookies=cookies, headers={"User-Agent": "com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip"})

        if not r or r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        metadata = r.json()
        if "error" in metadata:
            return None, None, metadata, "malformed", metadata.get("error",{}).get("message","")
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if (is_agegated(metadata)
                and not metaOnly # only need metadata (e.g. called from pubsubhubbub)
                and not _agegate_bypass
            ):
                _, _, metadata_embed, error_embed, errormsg_embed = get_video_info(video_id, _agegate_bypass=True)
                if error_embed == "player": # agegate bypass failed?
                    return None, None, metadata, 'agegated', player_error
                elif not error_embed or error_embed in ('livestream','geolocked'):
                    metadata = metadata_embed
                else:
                    return None, None, metadata, error_embed, errormsg_embed
            else:
                # without videoDetails, there's only the error message
                maybe_metadata = metadata if 'videoDetails' in metadata else None
                return None, None, maybe_metadata, 'player', player_error

        # livestreams have no adaptive/muxed formats:
        is_live = metadata['videoDetails'].get('isLive', False)

        if 'formats' not in metadata['streamingData'] and not is_live:
            return None, None, metadata, 'no-url', player_error

        formats = metadata['streamingData'].get('formats',[])
        adaptive = metadata['streamingData'].get('adaptiveFormats',[])
        stream_map = {
            'adaptive_video': [a for a in adaptive if a['mimeType'].startswith('video/')],
            'adaptive_audio': [a for a in adaptive if a['mimeType'].startswith('audio/')],
            'muxed': formats,
            'hlsManifestUrl': metadata['streamingData'].get('hlsManifestUrl'),
        }

        try:
            url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

            # ip-locked videos can be recovered if the proxy module is loaded:
            is_geolocked = 'gcr' in parse_qs(urlparse(url).query)
        except (IndexError, KeyError): # no muxed formats, or scrambled ones without 'url'
            url = None
            is_geolocked = False

        is_drm = formats and 'signatureCipher' in formats[0]

        nonfatal = 'livestream' if is_live \
            else 'geolocked' if is_geolocked \
            else 'scrambled' if is_drm \
            else None

        return url, stream_map, metadata, nonfatal, None

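# Illustrative usage (a sketch; the video id is a placeholder): the fourth
# return value distinguishes nonfatal conditions, which still come with
# usable metadata, from fatal errors, which may not.
#
# url, stream_map, metadata, error, errdetails = get_video_info('aqz-KE-bpKQ')
# if error in (None, 'livestream', 'geolocked', 'scrambled'):
#     meta = video_metadata(metadata) # playable, or at least describable
# else:
#     flask_logger(f"unplayable: {error} ({errdetails})", "error")
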
def video_metadata(metadata):
    if not metadata:
        return {}

    meta1 = metadata['videoDetails']
    # with the ANDROID player API, we don't get the microformat => no publishDate!
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    # sometimes, we receive the notification so early that the length is not
    # yet populated. Nothing we can do about it.
    length = int(meta2.get('lengthSeconds',0)) or int(meta1.get('lengthSeconds',0)) or None

    scheduled_time = metadata.get('playabilityStatus',{}) \
        .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \
        .get('offlineSlate',{}).get('liveStreamOfflineSlateRenderer',{}) \
        .get('scheduledStartTime')
    if scheduled_time:
        scheduled_time = datetime.fromtimestamp(int(scheduled_time)) \
            .strftime("%Y-%m-%dT%H:%M:%SZ")
    published_at = (
        meta2.get('liveBroadcastDetails',{}).get('startTimestamp') or
        scheduled_time or
        f"{meta2.get('publishDate','1970-01-01')}T00:00:00Z"
    )

    # Note: 'premiere' videos have livestream=False, and published= will be the
    # start of the premiere.
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'published': published_at,
        'views': int(meta1['viewCount']),
        'length': length,
        'livestream': meta1['isLiveContent'],
        'premiere': meta1.get('isUpcoming') and not meta1['isLiveContent'],
    }

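# For reference (a sketch; all values made up), the returned dict is shaped
# like this:
#
# {'title': 'Big Buck Bunny', 'author': 'Blender',
#  'channel_id': 'UCxxxxxxxxxxxxxxxxxxxxxx',
#  'published': '2014-11-10T00:00:00Z', 'views': 12345678,
#  'length': 635, 'livestream': False, 'premiere': False}
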
def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, _, meta, _, _ = get_video_info(video_id, metaOnly=True)
            if meta:
                meta = video_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, published, crawled)
                    VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['length'],
                    meta['livestream'],
                    meta['premiere'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

def fetch_video_flags(token, video_ids):
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("""
            SELECT video_id, display
            FROM flags
            WHERE user = ?
            AND display IS NOT NULL
            AND video_id IN ({})
            -- AND display = 'pinned'
        """.format(",".join(["?"]*len(video_ids))), (token, *video_ids))
        flags = c.fetchall()
        pinned = [video for video, disp in flags if disp == 'pinned']
        hidden = [video for video, disp in flags if disp == 'hidden']

    return pinned, hidden

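# Illustrative usage (a sketch; 'token' identifies the user, 'videos' is a
# parsed feed as produced by parse_xml):
#
# pinned, hidden = fetch_video_flags(token, [v['video_id'] for v in videos])
# visible = [v for v in videos if v['video_id'] not in hidden]
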
from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

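# Illustrative usage (a sketch; the routes, names and condition are made up,
# and 'app' and flask's redirect are assumed to be in scope): two view
# functions registered on the same rule, where the first defers to the one
# registered after it.
#
# @app.route('/watch')
# def watch_local():
#     if not can_play_locally(): # hypothetical condition
#         return fallback_route()
#     return render_player()
#
# @app.route('/watch')
# def watch_redirect():
#     return redirect("https://www.youtube.com/watch", code=307)
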
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

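# Illustrative usage (a sketch; 'secret' and flask's request/abort are assumed
# to be in scope): verifying a websub notification. The hub signs the raw POST
# body with the shared secret and sends 'sha1=<hexdigest>' in the
# X-Hub-Signature header.
#
# expected = "sha1=" + websub_body_hmac(secret, request.get_data())
# if not hmac.compare_digest(expected, request.headers.get('X-Hub-Signature', '')):
#     abort(403)
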
def flask_logger(msg, level="warning"):
    level = dict(
        CRITICAL=50,
        ERROR=40,
        WARNING=30,
        INFO=20,
        DEBUG=10,
        NOTSET=0,
    ).get(level.upper(), 0)
    try:
        from flask import current_app
        current_app.logger.log(level, msg)
    except RuntimeError:
        pass # not within flask app context

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))