# app/common/common.py
import os
import re
import json
import base64
import sqlite3
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but caching them that long makes reddit very stale and premiere
# videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
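    # periodically evict expired entries from the requests_cache backend;
    # re-arms itself every 'sec' seconds on a daemon timer thread.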
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey-patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response
# text). we can then use a flask error_handler to include the request data
# in the error log.
# since this module is also used outside the flask appcontext, the access to
# g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

def fetch_xml(feed_type, feed_id):
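    # feed_type is the query parameter name understood by the youtube feed
    # endpoint (e.g. 'channel_id' or 'playlist_id'), feed_id its value.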
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            if from_webhook: current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        # checking if it was posted this week is necessary during xmlfeed pulling.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
            if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove
        else: # otherwise, it might just be an update to an older video, or a previously unlisted one.
            # first, assume it's an older video (correct when pulling xmlfeeds)
            timestamp = published
            # then, check whether we don't know about it yet; if so, look up the real date.

            # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
            # a video gets uploaded as unlisted on day A and set to public on day B;
            # the webhook is sent on day B, but 'published' says A. The video
            # therefore looks like it's just an update to an older video). If
            # that's the case, we call get_video_info and double-check.
            # We only need to do this for videos that are not yet in the database.
            c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
            new_video = len(c.fetchall()) < 1
            if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove
            if from_webhook and new_video:
                if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove
                _, meta, _, _ = get_video_info(video['video_id'])
                if meta:
                    meta = prepare_metadata(meta)
                    published = dateutil.parser.parse(meta['published'])
                    if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove
                    if (now - published).days < 7:
                        timestamp = now
                    else: # otherwise, it's just an update to an older video.
                        timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
        db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream url, player_response metadata,
             error type, error message.
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as 'exhausted')
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, metadata, is_geolocked, None
    else:
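        # for/else: this branch is reached only when the loop ran out of 'el'
        # values without returning a result above.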
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
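    # algo is a space-separated list of operations to apply to the scrambled
    # signature, e.g. "r s3 w5":
    #   r   reverse the signature
    #   sN  drop the first N characters
    #   wN  swap the first character with the one at index N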
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
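    # 'metadata' is the parsed player_response object (as returned by
    # get_video_info); flatten it into a plain dict of the fields we use.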
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
    # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (the first fmt= takes precedence).
    try: # find the native-language captions (assuming there is only 1 audioTrack; any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText'],
         'default':i==main_subtitle,
         'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
        for i,cc in enumerate(metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[]))
        # sort order: the default language gets weight 0 (first), other manually
        # translated ones weight 1, autogenerated ones weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than the other kind.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

def store_video_metadata(video_id):
    # check if we know about it, and if not, fetch and store video metadata
    with sqlite3.connect(cf['global']['database']) as conn:
        c = conn.cursor()
        c.execute("SELECT 1 from videos where id = ?", (video_id,))
        new_video = len(c.fetchall()) < 1
        if new_video:
            _, meta, _, _ = get_video_info(video_id)
            if meta:
                meta = prepare_metadata(meta)
                c.execute("""
                    INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
                    VALUES (?, ?, ?, datetime(?), datetime(?))
                """, (
                    video_id,
                    meta['channel_id'],
                    meta['title'],
                    meta['published'],
                    meta['published'],
                ))
                c.execute("""
                    INSERT OR REPLACE INTO channels (id, name)
                    VALUES (?, ?)
                """, (meta['channel_id'], meta['author']))

from werkzeug.exceptions import NotFound
class NoFallbackException(NotFound): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
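    # counterpart for the request body: 'body' must be the raw POST body as
    # bytes, since hmac.new() does not accept str.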
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

def pp(*args):
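    # debugging helper: pretty-print all arguments to stderr (as utf-8).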
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))