import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try-except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession

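# A minimal sketch (not part of the original module) of the error handler the
# comment above alludes to; the app object and status code are assumptions.
# flask.g.get() falls back to [] when no api_requests were recorded.
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("%s %r -> %.200s", url, params, text)
#       return "internal server error", 500
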
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt':   "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at':   "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

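# Illustrative usage sketch (not part of the original module); the channel id
# below is a placeholder:
#
#   xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)
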
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return the global author, hence taking it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            from flask import current_app # XXX: remove
            current_app.logger.info(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a new video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

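# Illustrative sketch (not part of the original module): refreshing a single
# feed into an sqlite database. The database path, the channel id and the
# presence of the videos/channels schema are assumptions.
#
#   import sqlite3
#   db = sqlite3.connect('/var/lib/yt/subscriptions.db')
#   update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx'))
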
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

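# Illustrative usage sketch (not part of the original module); sts/algo would
# come from the scraped player code, see unscramble() below:
#
#   url, metadata, error, errdetails = get_video_info("UxxajLWwzqY")
#   if error is None:
#       pass # url is the best muxed stream, metadata the player_response dict
#   elif error in ('livestream', 'geolocked'):
#       pass # no direct url, but metadata is still usable
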
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

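# Worked example (the descrambling string below is made up; real ones are
# extracted from the player javascript): "w3 r s2" swaps indices 0 and 3,
# reverses the signature, then drops its first two characters.
#
#   cipher = parse_qs("s=ABCDEF&sp=sig&url=https%3A%2F%2Fexample.invalid%2F")
#   unscramble(cipher, "w3 r s2")  # -> "https://example.invalid/&sig=ACBD"
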
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind')=="asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO':    'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL':  'channel_id',
        'WEBSITE':  'url',
        'POLL':     'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

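# Illustrative usage sketch (not part of the original module): feeding the
# player_response from get_video_info() into prepare_metadata():
#
#   _, metadata, error, _ = get_video_info("UxxajLWwzqY")
#   if metadata and error in (None, 'livestream', 'geolocked'):
#       meta = prepare_metadata(metadata)  # dict with the keys listed above
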
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

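# Illustrative usage sketch (not part of the original module); the subreddit
# names are placeholders:
#
#   data = fetch_reddit(['videos', 'mealtimevideos'], sorted_by='top', time='week')
#   videos = parse_reddit_videos(data)
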
class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

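# Illustrative sketch (not part of the original module): two view functions
# registered on the same rule; the first falls through to the second. The
# route, endpoint name and predicate are made up.
#
#   @app.route('/watch')
#   def watch_local():
#       if not have_local_copy():  # hypothetical predicate
#           return fallback_route()
#       ...
#
#   @app.route('/watch', endpoint='watch_proxy')
#   def watch_proxy():
#       ...
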
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

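# Illustrative verification sketch (not part of the original module): checking
# a websub notification body against the X-Hub-Signature header, which carries
# "sha1=<hexdigest>". The secret is a placeholder; request/abort are flask's.
#
#   expected = "sha1=" + websub_body_hmac("s3cr3t", request.data)
#   received = request.headers.get('X-Hub-Signature', '')
#   if not hmac.compare_digest(expected, received):
#       abort(403)
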
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))