import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # Thread.setDaemon() is deprecated
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this module is also used outside the flask appcontext, the bookkeeping
# is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
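# Sketch of a consumer for g.api_requests (illustrative only; the actual error
# handler lives in the flask frontend, not in this module):
#   @app.errorhandler(Exception)
#   def internal_error(e):
#       app.logger.error("%r; upstream requests: %r", e, g.get('api_requests', []))
#       return "internal server error", 500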

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text
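# Usage sketch (illustrative; the ids below are made up):
#   fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")  # a channel's uploads feed
#   fetch_xml("playlist_id", "PLxxxxxxxxxxxxxxxxxx")     # a playlist feed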

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    deleted_entry = feed.find('at:deleted-entry', ns)
    if deleted_entry is not None: # note: Elements without children are falsy, hence "is not None"
        (_, _, vid) = deleted_entry.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]
        #author = feed.find('at:deleted-entry/at:by/atom:name',ns).text
        #channel_url = feed.find('at:deleted-entry/at:by/atom:uri',ns).text
        #match = re.search(r"(UC[A-Za-z0-9_-]{22})", channel_url)
        #channel_id = match.group(1) if match else None
        #ref = feed.find('at:deleted-entry',ns).get('ref')
        #(_, _, video_id) = ref.rpartition(':')
        #return None, None, [{
        #    'video_id': video_id,
        #    'deleted': True,
        #    'channel_id': channel_id,
        #    'author': author,
        #}]
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author/atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
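# For reference, an abridged feed entry as consumed above (element names taken
# from the lookups in parse_xml; the real feed carries more fields):
#   <entry>
#     <yt:videoId>VIDEO_ID</yt:videoId>
#     <yt:channelId>CHANNEL_ID</yt:channelId>
#     <title>...</title>
#     <author><name>...</name></author>
#     <published>2020-01-01T00:00:00+00:00</published>
#     <updated>2020-01-01T00:00:00+00:00</updated>
#   </entry>
# a websub delete notification carries a tombstone instead:
#   <at:deleted-entry ref="yt:video:VIDEO_ID" when="...">...</at:deleted-entry>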

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            from flask import current_app
            current_app.logger.info(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published time are near-identical, we assume it's new.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
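# Assumed minimal schema for the statements above (inferred from the queries,
# not from a schema file in this section):
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
# `crawled` stores the timestamp heuristic computed in the loop.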

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error
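# Caller sketch (illustrative): callers are expected to branch on the
# error-type element of the returned 4-tuple, e.g.
#   url, metadata, error, errdetail = get_video_info(video_id, sts, algo)
#   if error == 'livestream':
#       ... no muxed url; handle the live/premiere case ...
#   elif error is not None:
#       ... render an error page from `error`/`errdetail` ...
#   else:
#       ... play `url` ...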

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
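# Worked example (invented values): with algo = "r s2 w3" and s = "ABCDEFG",
# 'r' reverses to "GFEDCBA", 's2' drops the first two chars -> "EDCBA", and
# 'w3' swaps positions 0 and 3 -> "BDCEA"; the result is appended to the url
# as "&signature=BDCEA" (or whichever parameter name `sp` dictates).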

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))
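    # e.g. (illustrative): delL("by SomeChannel") == "SomeChannel",
    # delR("1,234 views") == "1,234", intT("1,234") == 1234, and
    # clean_url(".../redirect?q=https%3A%2F%2Fexample.com") == "https://example.com"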

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes'])
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a
            # "LIVE NOW" badge. TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') \
                or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time,      # hour, day, week, month, year, all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
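# Usage sketch (illustrative subreddit names):
#   fetch_reddit(['videos', 'youtubehaiku'], sorted_by='top', time='week')
# returns the decoded listing JSON (roughly {'kind': 'Listing', 'data': {...}}),
# which parse_reddit_videos() below picks apart.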

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
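# Usage sketch (hypothetical view functions): register two views for the same
# rule; the first can hand the request over to the next by calling
# fallback_route():
#   @app.route('/watch')
#   def watch_local():
#       if not have_local_copy():   # hypothetical condition
#           return fallback_route() # falls through to watch_upstream()
#       ...
#   @app.route('/watch')
#   def watch_upstream():
#       ...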

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
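# Verification sketch (illustrative; websub notifications carry the body hmac
# in an "X-Hub-Signature: sha1=<hexdigest>" header):
#   their_sig = request.headers['X-Hub-Signature'].partition('=')[2]
#   our_sig = websub_body_hmac(secret, request.get_data())
#   valid = hmac.compare_digest(their_sig, our_sig)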

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))