import os
import re
import json
import html
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)
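
# Alternative (illustrative sketch only, not what this module configures):
# requests_cache also ships a persistent sqlite backend, which would survive
# process restarts and make the purge timer above unnecessary:
#   requests_cache.install_cache('yt_cache', backend='sqlite', expire_after=10*60)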

# For debugging purposes, monkey patch the requests Session so that every
# requests-request is stored on the flask-request's g object (url, params and
# response text). A flask error_handler can then include that request data in
# the error log. Since this code also runs outside the flask appcontext
# (e.g. from utils.py), the access to g is wrapped in a try-except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
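
# Sketch of an error handler that could consume the data recorded above
# (illustrative only; `app` is an assumed Flask instance, not defined here):
#   @app.errorhandler(Exception)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("api request: %s %s -> %s", url, params, text[:500])
#       return "Internal Server Error", 500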

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
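
# For reference, the feed parsed above has roughly this shape (abridged; the
# namespace URIs are the ones declared in `ns`):
#   <feed>
#     <title>feed title</title>
#     <author><name>channel name</name></author>
#     <entry>
#       <yt:videoId>...</yt:videoId> <yt:channelId>...</yt:channelId>
#       <title>...</title> <published>...</published> <updated>...</updated>
#       <author><name>...</name></author>
#     </entry>
#   </feed>
# A deleted video instead arrives as an <at:deleted-entry ref="yt:video:ID">
# tombstone (hence the rpartition(':') above).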

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the
        # video is new; otherwise it's just an update to an older video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else:
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
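
# Minimal schema sketch consistent with the INSERT statements above (sketch
# only; the real schema is not defined in this file):
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);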

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error
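
# Illustrative caller sketch: the 4-tuple returned above is typically unpacked
# and branched on the error type (handler bodies are hypothetical):
#   url, meta, error, errdetail = get_video_info(video_id, sts, algo)
#   if   error == 'malformed':  ...  # bad request/response; show errdetail
#   elif error == 'livestream': ...  # no muxed stream available
#   elif error == 'geolocked':  ...  # metadata usable, stream is not
#   elif error in ('player', 'exhausted'): ...  # report errdetail
#   else: ...                        # error is None: url is playable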

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
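
# Worked example (illustrative algo string, not taken from a real player):
# with algo = "r s2 w3" and s = "abcdef", the loop above yields
#   r  -> "fedcba"   (reverse)
#   s2 -> "dcba"     (drop the first two characters)
#   w3 -> "acbd"     (swap positions 0 and 3)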

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
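    # For example: delL("by SomeChannel") == "SomeChannel",
    # delR("1,234 views") == "1,234", intT("1,234") == 1234.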

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a
            # "LIVE NOW" badge. TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') \
                or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information. if the card type is not in ident, we use the whole
    # card for comparison (otherwise they'd all replace each other).
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
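
    # Worked example: a video referenced by both an infocard and an endcard
    # shares its 'video_id', so the endcard copy is filtered out and only the
    # infocard survives in allcards; a channel present in both lists is kept
    # from the endcards instead.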

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
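
# Illustrative call (subreddit names and parameter values are examples only):
#   data = fetch_reddit(['videos', 'mealtimevideos'], sorted_by="top", time="week", limit=50)
# requests https://old.reddit.com/r/videos+mealtimevideos/top.json with t=week
# and limit=50; the result can be fed to parse_reddit_videos() below.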

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
                     key=lambda e: e['data']['score'] > 1,
                     reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how often we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
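
# Illustrative sketch of two routes sharing one rule and chaining via
# fallback_route() (blueprint and function names are hypothetical, not taken
# from this repository):
#   @frontend.route('/watch')
#   def watch():
#       if not can_render_locally(): return fallback_route()
#       ...
#   @proxy.route('/watch')  # registered later; reached via fallback_route()
#   def watch_proxy():
#       ...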

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))