# git.gir.st - subscriptionfeed.git: app/common/common.py
import os
import re
import json
import html
import pprint
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but that makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # don't block process exit (assumed detail)
    t.start()
purge_cache(10*60)
# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this also gets called from outside the flask appcontext, the access
# to g is wrapped in a try-except block.
from flask import g

from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: # not executing within a flask request context
            pass
        return response
requests.Session = requests.sessions.Session = _NSASession
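# sketch of a matching error handler (assumed; `app` and the handler name are
# not part of this module):
#@app.errorhandler(Exception)
#def internal_error(e):
#    for url, params, response_text in g.get('api_requests', []):
#        app.logger.error(f"{url} {params}: {response_text[:200]}")
#    return "Internal Server Error", 500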
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry', ns):
        author = feed.find('at:deleted-entry/at:by/name', ns).text
        ref = feed.find('at:deleted-entry', ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        # tombstone: the video was deleted; there is no feed title.
        # (exact return shape assumed; callers check `title` for None.)
        return None, author, []
    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented
    # after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (video['video_id'], video['channel_id'], video['title'],
              video['published'], timestamp))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
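# illustrative: an entry whose 'updated' is within 60s of 'published' and whose
# 'published' is less than 7 days old counts as newly published (crawled = now);
# anything else is a metadata edit and keeps its publish date as crawl time.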
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no muxed streams; try the next el value

        formats = metadata['streamingData']['formats']
        for i, v in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
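# caller sketch (assumed usage; sts and algo come from the youtube player JS):
#   url, meta, error, detail = get_video_info("UxxajLWwzqY", sts=sts, algo=algo)
#   if error is None: the caller can stream `url`; otherwise dispatch on error.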
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
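    # illustrative examples for the helpers above (hypothetical inputs):
    #   delL("by Channel Name")  -> "Channel Name"
    #   delR("1,234 views")      -> "1,234"
    #   intT("1,234")            -> 1234
    #   clean_url("https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com")
    #                            -> "https://example.com"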
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
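    # e.g. mkthumbs([{'width': 120, 'height': 90, 'url': '/img/90.jpg'}])
    #      -> {90: '/img/90.jpg'} (illustrative input)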
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE" # normalize merchandise cards, too
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
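    # illustrative: a video referenced by both an infocard and an endcard shares
    # its 'video_id' ident, so the endcard copy is dropped from allcards while
    # the richer infocard copy is kept.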
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
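# assumed usage:
#   data = fetch_reddit(['videos', 'youtubehaiku'], sorted_by="top", time="week")
#   videos = parse_reddit_videos(data)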
def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError: # regex did not match
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })

    return videos
class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
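# usage sketch (assumed; route and helper names are hypothetical):
#   @app.route('/watch')
#   def watch_extractor():
#       if not can_handle(request.args): # hypothetical predicate
#           return fallback_route() # fall through to the next /watch endpoint
#       ...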
def pp(*args): # note: helper name and signature are assumed from the call below
    """ dump args to stderr for debugging """
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))