import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: responses are currently cached for 10 minutes. googlevideo urls are
# valid for 5h59m, but caching that long makes reddit listings very stale and
# breaks not-yet-started premieres.
# TODO: expire depending on video type (livestream/premiere/etc.)
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
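    """
    periodically evicts expired responses from the requests_cache backend;
    re-arms itself every 'sec' seconds on a daemon Timer thread.
    """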
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session so that every
# requests-request made during a flask-request is stored on flask's g object
# (url, params and response text). a flask error_handler can then include the
# request data in the error log.
# since this code also runs outside of a flask appcontext (e.g. from utils.py),
# the access to g is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
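
# A minimal sketch of the error_handler mentioned above (hypothetical 'app'
# object; the real handler, if any, is registered elsewhere in the project):
#
#   @app.errorhandler(Exception)
#   def log_upstream_requests(e):
#       from flask import g
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error(f"upstream request: {url} {params}\n{body[:500]}")
#       return "internal server error", 500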

def fetch_xml(feed_type, feed_id):
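    """
    fetches the youtube atom feed for a channel or playlist.
    feed_type: name of the feed query parameter (e.g. 'channel_id' or 'playlist_id')
    feed_id: its value
    returns the raw response body as bytes, or None on http errors.
    """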
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.content

def parse_xml(xmldata):
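    """
    parses a youtube atom feed (or a websub delete notification).
    returns (title, author, videos); for delete notifications, title and
    author are None and videos contains a single {'deleted': True, ...} entry.
    """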
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Note: compare explicitly against None; Elements without children are falsy.
    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
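    """
    parses the feed xmldata and stores its videos and the channel name in the
    database. returns False if xmldata is empty, True otherwise.
    """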
    if not xmldata: return False

    # Note: websub does not return the global author, hence taking it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            current_app.logger.warning(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        timestamp, published = None, None
        # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
        # a video gets uploaded as unlisted on day A and set to public on day
        # B; the webhook is sent on day B, but 'published' says A. The video
        # therefore looks like it's just an update to an older video). If
        # that's the case, we call get_video_info and double-check.
        # We only need to do this for videos that are not yet in the database.
        c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        current_app.logger.warning(f"new video {video['video_id']}") # XXX: remove
        if from_webhook and new_video:
            current_app.logger.warning(f" is webhook and new") # XXX: remove
            _, meta, _, _ = get_video_info(video['video_id'])
            if meta:
                meta = prepare_metadata(meta)
                published = dateutil.parser.parse(meta['published'])
                current_app.logger.warning(f" uploaded {published}") # XXX: remove
                # if published within the last week, assume it's new
                if (now - published).days < 7:
                    timestamp = now
                else: # otherwise, it's just an update to an older video.
                    timestamp = published
        # if we update from an rss-pull, we can rely on the embedded published
        # dates (and don't have to fire off a whole bunch of requests)
        else:
            updated = dateutil.parser.parse(video['updated'])
            published = dateutil.parser.parse(video['published'])
            if (updated - published).seconds < 60 and (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video.
                timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: (url of the best-quality muxed video stream, player_response
    metadata, error type, error message)
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
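    """
    descrambles the 's' signature of a cipher dict (parse_qs output) and
    appends it to the stream url. algo is a space separated list of operations
    (N is taken modulo the signature length):
      r   reverse the signature
      sN  drop the first N characters
      wN  swap the characters at positions 0 and N
    e.g. algo "r s2 w3" transforms "abcdef" -> "fedcba" -> "dcba" -> "acbd".
    """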
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
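    """
    flattens a get_video_info player_response into the dict used by the
    templates: basic video details, aspect ratio, subtitles, geo restrictions
    and parsed info-/endcards.
    """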
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except Exception:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time,     # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def fetch_reddit_post(post_id):
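    """
    fetches a single reddit post (without comments) by its base36 id, e.g. 'h7mjes'.
    raises RedditException if the response is not a valid listing.
    """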
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
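    """
    extracts all youtube videos from a reddit listing (as returned by
    fetch_reddit); entries with more than 1 karma are sorted to the front.
    """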
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid url (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&amp;)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except Exception:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # count how many endpoints we have already fallen through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
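
# Usage sketch (hypothetical view functions, assuming two views registered on
# the same url rule; the first one delegates to the next match):
#
#   @app.route('/watch')
#   def watch_preferred():
#       if not can_handle(request.args.get('v')):
#           return fallback_route()  # falls through to watch_generic()
#       ...
#   @app.route('/watch')
#   def watch_generic():
#       ...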

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
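    """ generate sha1 hmac over the raw (bytes) request body """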
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()

def pp(*args):
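    """ pretty-print all arguments to stderr (utf-8-safe); for debugging """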
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))