import os
import re
import json
import html
import base64
import requests
import hmac, hashlib
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory'-backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in the flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this code is also called from outside the flask appcontext, the g
# access is wrapped in a try/except block.
from flask import g
import requests
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
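# Illustrative sketch of an error handler that consumes g.api_requests; the
# handler name and its registration are assumptions, not part of this module:
#
#   @app.errorhandler(500)
#   def log_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("during %s %s: %s", url, params, text[:200])
#       return "internal error", 500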

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
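    # feed_type/feed_id become a query parameter pair; for YouTube's public
    # Atom feeds this is typically "channel_id" or "playlist_id" plus the
    # respective id (the example value here is illustrative):
    #   fetch_xml("channel_id", "UC...")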
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    if feed.find('at:deleted-entry',ns) is not None:
        (_,_,vid) = feed.find('at:deleted-entry',ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata, from_webhook=False):
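    # xmldata is the raw Atom feed as returned by fetch_xml(); a typical call
    # (illustrative) is update_channel(db, fetch_xml("channel_id", channel_id)).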
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    from flask import current_app # XXX: remove
    for i, video in enumerate(videos):
        if video.get('deleted'):
            current_app.logger.info(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            break

        now = datetime.now(timezone.utc)
        timestamp, published = None, None
        # The 'published' timestamp sent in websub POSTs is often wrong (e.g.:
        # a video gets uploaded as unlisted on day A and set to public on day
        # B; the webhook is sent on day B, but 'published' says A. The video
        # therefore looks like it's just an update to an older video). If
        # that's the case, we fetch get_video_info and double-check.
        # We only need to do this for videos that are not yet in the database.
        c.execute("SELECT 1 from videos where id = ?", (video['video_id'],))
        new_video = len(c.fetchall()) < 1
        if from_webhook and new_video:
            _, meta, _, _ = get_video_info(video['video_id'])
            if meta:
                meta = prepare_metadata(meta)
                published = dateutil.parser.parse(meta['published'])
                current_app.logger.info(f"new video {video['video_id']}, uploaded {published}") # XXX: remove
                # if published within the last week, assume it's new
                if (now - published).days < 7:
                    timestamp = now
                else: # otherwise, it's just an update to an older video
                    timestamp = published
        # if we update from an rss-pull, we can rely on the embedded published
        # dates (and don't have to fire off a whole bunch of requests)
        else:
            updated = dateutil.parser.parse(video['updated'])
            published = dateutil.parser.parse(video['published'])
            if (updated - published).total_seconds() < 60 and (now - published).days < 7:
                timestamp = now
            else: # otherwise, it's just an update to an older video
                timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
        db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream url, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
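    # How a caller might branch on the error type (sketch only, not code used
    # in this module; the variable names are placeholders):
    #   url, meta, err, errdetails = get_video_info(video_id)
    #   if err == 'livestream': ...   # no muxed stream url available
    #   elif err == 'geolocked': ...  # metadata present, playback blocked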
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages',[])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
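    # algo is a space-separated list of operations: 'r' reverses the
    # signature, 's<n>' drops the first n characters, 'w<n>' swaps the first
    # character with the n-th. Worked example with made-up values:
    #   unscramble({'s': ['abcdef'], 'url': ['https://example.invalid/v']}, "r w2 s1")
    #   'abcdef' -r-> 'fedcba' -w2-> 'defcba' -s1-> 'efcba'
    #   => 'https://example.invalid/v&signature=efcba'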
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
    BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
    CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
    ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
    GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
    KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
    ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
    NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
    RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
    SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
    VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    published_at = f"{meta2['publishDate']}T00:00:00Z" # yyyy-mm-dd
    # 'premiere' videos (and livestreams?) have an ISO 8601 date available:
    if 'liveBroadcastDetails' in meta2 and 'startTimestamp' in meta2['liveBroadcastDetails']: # TODO: tighten up
        published_at = meta2['liveBroadcastDetails']['startTimestamp']

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': published_at,
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
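    # Example call (the subreddit names are only illustrative):
    #   fetch_reddit(['videos', 'youtubehaiku'], sorted_by='top', time='week')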

    if not subreddits:
        return None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()

def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&amp;)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })

    return videos

class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
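    # Sketch of intended use (the route and its arguments are hypothetical):
    # two view functions registered for the same url rule, where the first one
    # calls `return fallback_route(video_id=video_id)` when it cannot serve
    # the request, handing over to the next matching endpoint.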
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException

def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
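# Sketch of verifying an incoming websub notification against the
# X-Hub-Signature header ("sha1=<hexdigest>" per the pubsubhubbub spec); the
# surrounding request handling is an assumption, not part of this module:
#   sent = request.headers.get('X-Hub-Signature', '').partition('=')[2]
#   ok = hmac.compare_digest(websub_body_hmac(secret, request.data), sent)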

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))