# app/common/common.py
import os
import re
import json
import html
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

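# A minimal config file only needs a [global] section to pass the check above;
# the keys inside it are used elsewhere in the application and are not
# validated here. Illustrative sketch only:
#   [global]
#   # ... application settings ...
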
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, it is wrapped in a try/except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass
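
# Illustrative only: a frontend could surface the recorded requests from a
# flask error handler roughly like this ('app' and the logging target are
# assumptions, not defined in this module):
#
#   @app.errorhandler(Exception)
#   def dump_api_requests(e):
#       for url, params, body in g.get('api_requests', []):
#           app.logger.error("%s %s\n%s", url, params, body)
#       return "internal server error", 500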

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos
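
# Illustrative use of the two helpers above (feed_type maps onto the query
# parameter of the videos.xml endpoint, e.g. 'channel_id' or 'playlist_id'):
#   xmldata = fetch_xml("channel_id", "UC...")  # placeholder channel id
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)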

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published time are near-identical, we assume it's new.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
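
# The INSERTs above assume roughly the following schema; the actual
# CREATE TABLE statements live elsewhere in the project, and the column
# types here are guesses:
#   CREATE TABLE videos   (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                          published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);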

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error
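
# Illustrative call (UxxajLWwzqY, mentioned below, is a video with scrambled
# signatures; sts and algo would normally be extracted from the player JS):
#   stream_url, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error is None:
#       print(stream_url)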

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
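
# The algo string is a space-separated list of operations, applied in order:
# 'r' reverses the signature, 's<N>' drops the first N characters, and 'w<N>'
# swaps the first character with the one at index N. A (hypothetical) spec
# like "w43 s2 r" would therefore swap, slice, then reverse.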

def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))
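    # e.g. a video that shows up both as an infocard and as an endcard is kept
    # only once (the infocard version); a channel appearing in both is kept
    # only as an endcard.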

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}],before,after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after
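
# Illustrative call: page through the 'hot' listing of two subreddits
# (the subreddit names here are placeholders):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'])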

def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))