app/common.py

   1 import os
   2 import re
   3 import json
   4 import requests
   5 import requests_cache
   6 import dateutil.parser
   7 from xml.etree import ElementTree
   8 from configparser import ConfigParser
   9 from datetime import datetime, timezone
  10 from urllib.parse import parse_qs, urlparse
  11
# Load the configuration at import time. The path defaults to
# /etc/yt/config.ini and can be overridden with the YT_CONFIG environment
# variable. ConfigParser.read() silently skips files it cannot open.
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but caching that long makes reddit very stale and premiere videos won't start.  TODO: expire when video is livestream/premiere/etc
# Installs a process-wide, in-memory cache for `requests`; only responses with
# status 200 are stored.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
  18
  19 # Note: this should only be required for the 'memory' backed cache.
  20 # TODO: only run for long-running processes, i.e. the frontend
  21 from threading import Timer
  22 def purge_cache(sec):
  23     requests_cache.remove_expired_responses()
  24     t = Timer(sec, purge_cache, args=(sec,))
  25     t.setDaemon(True)
  26     t.start()
  27 purge_cache(10*60)
  28
  29 def fetch_xml(feed_type, feed_id):
  30     r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
  31     if not r.ok:
  32         return None
  33
  34     return r.text
  35
  36 def parse_xml(xmldata):
  37     ns = {
  38         'atom':"http://www.w3.org/2005/Atom",
  39         'yt': "http://www.youtube.com/xml/schemas/2015",
  40         'media':"http://search.yahoo.com/mrss/"
  41     }
  42
  43     feed = ElementTree.fromstring(xmldata)
  44     title = feed.find('atom:title',ns).text
  45     author = feed.find('atom:author/atom:name',ns).text \
  46         if feed.find('atom:author',ns) else None
  47     videos = []
  48     for entry in feed.findall('atom:entry',ns):
  49         videos.append({
  50             'video_id': entry.find('yt:videoId',ns).text,
  51             'title': entry.find('atom:title',ns).text,
  52             'published': entry.find('atom:published',ns).text,
  53             'channel_id': entry.find('yt:channelId',ns).text,
  54             'author': entry.find('atom:author',ns).find('atom:name',ns).text,
  55             # extra fields for pull_subs/webhook:
  56             'updated': entry.find('atom:updated',ns).text,
  57         })
  58
  59     return title, author, videos
  60
  61 def update_channel(db, xmldata):
  62     if not xmldata: return False
  63
  64     # Note: websub does not return global author, hence taking from first video
  65     title, _, videos = parse_xml(xmldata)
  66
  67     c = db.cursor()
  68     for i, video in enumerate(videos):
  69         now = datetime.now(timezone.utc)
  70         updated = dateutil.parser.parse(video['updated'])
  71         published = dateutil.parser.parse(video['published'])
  72         # if update and published time are near-identical, we assume it's new.
  73         if (updated - published).seconds < 60 and (now - published).days < 7:
  74             timestamp = now
  75         else:#, it's just an update to an older video.
  76             timestamp = published
  77
  78         c.execute("""
  79             INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
  80                            VALUES (?, ?, ?, datetime(?), datetime(?))
  81         """, (
  82             video['video_id'],
  83             video['channel_id'],
  84             video['title'],
  85             video['published'],
  86             timestamp
  87         ))
  88
  89         if i == 0: # only required once per feed
  90             c.execute("""
  91                 INSERT OR REPLACE INTO channels (id, name)
  92                                 VALUES (?, ?)
  93             """, (video['channel_id'], video['author']))
  94     db.commit()
  95
  96     return True
  97
  98 def get_video_info(video_id, sts=0, algo=""):
  99     """
 100     returns: best-quality muxed video stream, player_response, error-type/mesage
 101     error types: 'initial':  the request to get_video_info was malformed
 102                  'player':   playabilityStatus != OK
 103                  'internal': [livestream, geolocked, exhausted]
 104     """
 105     for el in ['embedded', 'detailpage']:#sometimes, only one or the other works
 106         r = requests.get(f"https://www.youtube.com/get_video_info"+
 107             f"?video_id={video_id}"+
 108             f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
 109             f"&el={el}"+
 110             f"&sts={sts}"+
 111             f"&hl=en_US") #"&hl=en&gl=US"
 112         params = parse_qs(r.text)
 113         if 'errorcode' in params: # status=fail
 114             return None, None, 'initial', f"MALFORMED: {params['reason'][0]}" # TODO: assuming we haven't fucked it up, this error comes up if the video id is garbage. give better error message
 115
 116         metadata = json.loads(params.get('player_response')[0])
 117         playabilityStatus = metadata['playabilityStatus']['status']
 118         if playabilityStatus != "OK":
 119             if playabilityStatus == "UNPLAYABLE":
 120                 continue  # try again with next el value (or fail as exhausted)
 121             reason = metadata['playabilityStatus']['reason']
 122             return None, None, 'player', f"{playabilityStatus}: {reason}"
 123         if 'liveStreamability' in metadata['playabilityStatus']:
 124             # can also check .microformat.liveBroadcastDetails.isLiveNow
 125             return None, metadata, 'internal', "livestream"
 126
 127         if not 'formats' in metadata['streamingData']:
 128             #TODO: hls only video with those params (kAZCrtJJaAo):
 129             #   "videoDetails": {
 130             #    "isLiveDefaultBroadcast": true,
 131             #    "isLowLatencyLiveStream": true,
 132             #    "isLiveContent": true,
 133             #    "isPostLiveDvr": true
 134             continue # no urls
 135
 136         formats = metadata['streamingData']['formats']
 137         for (i,v) in enumerate(formats):
 138             if not ('cipher' in v or 'signatureCipher' in v): continue
 139             cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
 140             formats[i]['url'] = unscramble(cipher, algo)
 141
 142         # todo: check if we have urls or try again
 143         url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']
 144
 145         if 'gcr' in parse_qs(url):
 146             return None, metadata, 'internal', "geolocked"
 147
 148         return url, metadata, None, None
 149     else:
 150         return None, metadata, 'internal', "exhausted"
 151
 152 def unscramble(cipher, algo):  # test video id: UxxajLWwzqY
 153     signature = list(cipher['s'][0])
 154     for c in algo.split():
 155         op, ix = re.match(r"([rsw])(\d+)?", c).groups()
 156         ix = int(ix) % len(signature) if ix else 0
 157         if not op: continue
 158         if op == 'r': signature = list(reversed(signature))
 159         if op == 's': signature = signature[ix:]
 160         if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
 161     sp = cipher.get('sp', ['signature'])[0]
 162     sig = cipher.get('sig', [''.join(signature)])[0]
 163     return f"{cipher['url'][0]}&{sp}={sig}"
 164
 165 def prepare_metadata(metadata):
 166     meta1 = metadata['videoDetails']
 167     meta2 = metadata['microformat']['playerMicroformatRenderer']
 168     cards = metadata['cards']['cardCollectionRenderer']['cards'] \
 169         if 'cards' in metadata else []
 170     endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
 171         if 'endscreen' in metadata else []
 172
 173     # TODO: wrong on non-4:3 and non-16:9 videos! (e.g. l06PlYNShpQ)
 174     #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
 175     aspect_ratio = meta2['thumbnail']['thumbnails'][0]['width'] / meta2['thumbnail']['thumbnails'][0]['height']
 176
 177     subtitles = sorted([
 178         {'url':cc['baseUrl'],
 179          'code':cc['languageCode'],
 180          'autogenerated':cc.get('kind')=="asr",
 181          'name':cc['name']['simpleText']}
 182         for cc in metadata['captions']['playerCaptionsTracklistRenderer']['captionTracks']
 183     ], key=lambda cc: cc['autogenerated']) if 'captions' in metadata and 'captionTracks' in metadata['captions']['playerCaptionsTracklistRenderer'] else [] # TODO<,^: cleanup
 184
 185     def clean_url(url):
 186         # externals URLs are redirected through youtube.com/redirect, but we
 187         # may encounter internal URLs, too
 188         url = parse_qs(urlparse(url).query).get('q',[url])[0]
 189     # Remove left-/rightmost word from string:
 190     delL = lambda s: s.partition(' ')[2]
 191     delR = lambda s: s.rpartition(' ')[0]
 192     # Thousands seperator aware int():
 193     intT = lambda s: int(s.replace(',', ''))
 194
 195     def parse_infocard(card):
 196         card = card['cardRenderer']
 197         ctype = list(card['content'].keys())[0]
 198         content = card['content'][ctype]
 199         if ctype == "pollRenderer":
 200             ctype = "POLL"
 201             content = {
 202                 'question': content['question']['simpleText'],
 203                 'answers': [(a['text']['simpleText'],a['numVotes']) \
 204                     for a in content['choices']],
 205             }
 206         elif ctype == "videoInfoCardContentRenderer":
 207             ctype = "VIDEO"
 208             content = {
 209                 'video_id': content['action']['watchEndpoint']['videoId'],
 210                 'title': content['videoTitle']['simpleText'],
 211                 'author': delL(content['channelName']['simpleText']),
 212                 'length': content['lengthString']['simpleText'],  # '23:03'
 213                 'views': intT(delR(content['viewCountText']['simpleText'])),
 214             }
 215         elif ctype == "playlistInfoCardContentRenderer":
 216             ctype = "PLAYLIST"
 217             content = {
 218                 'playlist_id': content['action']['watchEndpoint']['playlistId'],
 219                 'video_id': content['action']['watchEndpoint']['videoId'],
 220                 'title': content['playlistTitle']['simpleText'],
 221                 'author': delL(content['channelName']['simpleText']),
 222                 'n_videos': intT(content['playlistVideoCount']['simpleText']),
 223             }
 224         elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content.get('command',{}).keys(): # <TODO: cleanup
 225             ctype = "WEBSITE"
 226             content = {
 227                 'url': clean_url(content['command']['urlEndpoint']['url']),
 228                 'domain': content['displayDomain']['simpleText'],
 229                 'title': content['title']['simpleText'],
 230                 # XXX: no thumbnails for infocards
 231             }
 232         else:
 233             import pprint
 234             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 235
 236         return {'type': ctype, 'content': content}
 237
 238     def mkthumbs(thumbs):
 239         return {e['height']: e['url'] for e in thumbs}
 240     def parse_endcard(card):
 241         card = card.get('endscreenElementRenderer', card) #only sometimes nested
 242         ctype = card['style']
 243         if ctype == "CHANNEL":
 244             content = {
 245                 'channel_id': card['endpoint']['browseEndpoint']['browseId'],
 246                 'title': card['title']['simpleText'],
 247                 'icons': mkthumbs(card['image']['thumbnails']),
 248             }
 249         elif ctype == "VIDEO":
 250             content = {
 251                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 252                 'title': card['title']['simpleText'],
 253                 'length': card['videoDuration']['simpleText'],  # '12:21'
 254                 'views': delR(card['metadata']['simpleText']),
 255                 # XXX: no channel name
 256             }
 257         elif ctype == "PLAYLIST":
 258             content = {
 259                 'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
 260                 'video_id': card['endpoint']['watchEndpoint']['videoId'],
 261                 'title': card['title']['simpleText'],
 262                 'author': delL(card['metadata']['simpleText']),
 263                 'n_videos': intT(delR(card['playlistLength']['simpleText'])),
 264             }
 265         elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
 266             ctype = "WEBSITE"
 267             content = {
 268                 'url': clean_url(card['endpoint']['urlEndpoint']['url']),
 269                 'domain': urlparse(url).netloc, # TODO: remove .domain
 270                 'title': card['title']['simpleText'],
 271                 'icons': mkthumbs(card['image']['thumbnails']),
 272             }
 273         else:
 274             import pprint
 275             content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}
 276
 277         return {'type': ctype, 'content': content}
 278
 279     all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
 280         BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
 281         CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
 282         ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
 283         GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
 284         KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
 285         ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
 286         NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
 287         RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
 288         SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
 289         VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
 290     whitelisted = sorted(meta2['availableCountries'])
 291     blacklisted = sorted(set(all_countries) - set(whitelisted))
 292
 293     return {
 294         'title': meta1['title'],
 295         'author': meta1['author'],
 296         'channel_id': meta1['channelId'],
 297         'description': meta1['shortDescription'],
 298         'published': meta2['publishDate'],
 299         'views': meta1['viewCount'],
 300         'length': int(meta1['lengthSeconds']),
 301         'rating': meta1['averageRating'],
 302         'category': meta2['category'],
 303         'aspectr': aspect_ratio,
 304         'unlisted': meta2['isUnlisted'],
 305         'countries': whitelisted,
 306         'blacklisted': blacklisted,
 307         'poster': meta2['thumbnail']['thumbnails'][0]['url'],
 308         'infocards': [parse_infocard(card) for card in cards],
 309         'endcards': [parse_endcard(card) for card in endsc],
 310         'subtitles': subtitles,
 311     }
 312
 313 def pp(*args):
 314     from pprint import pprint
 315     import sys, codecs
 316     pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))