app/common/innertube.py

   1 # functions that deal with parsing data from youtube's internal API ("innertube")
   2
   3 from urllib.parse import parse_qs, urlparse
   4
   5 def findall(obj, key):
   6     """
   7     given a list of dicts, where one dict contains a given key, return said key.
   8     """
   9     if obj is None: return []
  10     return [ obj[key] for obj in obj if key in obj.keys() ]
  11 def listget(obj, index, fallback=None):
  12     return next(iter(obj[index:]), fallback)
  13 flatten = lambda l: [item for sublist in l for item in sublist] # https://stackoverflow.com/a/952952
  14 first = lambda l: next(iter(l),{})
  15 listfind = lambda obj,key: first(findall(obj,key))
  16
  17 def prepare_searchresults(yt_results):
  18     contents = listfind(yt_results, 'response') \
  19         .get('contents',{})\
  20         .get('twoColumnSearchResultsRenderer',{})\
  21         .get('primaryContents',{})\
  22         .get('sectionListRenderer',{})\
  23         .get('contents',[])
  24     contents = flatten([c.get('contents',[]) for c in findall(contents, 'itemSectionRenderer')])
  25
  26     return parse_result_items(contents)
  27
  28 def prepare_infocards(metadata):
  29     cards = metadata.get('cards',{}).get('cardCollectionRenderer',{}).get('cards',[])
  30     return list(filter(None, map(parse_infocard, cards)))
  31
  32 def prepare_endcards(metadata):
  33     endsc = metadata.get('endscreen',{}).get('endscreenRenderer',{}).get('elements',[])
  34     return list(filter(None, map(parse_endcard, endsc)))
  35
  36 def prepare_channel(result, channel_id):
  37     response = listfind(result,'response')
  38
  39     if 'alerts' in response: # possibly got an error back
  40         from flask import current_app
  41         current_app.logger.error([(alert['alertRenderer']['type'],alert['alertRenderer']['text']['simpleText']) for alert in response['alerts']])
  42         return None,None,[],[],False
  43
  44     meta1 = response.get('metadata',{}).get('channelMetadataRenderer',{})
  45     meta2 = response.get('microformat',{}).get('microformatDataRenderer',{})
  46     title = meta1.get('title', meta2.get('title'))
  47     descr = meta1.get('description', meta2.get('description')) # meta2.description is capped at 160chars
  48     thumb = mkthumbs(meta2.get('thumbnail',meta1.get('avatar',{})).get('thumbnails',{})) # .avatar ~ 900px
  49
  50     contents = response.get('continuationContents')
  51     if not contents: # overran end of list
  52         return title, descr, thumb, [], False
  53
  54     unparsed = contents.get('gridContinuation',{}).get('items') or \
  55             contents.get('sectionListContinuation',{}).get('contents') or []
  56     items, extra = parse_channel_items(unparsed, channel_id, title)
  57     has_more = 'continuations' in (contents.get('gridContinuation') or
  58             contents.get('sectionListContinuation') or {})
  59
  60     return title, descr, thumb, items, has_more
  61
  62 def prepare_playlist(result):
  63     contents = listfind(result,'response')['continuationContents']['playlistVideoListContinuation'] \
  64         .get('contents',[]) # no .contents if overran end of playlist
  65     return list(filter(None, map(parse_playlist, contents)))
  66
  67 def mkthumbs(thumbs):
  68     output = {str(e['height']): e['url'] for e in thumbs}
  69     largest=next(iter(sorted(output.keys(),reverse=True,key=int)),None)
  70     return {**output, 'largest': largest}
  71
  72 def clean_url(url):
  73     # externals URLs are redirected through youtube.com/redirect, but we
  74     # may encounter internal URLs, too
  75     return parse_qs(urlparse(url).query).get('q',[url])[0]
  76
  77 def toInt(s, fallback=0):
  78     if s is None:
  79         return fallback
  80     try:
  81         return int(''.join(filter(str.isdigit, s)))
  82     except ValueError:
  83         return fallback
  84
  85 # Remove left-/rightmost word from string:
  86 delL = lambda s: s.partition(' ')[2]
  87
  88 def age(s):
  89     if s is None: # missing from autogen'd music, some livestreams
  90         return None
  91     # Some livestreams have "Streamed 7 hours ago"
  92     s = s.replace("Streamed ","")
  93     # Now, everything should be in the form "1 year ago"
  94     value, unit, _ = s.split(" ")
  95     suffix = dict(
  96         month='mn',
  97         months='mn',
  98     ).get(unit, unit[0]) # first letter otherwise (e.g. year(s) => y)
  99
 100     return f"{value}{suffix}"
 101
 102 def log_unknown_card(data):
 103     import json
 104     try:
 105         from flask import request
 106         source = request.url
 107     except: source = "unknown"
 108     with open("/tmp/innertube.err", "a") as f:
 109         f.write(f"\n/***** {source} *****/\n")
 110         json.dump(data, f, indent=2)
 111
 112 def parse_result_items(items):
 113     # TODO: use .get() for most non-essential attributes
 114     """
 115     parses youtube search response into an easier to use format.
 116     """
 117     results = []
 118     extras = []
 119     for item in items:
 120         key = next(iter(item.keys()), None)
 121         content = item[key]
 122         if key == 'videoRenderer':
 123             is_live = listfind(content.get('badges',[]), 'metadataBadgeRenderer').get('style') == 'BADGE_STYLE_TYPE_LIVE_NOW'
 124             results.append({'type': 'VIDEO', 'content': {
 125                 'video_id': content['videoId'],
 126                 'title': content['title']['runs'][0]['text'],
 127                 'author': content['longBylineText']['runs'][0]['text'] or \
 128                           content['shortBylineText']['runs'][0]['text'],
 129                 'channel_id': content['ownerText']['runs'][0] \
 130                     ['navigationEndpoint']['browseEndpoint']['browseId'],
 131                 'length': content.get('lengthText',{}).get('simpleText') \
 132                     if not is_live else 'LIVE', # "44:07", "1:41:50"
 133                 'views': toInt(content.get('viewCountText',{}).get('simpleText') or # "123,456 views"
 134                     listget(content.get('viewCountText',{}).get('runs'),0,{}).get('text')), # "1,234 watching"
 135                 'published': age(content.get('publishedTimeText',{}).get('simpleText')),
 136             }})
 137         elif key == 'playlistRenderer':
 138             results.append({'type': 'PLAYLIST', 'content': {
 139                 'playlist_id': content['navigationEndpoint']['watchEndpoint']['playlistId'],
 140                 'video_id': content['navigationEndpoint']['watchEndpoint']['videoId'],
 141                 'title': content['title']['simpleText'],
 142                 'author': content['longBylineText']['runs'][0]['text'] or
 143                      content['shortBylineText']['runs'][0]['text'],
 144                 'channel_id': content['longBylineText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'], # OR .shortBylineText
 145                 'n_videos': toInt(content['videoCount']),
 146             }})
 147         elif key == 'radioRenderer': # "Mix" playlists
 148             results.append({'type': 'PLAYLIST', 'content': {
 149                 'playlist_id': content['playlistId'],
 150                 'video_id': content['navigationEndpoint']['watchEndpoint']['videoId'],
 151                 'title': content['title']['simpleText'],
 152                 'author': content['longBylineText']['simpleText'] or \
 153                     content['shortBylineText']['simpleText'] , # always "YouTube"
 154                 'channel_id': None,
 155                 'n_videos': content['videoCountShortText']['runs'][0]['text'] or \
 156                     content['videoCountText']['runs'][0]['text'],
 157                     # videoCountShortText: "50+"; videoCountText: "50+ videos"
 158             }})
 159         elif key == 'channelRenderer':
 160             results.append({'type': 'CHANNEL', 'content': {
 161                 'channel_id': content['channelId'],
 162                 'title': content['title']['simpleText'],
 163                 'icons': mkthumbs(content['thumbnail']['thumbnails']),
 164                 'subscribers': content.get('subscriberCountText',{}).get('simpleText'), # "2.47K subscribers"
 165             }})
 166         elif key == 'shelfRenderer':
 167             r, e = parse_result_items(content['content']['verticalListRenderer']['items'])
 168             results.extend(r)
 169             extras.extend(e)
 170         elif key == 'movieRenderer': # movies to buy/rent
 171             pass
 172         elif key == 'carouselAdRenderer' or key == 'searchPyvRenderer': # haha, no.
 173             pass
 174         elif key == 'horizontalCardListRenderer':
 175             # suggested searches: .cards[].searchRefinementCardRenderer.query.runs[].text
 176             pass
 177         elif key == 'emergencyOneboxRenderer': # suicide prevention hotline
 178             pass
 179         elif key == 'didYouMeanRenderer' or key == 'showingResultsForRenderer':
 180             extras.append({
 181                 'type': 'spelling',
 182                 'query': content['correctedQueryEndpoint']['searchEndpoint']['query'], # non-misspelled query
 183                 'autocorrected': key == 'showingResultsForRenderer',
 184             })
 185         else:
 186             log_unknown_card(item)
 187     return results, extras
 188
 189 def parse_infocard(card):
 190     """
 191     parses a single infocard into a format that's easier to handle.
 192     """
 193     card = card['cardRenderer']
 194     ctype = list(card['content'].keys())[0]
 195     content = card['content'][ctype]
 196     if ctype == "pollRenderer":
 197         return {'type': "POLL", 'content': {
 198             'question': content['question']['simpleText'],
 199             'answers': [(a['text']['simpleText'],a['numVotes']) \
 200                 for a in content['choices']],
 201         }}
 202     elif ctype == "videoInfoCardContentRenderer":
 203         is_live = content.get('badge',{}).get('liveBadgeRenderer') is not None
 204         return {'type': "VIDEO", 'content': {
 205             'video_id': content['action']['watchEndpoint']['videoId'],
 206             'title': content['videoTitle']['simpleText'],
 207             'author': delL(content['channelName']['simpleText']),
 208             'length': content.get('lengthString',{}).get('simpleText') \
 209                 if not is_live else "LIVE", # "23:03"
 210             'views': toInt(content.get('viewCountText',{}).get('simpleText')),
 211                 # XXX: views sometimes "Starts: July 31, 2020 at 1:30 PM"
 212         }}
 213     elif ctype == "playlistInfoCardContentRenderer":
 214         return {'type': "PLAYLIST", 'content': {
 215             'playlist_id': content['action']['watchEndpoint']['playlistId'],
 216             'video_id': content['action']['watchEndpoint']['videoId'],
 217             'title': content['playlistTitle']['simpleText'],
 218             'author': delL(content['channelName']['simpleText']),
 219             'n_videos': toInt(content['playlistVideoCount']['simpleText']),
 220         }}
 221     elif ctype == "simpleCardContentRenderer" and \
 222             'urlEndpoint' in content['command']:
 223         return {'type': "WEBSITE", 'content': {
 224             'url': clean_url(content['command']['urlEndpoint']['url']),
 225             'domain': content['displayDomain']['simpleText'],
 226             'title': content['title']['simpleText'],
 227             # XXX: no thumbnails for infocards
 228         }}
 229     elif ctype == "collaboratorInfoCardContentRenderer":
 230         return {'type': "CHANNEL", 'content': {
 231             'channel_id': content['endpoint']['browseEndpoint']['browseId'],
 232             'title': content['channelName']['simpleText'],
 233             'icons': mkthumbs(content['channelAvatar']['thumbnails']),
 234             'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers"
 235         }}
 236     else:
 237         log_unknown_card(card)
 238         return None
 239
 240 def parse_endcard(card):
 241     """
 242     parses a single endcard into a format that's easier to handle.
 243     """
 244     card = card.get('endscreenElementRenderer', card) #only sometimes nested
 245     ctype = card['style']
 246     if ctype == "CHANNEL":
 247         return {'type': ctype, 'content': {
 248             'channel_id': card['endpoint']['browseEndpoint']['browseId'],
 249             'title': card['title']['simpleText'],
 250             'icons': mkthumbs(card['image']['thumbnails']),
 251         }}
 252     elif ctype == "VIDEO":
 253         return {'type': ctype, 'content': {
 254             'video_id': card['endpoint']['watchEndpoint']['videoId'],
 255             'title': card['title']['simpleText'],
 256             'length': card['videoDuration']['simpleText'],  # '12:21'
 257             'views': toInt(card['metadata']['simpleText']),
 258             # XXX: no channel name
 259         }}
 260     elif ctype == "PLAYLIST":
 261         return {'type': ctype, 'content': {
 262             'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
 263             'video_id': card['endpoint']['watchEndpoint']['videoId'],
 264             'title': card['title']['simpleText'],
 265             'author': delL(card['metadata']['simpleText']),
 266             'n_videos': toInt(card['playlistLength']['simpleText']),
 267         }}
 268     elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
 269         url = clean_url(card['endpoint']['urlEndpoint']['url'])
 270         return {'type': "WEBSITE", 'content': {
 271             'url': url,
 272             'domain': urlparse(url).netloc,
 273             'title': card['title']['simpleText'],
 274             'icons': mkthumbs(card['image']['thumbnails']),
 275         }}
 276     else:
 277         log_unknown_card(card)
 278         return None
 279
 280 def parse_channel_items(items, channel_id, author):
 281     result = []
 282     extra = []
 283     for item in items:
 284         key = next(iter(item.keys()), None)
 285         content = item[key]
 286         if key == "gridVideoRenderer" or key == "videoRenderer":
 287             result.append({'type': 'VIDEO', 'content': {
 288                 'video_id': content['videoId'],
 289                 'title': content['title']['simpleText'],
 290                 'author': author,
 291                 'channel_id': channel_id,
 292                 'length': listfind(content.get('thumbnailOverlays',[]),'thumbnailOverlayTimeStatusRenderer').get('text',{}).get('simpleText'),
 293                 'views': toInt(content.get('viewCountText',{}).get('simpleText')),
 294                 'published': age(content.get('publishedTimeText',{}).get('simpleText')),
 295             }})
 296         elif key == "gridPlaylistRenderer" or key == "playlistRenderer":
 297             result.append({'type': 'PLAYLIST', 'content': {
 298                 'playlist_id': content['navigationEndpoint']['watchEndpoint']['playlistId'],
 299                 'video_id': content['navigationEndpoint']['watchEndpoint']['videoId'],
 300                 'title': (content['title'].get('simpleText') or # playlistRenderer
 301                     content['title']['runs'][0]['text']), # gridPlaylistRenderer
 302                 'author': author,
 303                 'channel_id': channel_id,
 304                 'n_videos': toInt(content.get('videoCount') or # playlistRenderer
 305                     content.get('videoCountShortText',{}).get('simpleText') or # grid(1)
 306                     content.get('videoCountText',{}).get('runs',[{}])[0].get('text')), # grid(2)
 307             }})
 308         elif key == "itemSectionRenderer":
 309                 r, e = parse_channel_items(content['contents'], channel_id, author)
 310                 result.extend(r)
 311                 extra.extend(e)
 312         elif key == "messageRenderer":
 313             # e.g. {'messageRenderer': {'text': {'runs': [{'text': 'This channel has no playlists.'}]}}}
 314             pass
 315         else:
 316             log_unknown_card(item)
 317
 318     return result, extra
 319
 320 def parse_playlist(item):
 321     key = next(iter(item.keys()), None)
 322     content = item[key]
 323     if key == "playlistVideoRenderer":
 324         if not content.get('isPlayable', False):
 325             return None # private or deleted video
 326
 327         return {'type': 'VIDEO', 'content': {
 328             'video_id': content['videoId'],
 329             'title': (content['title'].get('simpleText') or # playable videos
 330                 content['title'].get('runs',[{}])[0].get('text')), # "[Private video]"
 331             'playlist_id': content['navigationEndpoint']['watchEndpoint']['playlistId'],
 332             'index': content['navigationEndpoint']['watchEndpoint']['index'], #or int(content['index']['simpleText'])
 333             # rest is missing from unplayable videos:
 334             'author': content.get('shortBylineText',{}).get('runs',[{}])[0].get('text'),
 335             'channel_id':content.get('shortBylineText',{}).get('runs',[{}])[0].get('navigationEndpoint',{}).get('browseEndpoint',{}).get('browseId'),
 336             'length': (content.get("lengthText",{}).get("simpleText") or # "8:51"
 337                 int(content.get("lengthSeconds", 0))), # "531"
 338             'starttime': content['navigationEndpoint']['watchEndpoint'].get('startTimeSeconds'),
 339         }}
 340     else:
 341         raise Exception(item) # XXX TODO