# functions that deal with parsing data from youtube's internal API ("innertube")

from urllib.parse import parse_qs, urlparse


def listfind(obj, key):
    """
    given a list of dicts, where one dict contains a given key, return the
    value stored under said key (taken from the first dict that has it).
    returns an empty dict when no element contains the key, so that callers
    can keep chaining .get() on the result.
    """
    return next(
        (elem[key] for elem in obj if isinstance(elem, dict) and key in elem),
        {},
    )


def listget(obj, index, fallback=None):
    """
    return obj[index], or `fallback` if the list is too short.
    `obj` may be None (e.g. the result of a failed dict.get()), in which
    case `fallback` is returned as well.
    """
    if obj is None:
        return fallback
    return next(iter(obj[index:]), fallback)


def prepare_searchresults(yt_results):
    """
    digs the list of result items out of a search response and parses it
    with parse_result_items(). `yt_results` is a list of dicts, one of which
    carries the 'response' key. missing levels yield an empty result list.
    """
    contents = listfind(yt_results, 'response') \
        .get('contents', {}) \
        .get('twoColumnSearchResultsRenderer', {}) \
        .get('primaryContents', {}) \
        .get('sectionListRenderer', {}) \
        .get('contents', [])
    contents = listfind(contents, 'itemSectionRenderer').get('contents', [])

    return parse_result_items(contents)


def prepare_infocards(metadata):
    """
    parses all infocards found in `metadata`. cards that are missing or of
    a type parse_infocard() doesn't implement are left out of the result.
    """
    cards = metadata.get('cards', {}) \
        .get('cardCollectionRenderer', {}) \
        .get('cards', [])
    parsed = (parse_infocard(card) for card in cards if card is not None)
    # parse_infocard() returns None for unimplemented card types
    return [card for card in parsed if card is not None]


def prepare_endcards(metadata):
    """
    parses all endscreen elements found in `metadata`. elements that are
    missing or of a type parse_endcard() doesn't implement are left out of
    the result.
    """
    endsc = metadata.get('endscreen', {}) \
        .get('endscreenRenderer', {}) \
        .get('elements', [])
    parsed = (parse_endcard(card) for card in endsc if card is not None)
    # parse_endcard() returns None for unimplemented card types
    return [card for card in parsed if card is not None]


def mkthumbs(thumbs):
    """
    turns a list of {'height': ..., 'url': ...} dicts into a dict mapping
    the height (as string) to the url. the extra key 'largest' holds the
    height-key of the tallest thumbnail (None if there are no thumbnails).
    """
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = max(output, key=int, default=None)
    return {**output, 'largest': largest}


def clean_url(url):
    """
    returns the target of a redirect url (its 'q' query parameter), or the
    url unchanged if it has no such parameter.
    """
    # externals URLs are redirected through youtube.com/redirect, but we
    # may encounter internal URLs, too
    return parse_qs(urlparse(url).query).get('q', [url])[0]


def toInt(s, fallback=0):
    """
    extracts all digits from a string and returns them as one integer
    (e.g. "123,456 views" => 123456). returns `fallback` if `s` is None or
    contains no usable digits.
    """
    if s is None:
        return fallback
    try:
        return int(''.join(filter(str.isdigit, s)))
    except ValueError:
        return fallback


def delL(s):
    """
    removes the leftmost word (everything up to and including the first
    space) from a string. returns '' if the string contains no space.
    """
    return s.partition(' ')[2]


def age(s):
    """
    shortens a relative date like "1 year ago" to value plus unit suffix
    (e.g. "1y"). returns None if the date is missing or not in the
    expected three-word form.
    """
    if s is None:  # missing from autogen'd music, some livestreams
        return None
    # Some livestreams have "Streamed 7 hours ago"
    words = s.replace("Streamed ", "").split()
    # Now, everything should be in the form "1 year ago"
    if len(words) != 3:
        # unexpected phrasing; treat it like a missing date instead of
        # aborting the parsing of the whole response
        return None
    value, unit, _ = words
    suffix = dict(
        month='mn',
        months='mn',
    ).get(unit, unit[0])  # first letter otherwise (e.g. year(s) => y)

    return f"{value}{suffix}"


def parse_result_items(items):
    """
    parses youtube search response into an easier to use format.

    returns a list of {'type': ..., 'content': {...}} dicts, where type is
    'VIDEO', 'PLAYLIST' or 'CHANNEL'. renderers that are not implemented
    are returned with the renderer name as type and an 'error' message as
    content; movies and suggested searches are skipped entirely.
    """
    # TODO: use .get() for most non-essential attributes
    results = []
    for item in items:
        if not item:
            continue  # nothing to identify the renderer by
        key = next(iter(item))
        content = item[key]
        if key == 'videoRenderer':
            badge = listfind(content.get('badges', []), 'metadataBadgeRenderer')
            is_live = badge.get('style') == 'BADGE_STYLE_TYPE_LIVE_NOW'
            view_count = content.get('viewCountText', {})
            results.append({'type': 'VIDEO', 'content': {
                'video_id': content['videoId'],
                'title': content['title']['runs'][0]['text'],
                'author': content['longBylineText']['runs'][0]['text'] or
                          content['shortBylineText']['runs'][0]['text'],
                'channel_id': content['ownerText']['runs'][0]
                    ['navigationEndpoint']['browseEndpoint']['browseId'],
                'length': 'LIVE' if is_live else
                    content.get('lengthText', {}).get('simpleText'),  # "44:07", "1:41:50"
                'views': toInt(
                    view_count.get('simpleText') or  # "123,456 views"
                    # "1,234 watching"; 'runs' may be absent, too
                    listget(view_count.get('runs'), 0, {}).get('text')),
                'published': age(content.get('publishedTimeText', {}).get('simpleText')),
            }})
        elif key == 'playlistRenderer':
            results.append({'type': 'PLAYLIST', 'content': {
                'playlist_id': content['navigationEndpoint']['watchEndpoint']['playlistId'],
                'video_id': content['navigationEndpoint']['watchEndpoint']['videoId'],
                'title': content['title']['simpleText'],
                'author': content['longBylineText']['runs'][0]['text'] or
                          content['shortBylineText']['runs'][0]['text'],
                'channel_id': content['longBylineText']['runs'][0]
                    ['navigationEndpoint']['browseEndpoint']['browseId'],  # OR .shortBylineText
                'n_videos': toInt(content['videoCount']),
            }})
        elif key == 'radioRenderer':  # "Mix" playlists
            results.append({'type': 'PLAYLIST', 'content': {
                'playlist_id': content['playlistId'],
                'video_id': content['navigationEndpoint']['watchEndpoint']['videoId'],
                'title': content['title']['simpleText'],
                'author': content['longBylineText']['simpleText'] or
                          content['shortBylineText']['simpleText'],  # always "YouTube"
                'channel_id': None,
                # videoCountShortText: "50+"; videoCountText: "50+ videos"
                'n_videos': content['videoCountShortText']['runs'][0]['text'] or
                            content['videoCountText']['runs'][0]['text'],
            }})
        elif key == 'channelRenderer':
            results.append({'type': 'CHANNEL', 'content': {
                'channel_id': content['channelId'],
                'title': content['title']['simpleText'],
                'icons': mkthumbs(content['thumbnail']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'],  # "2.47K subscribers"
            }})
        elif key == 'shelfRenderer':
            # a shelf wraps further result items; flatten them into ours
            results.extend(
                parse_result_items(content['content']['verticalListRenderer']['items'])
            )
        elif key == 'movieRenderer':  # movies to buy/rent
            pass
        elif key == 'horizontalCardListRenderer':
            # suggested searches: .cards[].searchRefinementCardRenderer.query.runs[].text
            pass
        else:
            import pprint
            content = {'error': f"{key} is not implemented;\n{pprint.pformat(item)}\n"}
            results.append({'type': key, 'content': content})
    return results


def parse_infocard(card):
    """
    parses a single infocard into a format that's easier to handle.

    returns a {'type': ..., 'content': {...}} dict with type being one of
    'POLL', 'VIDEO', 'PLAYLIST', 'WEBSITE' or 'CHANNEL', or None if the
    card's type is not implemented.
    """
    card = card['cardRenderer']
    ctype = next(iter(card['content']), None)
    if ctype is None:
        return None  # card without content
    content = card['content'][ctype]
    if ctype == "pollRenderer":
        return {'type': "POLL", 'content': {
            'question': content['question']['simpleText'],
            'answers': [(a['text']['simpleText'], a['numVotes'])
                        for a in content['choices']],
        }}
    elif ctype == "videoInfoCardContentRenderer":
        is_live = content.get('badge', {}).get('liveBadgeRenderer') is not None
        return {'type': "VIDEO", 'content': {
            'video_id': content['action']['watchEndpoint']['videoId'],
            'title': content['videoTitle']['simpleText'],
            'author': delL(content['channelName']['simpleText']),
            'length': "LIVE" if is_live else
                content.get('lengthString', {}).get('simpleText'),  # "23:03"
            # XXX: views sometimes "Starts: July 31, 2020 at 1:30 PM"
            'views': toInt(content.get('viewCountText', {}).get('simpleText')),
        }}
    elif ctype == "playlistInfoCardContentRenderer":
        return {'type': "PLAYLIST", 'content': {
            'playlist_id': content['action']['watchEndpoint']['playlistId'],
            'video_id': content['action']['watchEndpoint']['videoId'],
            'title': content['playlistTitle']['simpleText'],
            'author': delL(content['channelName']['simpleText']),
            'n_videos': toInt(content['playlistVideoCount']['simpleText']),
        }}
    elif ctype == "simpleCardContentRenderer" and \
            'urlEndpoint' in content['command']:
        return {'type': "WEBSITE", 'content': {
            'url': clean_url(content['command']['urlEndpoint']['url']),
            'domain': content['displayDomain']['simpleText'],
            'title': content['title']['simpleText'],
            # XXX: no thumbnails for infocards
        }}
    elif ctype == "collaboratorInfoCardContentRenderer":
        return {'type': "CHANNEL", 'content': {
            'channel_id': content['endpoint']['browseEndpoint']['browseId'],
            'title': content['channelName']['simpleText'],
            'icons': mkthumbs(content['channelAvatar']['thumbnails']),
            'subscribers': content.get('subscriberCountText', {})
                .get('simpleText', ''),  # "545K subscribers"
        }}
    else:
        # TODO!!! report "{ctype} is not implemented" instead of dropping it
        return None


def parse_endcard(card):
    """
    parses a single endcard into a format that's easier to handle.

    returns a {'type': ..., 'content': {...}} dict with type being one of
    'CHANNEL', 'VIDEO', 'PLAYLIST' or 'WEBSITE', or None if the card's
    style is not implemented.
    """
    card = card.get('endscreenElementRenderer', card)  # only sometimes nested
    ctype = card['style']
    if ctype == "CHANNEL":
        return {'type': ctype, 'content': {
            'channel_id': card['endpoint']['browseEndpoint']['browseId'],
            'title': card['title']['simpleText'],
            'icons': mkthumbs(card['image']['thumbnails']),
        }}
    elif ctype == "VIDEO":
        return {'type': ctype, 'content': {
            'video_id': card['endpoint']['watchEndpoint']['videoId'],
            'title': card['title']['simpleText'],
            'length': card['videoDuration']['simpleText'],  # '12:21'
            'views': toInt(card['metadata']['simpleText']),
            # XXX: no channel name
        }}
    elif ctype == "PLAYLIST":
        return {'type': ctype, 'content': {
            'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
            'video_id': card['endpoint']['watchEndpoint']['videoId'],
            'title': card['title']['simpleText'],
            'author': delL(card['metadata']['simpleText']),
            'n_videos': toInt(card['playlistLength']['simpleText']),
        }}
    elif ctype in ("WEBSITE", "CREATOR_MERCHANDISE"):
        url = clean_url(card['endpoint']['urlEndpoint']['url'])
        return {'type': "WEBSITE", 'content': {
            'url': url,
            'domain': urlparse(url).netloc,
            'title': card['title']['simpleText'],
            'icons': mkthumbs(card['image']['thumbnails']),
        }}
    else:
        # TODO!!! report "{ctype} is not implemented" instead of dropping it
        return None