# functions that deal with parsing data from youtube's internal API ("innertube")

import logging
import pprint
from urllib.parse import parse_qs, urlparse

logger = logging.getLogger(__name__)


def mkthumbs(thumbs):
    """
    turns a list of thumbnail dicts ([{url,height,width}]) into a dict
    mapping str(height) to url. the extra key 'largest' holds the key of
    the tallest thumbnail, or None if `thumbs` is empty.
    """
    output = {str(e['height']): e['url'] for e in thumbs}
    largest = max(output, key=int, default=None)
    return {**output, 'largest': largest}


def clean_url(url):
    """
    returns the value of the 'q' query parameter of `url` if there is
    one, otherwise `url` unchanged.
    """
    # external URLs are redirected through youtube.com/redirect, but we
    # may encounter internal URLs, too
    # NOTE(review): any URL carrying a 'q' parameter is unwrapped, not
    # just /redirect ones -- confirm that is intended.
    return parse_qs(urlparse(url).query).get('q', [url])[0]


# Remove left-/rightmost word from string:
def delL(s):
    """ drops everything up to and including the first space. """
    return s.partition(' ')[2]


def delR(s):
    """ drops everything from the last space onwards. """
    return s.rpartition(' ')[0]


# Thousands separator aware int():
def intT(s):
    """ int() that ignores ',' separators; raises ValueError otherwise. """
    return int(s.replace(',', ''))


def parse_result_items(items):
    """
    parses youtube search response into an easier to use format.

    `items` is an iterable of single-key dicts ({rendererName: data}).
    returns a list of {'type': ..., 'content': {...}} dicts. shelves are
    flattened into the result list, movies and suggested searches are
    skipped, and unknown renderers yield an entry whose type is the
    renderer name and whose content is {'error': ...}.
    """
    results = []
    for item in items:
        # each item has exactly one interesting key: the renderer name
        key, renderer = next(iter(item.items()), (None, None))
        if key == 'videoRenderer':
            # a video may carry several badges; any of them can be the
            # "LIVE NOW" one.
            is_live = any(
                badge.get('metadataBadgeRenderer', {}).get('style')
                == 'BADGE_STYLE_TYPE_LIVE_NOW'
                for badge in renderer.get('badges', [])
            )
            results.append({'type': 'VIDEO', 'content': {
                'video_id': renderer['videoId'],
                'title': renderer['title']['runs'][0]['text'],  # XXX: handle/concat multiple runs?
                'author': renderer['longBylineText']['runs'][0]['text'],  # OR: ownerText (never works), shortBylineText
                # OR: channelThumbnailSupportedRenderers.channelThumbnailWithLinkRenderer.navigationEndpoint.browseId
                'channel_id': renderer['ownerText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'],
                # "44:07", "1:41:50" -- XXX: maybe absent--when?
                'length': 'LIVE' if is_live else renderer.get('lengthText', {}).get('simpleText'),
                # XXX: "123,456 views", absent on livestreams
                'views': renderer.get('viewCountText', {}).get('simpleText'),
                # published: e.g. "1 year ago"; missing on autogenerated
                # music 'videos', livestreams sometimes "Streamed 7 hours
                # ago", sometimes absent.
                'published': renderer.get('publishedTimeText', {}).get('simpleText', "").replace("Streamed ", ""),
            }})
        elif key == 'playlistRenderer':
            watch = renderer['navigationEndpoint']['watchEndpoint']
            byline = renderer['longBylineText']['runs'][0]  # OR: .shortBylineText
            results.append({'type': 'PLAYLIST', 'content': {
                'playlist_id': watch['playlistId'],
                'video_id': watch['videoId'],
                'title': renderer['title']['simpleText'],
                'author': byline['text'],
                'channel_id': byline['navigationEndpoint']['browseEndpoint']['browseId'],
                'n_videos': renderer['videoCount'],
            }})
        elif key == 'radioRenderer':  # "Mix" playlists
            results.append({'type': 'PLAYLIST', 'content': {
                'playlist_id': renderer['playlistId'],  # OR: same as normal playlist
                'video_id': renderer['navigationEndpoint']['watchEndpoint']['videoId'],
                'title': renderer['title']['simpleText'],
                'author': renderer['longBylineText']['simpleText'],  # always "YouTube"; OR: .shortBylineText
                'channel_id': None,  # xxx: nothing available
                # videoCountText.runs[0].text would be "50+ videos"
                'n_videos': renderer['videoCountShortText']['runs'][0]['text'],  # "50+"
            }})
        elif key == 'channelRenderer':
            results.append({'type': 'CHANNEL', 'content': {
                'channel_id': renderer['channelId'],
                'title': renderer['title']['simpleText'],
                'icons': mkthumbs(renderer['thumbnail']['thumbnails']),  # [{url,height,width}]
                'subscribers': renderer['subscriberCountText']['simpleText'],  # XXX: "2.47K subscribers"
            }})
        elif key == 'shelfRenderer':
            # a shelf is a nested result list; flatten it into ours
            results.extend(parse_result_items(
                renderer['content']['verticalListRenderer']['items']
            ))
        elif key == 'movieRenderer':  # movies to buy/rent
            pass
        elif key == 'horizontalCardListRenderer':
            # suggested searches: .cards[].searchRefinementCardRenderer.query.runs[].text
            pass
        else:
            content = {'error': f"{key} is not implemented;\n{pprint.pformat(item)}\n"}
            results.append({'type': key, 'content': content})
    return results


def parse_infocard(card):
    """
    parses a single infocard into a format that's easier to handle.

    returns {'type': ..., 'content': {...}} where type is one of POLL,
    VIDEO, PLAYLIST, WEBSITE or CHANNEL. for unsupported cards, type is
    the raw renderer name and content is {'error': ...}.
    """
    card = card['cardRenderer']
    ctype = list(card['content'])[0]
    raw = card['content'][ctype]
    if ctype == "pollRenderer":
        ctype = "POLL"
        content = {
            'question': raw['question']['simpleText'],
            'answers': [(a['text']['simpleText'], a['numVotes'])
                        for a in raw['choices']],
        }
    elif ctype == "videoInfoCardContentRenderer":
        ctype = "VIDEO"
        # if the card references a live stream, it has no length, but a
        # "LIVE NOW" badge, whose label is used in place of the length.
        live_badge = raw.get('badge', {}).get('liveBadgeRenderer', {})
        length = (live_badge.get('label', {}).get('simpleText')
                  or raw.get('lengthString', {}).get('simpleText'))  # '23:03'
        # viewCountText.simpleText is not always "<n> views"; it might
        # contain e.g. "Starts: July 31, 2020 at 1:30 PM", or be absent.
        view_text = raw.get('viewCountText', {}).get('simpleText')
        try:
            view_count = intT(delR(view_text))
        except (AttributeError, TypeError, ValueError):
            logger.warning("unparseable infocard view count: %r", view_text)
            view_count = 0
        content = {
            'video_id': raw['action']['watchEndpoint']['videoId'],
            'title': raw['videoTitle']['simpleText'],
            'author': delL(raw['channelName']['simpleText']),
            'length': length,
            'views': view_count,
        }
    elif ctype == "playlistInfoCardContentRenderer":
        ctype = "PLAYLIST"
        content = {
            'playlist_id': raw['action']['watchEndpoint']['playlistId'],
            'video_id': raw['action']['watchEndpoint']['videoId'],
            'title': raw['playlistTitle']['simpleText'],
            'author': delL(raw['channelName']['simpleText']),
            'n_videos': intT(raw['playlistVideoCount']['simpleText']),
        }
    elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in raw['command']:
        ctype = "WEBSITE"
        content = {
            'url': clean_url(raw['command']['urlEndpoint']['url']),
            'domain': raw['displayDomain']['simpleText'],
            'title': raw['title']['simpleText'],
            # XXX: no thumbnails for infocards
        }
    elif ctype == "collaboratorInfoCardContentRenderer":
        ctype = "CHANNEL"
        content = {
            'channel_id': raw['endpoint']['browseEndpoint']['browseId'],
            'title': raw['channelName']['simpleText'],
            'icons': mkthumbs(raw['channelAvatar']['thumbnails']),
            'subscribers': raw.get('subscriberCountText', {}).get('simpleText', ''),  # "545K subscribers"
        }
    else:
        content = {'error': f"{ctype} is not implemented;\n{pprint.pformat(card)}\n"}

    return {'type': ctype, 'content': content}


def parse_endcard(card):
    """
    parses a single endcard into a format that's easier to handle.

    returns {'type': ..., 'content': {...}} where type is one of CHANNEL,
    VIDEO, PLAYLIST or WEBSITE (CREATOR_MERCHANDISE is reported as
    WEBSITE). for unsupported styles, type is the raw style and content
    is {'error': ...}.
    """
    card = card.get('endscreenElementRenderer', card)  # only sometimes nested
    ctype = card['style']
    if ctype == "CHANNEL":
        content = {
            'channel_id': card['endpoint']['browseEndpoint']['browseId'],
            'title': card['title']['simpleText'],
            'icons': mkthumbs(card['image']['thumbnails']),
        }
    elif ctype == "VIDEO":
        content = {
            # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
            'video_id': card['endpoint']['watchEndpoint']['videoId'],
            'title': card['title']['simpleText'],
            'length': card['videoDuration']['simpleText'],  # '12:21'
            # NOTE(review): stays a string here (last word stripped),
            # whereas parse_infocard returns an int -- confirm with callers
            # before unifying.
            'views': delR(card['metadata']['simpleText']),
            # XXX: no channel name
        }
    elif ctype == "PLAYLIST":
        content = {
            'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
            'video_id': card['endpoint']['watchEndpoint']['videoId'],
            'title': card['title']['simpleText'],
            'author': delL(card['metadata']['simpleText']),
            'n_videos': intT(delR(card['playlistLength']['simpleText'])),
        }
    elif ctype in ("WEBSITE", "CREATOR_MERCHANDISE"):
        ctype = "WEBSITE"
        url = clean_url(card['endpoint']['urlEndpoint']['url'])
        content = {
            'url': url,
            'domain': urlparse(url).netloc,
            'title': card['title']['simpleText'],
            'icons': mkthumbs(card['image']['thumbnails']),
        }
    else:
        content = {'error': f"{ctype} is not implemented;\n{pprint.pformat(card)}\n"}

    return {'type': ctype, 'content': content}