import re
import requests
from urllib.parse import urlparse

from ..common.common import video_metadata
from ..common.innertube import prepare_infocards, prepare_endcards, G


def prepare_metadata(metadata):
    meta = metadata['videoDetails']

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to 16:9
    except:
        aspect_ratio = 16/9

    # Note: we could get subtitles in multiple formats directly by querying
    # https://video.google.com/timedtext?hl=en&type=list&v= followed by
    # https://www.youtube.com/api/timedtext?lang=&v=&fmt={srv1|srv2|srv3|ttml|vtt},
    # but that won't give us autogenerated subtitles (and is an extra request).
    # we can still add &fmt= to the extracted URLs below (first one takes precedence).
    try: # find the native language captions (assuming there is only 1 audioTrack) (any level might not exist):
        default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
        main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['captionTrackIndices']
    except:
        main_subtitle = -1
    subtitles = sorted([
        {'url':           cc['baseUrl'],
         'code':          cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name':          cc['name']|G.text,
         'default':       i == main_subtitle,
         'query':         "fmt=vtt&" + urlparse(cc['baseUrl']).query}  # for our internal proxy
        for i, cc in enumerate(metadata
            |G('captions')
            |G('playerCaptionsTracklistRenderer')
            |G('captionTracks')
            or [])
        # sort order: default lang gets weight 0 (first), other manually translated weight 1, autogenerated weight 2:
    ], key=lambda cc: (not cc['default']) + cc['autogenerated'])

    endcards = prepare_endcards(metadata)

    thumbs = meta['thumbnail']['thumbnails']
    poster = sorted(thumbs, key=lambda t: t['width'], reverse=True)[0]['url']

    return {
        **video_metadata(metadata),
        'description': meta['shortDescription'],
        'aspectr':     aspect_ratio,
        'unlisted':    not meta['isCrawlable'],
        'poster':      poster,
        'endcards':    endcards,
        'all_cards':   endcards,
        'subtitles':   subtitles,
    }


def channel_exists(feed_id):
    feed_type = "channel_id" if re.match(r"^UC[A-Za-z0-9_-]{22}$", feed_id) else "user"
    r = requests.head("https://www.youtube.com/feeds/videos.xml", params={
        feed_type: feed_id,
    })
    return r.ok


def microformat_parser(metadata):
    """
    parses additional metadata only available with get_video_info(metaOnly=True)
    """
    meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{})

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ
        BA BB BD BE BF BG BH BI BJ BL BM BN BO BQ BR BS
        BT BV BW BY BZ CA CC CD CF CG CH CI CK CL CM CN
        CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE
        EG EH ER ES ET FI FJ FK FM FO FR GA GB GD GE GF
        GG GH GI GL GM GN GP GQ GR GS GT GU GW GY HK HM
        HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM
        JO JP KE KG KH KI KM KN KP KR KW KY KZ LA LB LC
        LI LK LR LS LT LU LV LY MA MC MD ME MF MG MH MK
        ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA
        NC NE NF NG NI NL NO NP NR NU NZ OM PA PE PF PG
        PH PK PL PM PN PR PS PT PW PY QA RE RO RS RU RW
        SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS
        ST SV SX SY SZ TC TD TF TG TH TJ TK TL TM TN TO
        TR TT TV TW TZ UA UG UM US UY UZ VA VC VE VG VI
        VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    regions = (
        'all' if not blacklisted
        else 'none' if not whitelisted
        else f"not in {' '.join(blacklisted)}" if len(blacklisted) < len(whitelisted)
        else f"only in {' '.join(whitelisted)}"
    )

    try:
        poster = sorted(meta2['thumbnail']['thumbnails'],
                        key=lambda t: t['width'], reverse=True)[0]['url']
    except:
        poster = None

    infocards = prepare_infocards(metadata)
    endcards = prepare_endcards(metadata)
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = {  # ctype -> ident
        'VIDEO':    'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL':  'channel_id',
        'WEBSITE':  'url',
        'POLL':     'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    return {
        'published': meta2.get('publishDate'),
        #'uploaded':  meta2.get('uploadDate'),
        #'infocards': infocards,
        #'endcards':  endcards,
        'all_cards': allcards,
        'poster':    poster,
        'regions':   regions,
    }
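

# usage sketch (illustrative only): prepare_metadata() expects a full innertube
# player response dict (including streamingData), while microformat_parser()
# expects the meta-only response mentioned in its docstring. the variable names
# below are hypothetical stand-ins for whatever the caller already has:
#
#   video = prepare_metadata(player_response)
#   video.update(microformat_parser(meta_only_response))
#   # 'video' now holds description, aspect ratio, poster, subtitles,
#   # availability regions, and the merged info/end cards.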