app/youtube/lib.py

   1 import re
   2 import requests
   3 from urllib.parse import urlparse
   4
   5 from ..common.common import video_metadata
   6 from ..common.innertube import prepare_infocards, prepare_endcards
   7
   8 def prepare_metadata(metadata):
   9     meta1 = metadata['videoDetails']
  10     meta2 = metadata['microformat']['playerMicroformatRenderer']
  11
  12     # the actual video streams have exact information:
  13     try:
  14         sd = metadata['streamingData']
  15         some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
  16         aspect_ratio = some_stream['width'] / some_stream['height']
  17     # if that's unavailable (e.g. on livestreams), fall back to
  18     # thumbnails (only either 4:3 or 16:9).
  19     except:
  20         some_img = meta2['thumbnail']['thumbnails'][0]
  21         aspect_ratio = some_img['width'] / some_img['height']
  22
  23     # Note: we could get subtitles in multiple formats directly by querying
  24     # https://video.google.com/timedtext?hl=en&type=list&v=<VIDEO_ID> followed by
  25     # https://www.youtube.com/api/timedtext?lang=<LANG_CODE>&v=<VIDEO_ID>&fmt={srv1|srv2|srv3|ttml|vtt},
  26     # but that won't give us autogenerated subtitles (and is an extra request).
  27     # we can still add &fmt= to the extracted URLs below (first one takes precedence).
  28     try: # find the native language captions (assuming there is only 1 audioTrack) (any level might not exist):
  29         default_track = metadata.get('captions',{}).get('playerCaptionsTracklistRenderer',{}).get('defaultAudioTrackIndex', 0)
  30         main_subtitle = metadata['captions']['playerCaptionsTracklistRenderer']['audioTracks'][default_track]['defaultCaptionTrackIndex']
  31     except:
  32         main_subtitle = -1
  33     subtitles = sorted([
  34         {'url':cc['baseUrl'],
  35          'code':cc['languageCode'],
  36          'autogenerated':cc.get('kind')=="asr",
  37          'name':cc['name']['simpleText'],
  38          'default':i==main_subtitle,
  39          'query':"fmt=vtt&"+urlparse(cc['baseUrl']).query} # for our internal proxy
  40         for i,cc in enumerate(metadata.get('captions',{})
  41             .get('playerCaptionsTracklistRenderer',{})
  42             .get('captionTracks',[]))
  43     # sort order: default lang gets weight 0 (first), other manually translated weight 1, autogenerated weight 2:
  44     ], key=lambda cc: (not cc['default']) + cc['autogenerated'])
  45
  46     infocards = prepare_infocards(metadata)
  47     endcards = prepare_endcards(metadata)
  48     # combine cards to weed out duplicates. for videos and playlists prefer
  49     # infocards, for channels and websites prefer endcards, as those have more
  50     # information than the other.
  51     # if the card type is not in ident, we use the whole card for comparison
  52     # (otherwise they'd all replace each other)
  53     ident = { # ctype -> ident
  54         'VIDEO': 'video_id',
  55         'PLAYLIST': 'playlist_id',
  56         'CHANNEL': 'channel_id',
  57         'WEBSITE': 'url',
  58         'POLL': 'question',
  59     }
  60     getident = lambda c: c['content'].get(ident.get(c['type']), c)
  61     mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
  62     exclude = lambda cards, without: [c for c in cards if getident(c) not in without]
  63
  64     allcards = exclude(infocards, mkexclude(endcards,  ['CHANNEL','WEBSITE'])) + \
  65                exclude(endcards,  mkexclude(infocards, ['VIDEO','PLAYLIST']))
  66
  67     all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
  68         BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
  69         CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
  70         ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
  71         GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
  72         KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
  73         ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
  74         NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
  75         RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
  76         SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
  77         VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
  78     whitelisted = sorted(meta2.get('availableCountries',[]))
  79     blacklisted = sorted(set(all_countries) - set(whitelisted))
  80
  81     # the rating goes from 1 to 5, and is the ratio of up- to down votes, plus 1
  82     thumbs_up = 100 * (meta1['averageRating']-1) / 4  # reconstructed ratio
  83     thumbs_dn = 100 - thumbs_up
  84
  85     return {
  86         **video_metadata(metadata),
  87         'description': meta1['shortDescription'],
  88         'rating': meta1['averageRating'],
  89         'thumbs_up': thumbs_up,
  90         'thumbs_dn': thumbs_dn,
  91         'category': meta2['category'],
  92         'aspectr': aspect_ratio,
  93         'unlisted': meta2['isUnlisted'],
  94         'whitelisted': whitelisted,
  95         'blacklisted': blacklisted,
  96         'poster': meta2['thumbnail']['thumbnails'][0]['url'],
  97         'infocards': infocards,
  98         'endcards': endcards,
  99         'all_cards': allcards,
 100         'subtitles': subtitles,
 101     }
 102
 103 def channel_exists(feed_id):
 104     feed_type = "channel_id" if re.match(r"^UC[A-Za-z0-9_-]{22}$", feed_id) else "user"
 105     r = requests.head("https://www.youtube.com/feeds/videos.xml", params={
 106         feed_type: feed_id,
 107     })
 108     return r.ok