# subscriptionfeed.git: app/common/common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
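    """
    Fetches the YouTube Atom feed for the given feed ('channel_id' or
    'playlist_id' plus the corresponding id). Returns the raw XML text,
    or None on a failed request.
    """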
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
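    """
    Parses a YouTube Atom feed and returns (title, author, videos); videos is
    a list of dicts with video_id, title, published, channel_id, author and
    updated keys.
    """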
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata):
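    """
    Inserts the videos of a parsed feed (and the channel itself) into the
    database. Returns False if xmldata is empty, True otherwise.
    """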
    if not xmldata: return False

    # Note: websub does not return the global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a new video.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
            f"?video_id={video_id}"+
            f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
            f"&el={el}"+
            f"&sts={sts}"+
            f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                metadata['videoDetails'].get('isPostLiveDvr', False):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

def unscramble(cipher, algo): # test video id: UxxajLWwzqY
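    # 'algo' is a space-separated list of descrambling steps, e.g. "r s2 w24"
    # (illustrative string, not a real algorithm): 'r' reverses the signature,
    # 's<n>' drops the first n characters, 'w<n>' swaps characters 0 and n.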
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

def prepare_metadata(metadata):
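    """
    Flattens the player_response metadata into a single dict: basic video
    details, aspect ratio, subtitle tracks, info-/endcards and country
    restrictions.
    """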
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
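        """
        Normalizes one infocard into {'type': ..., 'content': ...}; handled
        types are POLL, VIDEO, PLAYLIST, WEBSITE and CHANNEL, everything else
        becomes an error entry.
        """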
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the others.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2['availableCountries'])
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, params=[], count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    returns a tuple of ([{video}], before, after)
    """
    # TODO support /r/videos/top/?t=week
    # TODO support ?limit=100

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in [('count',count), ('before',before), ('after',after), *params] if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}.json?{query}", headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

def pp(*args):
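    """Debug helper: pretty-prints its arguments to stderr as UTF-8."""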
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))
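
# Minimal usage sketch (not part of the original module; the channel id below
# is a placeholder): fetch a channel's Atom feed and pretty-print the result.
if __name__ == '__main__':
    xml = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
    if xml is not None:
        title, author, videos = parse_xml(xml)
        pp(title, author, videos[:3])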