import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo URLs are valid for 5h59m,
# but caching that long makes the reddit feed very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.setDaemon(True)
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

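# Illustrative sketch of how fetch_xml/parse_xml compose; the channel id below is
# made up, and the exact field values depend on what YouTube's Atom feed returns:
#   xml = fetch_xml("channel_id", "UCxxxxxxxxxxxxxxxxxxxxxx")
#   if xml:
#       title, author, videos = parse_xml(xml)
#       # videos[0] -> {'video_id': ..., 'title': ..., 'published': ...,
#       #               'channel_id': ..., 'author': ..., 'updated': ...}
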
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence taking it from the first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a new video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

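# For reference only: a minimal SQLite schema this function could run against. The
# real schema lives elsewhere in the repo; this is an assumption inferred from the
# INSERT statements above:
#   CREATE TABLE IF NOT EXISTS channels (
#       id      TEXT PRIMARY KEY,
#       name    TEXT
#   );
#   CREATE TABLE IF NOT EXISTS videos (
#       id          TEXT PRIMARY KEY,
#       channel_id  TEXT,
#       title       TEXT,
#       published   DATETIME,
#       crawled     DATETIME
#   );
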
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
            f"?video_id={video_id}"+
            f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
            f"&el={el}"+
            f"&sts={sts}"+
            f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with the next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if not 'formats' in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

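# Illustrative sketch of how a caller might dispatch on the error type. The sts/algo
# values normally come from the scraped player JS; the bare defaults used here would
# only work for videos whose signatures are not scrambled:
#   url, metadata, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error in (None, 'livestream', 'geolocked'):
#       ...  # metadata is usable, e.g. for prepare_metadata(metadata)
#   else:
#       ...  # 'malformed', 'player' or 'exhausted': surface errdetail to the user
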
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

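# Worked example of the algo mini-language (r = reverse, sN = slice off N chars,
# wN = swap first char with position N), using a made-up signature and sequence
# rather than a real player's:
#   cipher = {'s': ['abcdef'], 'url': ['https://example.invalid/video']}
#   unscramble(cipher, "r s2 w1")
#   # 'abcdef' -r-> 'fedcba' -s2-> 'dcba' -w1-> 'cdba'
#   # => 'https://example.invalid/video&signature=cdba'
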
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

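    # Illustrative only, showing the intended use of the three helpers on the kind
    # of strings YouTube returns (the values themselves are made up):
    #   delL("by Some Channel")  -> "Some Channel"
    #   delR("123,456 views")    -> "123,456"
    #   intT("123,456")          -> 123456
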
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

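    # Illustrative only, with made-up cards: if the same video shows up both as an
    # infocard and as an endcard, only the infocard survives in allcards, because
    # exclude() drops endcards whose video_id already appears among the infocards:
    #   infocards = [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    #   endcards  = [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    #   -> allcards == infocards
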
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2['availableCountries'])
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
        headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid URL (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

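# Illustrative sketch of paging through a multireddit; the subreddit names are made
# up, and the before/after tokens are the ones reddit's listing API hands back:
#   videos, before, after = fetch_reddit(['videos', 'youtubehaiku'], limit=10)
#   if after:  # fetch the next page
#       more, _, after = fetch_reddit(['videos', 'youtubehaiku'], limit=10, after=after)
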
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))