[subscriptionfeed.git] / app / common / common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.setDaemon(True)
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/"
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

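# Illustrative use of the two functions above (the channel id is a hypothetical
# placeholder; fetch_xml() returns None on HTTP errors, so guard for that):
#   xmldata = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)
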
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

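# Minimal sketch of feeding a feed into the database (assumes an sqlite3
# connection whose schema already provides the 'videos' and 'channels' tables;
# the database path and channel id are hypothetical examples):
#   import sqlite3
#   db = sqlite3.connect('/var/lib/yt/subscriptions.sqlite')
#   update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx'))
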
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
                         f"?video_id={video_id}"+
                         f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
                         f"&el={el}"+
                         f"&sts={sts}"+
                         f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

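# Illustrative call (UxxajLWwzqY is the test video id mentioned below; sts and
# algo would normally come from the deciphered player, so the defaults only
# work for videos whose stream URLs are not scrambled):
#   url, metadata, error, errdetail = get_video_info('UxxajLWwzqY')
#   if error:
#       print(error, errdetail)
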
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

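# The algo string is a space-separated list of operations applied to the 's'
# cipher parameter: 'r' reverses the signature, 's<n>' drops its first n
# characters, and 'w<n>' swaps positions 0 and n. Worked example (made-up
# values, not a real cipher):
#   s = "abcdef", algo = "w2 r s1"
#   w2 -> "cbadef", r -> "fedabc", s1 -> "edabc"
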
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']
    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) #only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

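    # e.g. a video referenced by both an infocard and an endcard reduces to the
    # same video_id via getident(), so only the infocard copy stays in allcards;
    # CHANNEL/WEBSITE endcards displace their infocard counterparts the same way.
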
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

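# Sketch of the typical flow from a video id to template-ready metadata
# (assumes get_video_info() returned a parsed player_response for some video_id):
#   url, metadata, error, errdetail = get_video_info(video_id)
#   if metadata is not None:
#       meta = prepare_metadata(metadata)
#       print(meta['title'], meta['aspectr'], len(meta['all_cards']))
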
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

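# Illustrative call (the subreddit names are just examples; the returned
# 'after' token can be fed back in to page through results):
#   videos, before, after = fetch_reddit(['videos', 'youtubehaiku'], sorted_by="top", time="week")
#   more_videos, _, after = fetch_reddit(['videos', 'youtubehaiku'], after=after)
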
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))