import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        # note: the tombstone's author/video_id are currently not propagated.
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

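# Usage sketch (illustrative only; the feed id below is a placeholder, not a real channel):
#   xml = fetch_xml("channel_id", "UC0000000000000000000000")
#   if xml is not None:
#       title, author, videos = parse_xml(xml)
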
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video.
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

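# Usage sketch (assumes an sqlite3 connection whose videos/channels tables were
# created elsewhere; the database filename is a made-up example):
#   import sqlite3
#   db = sqlite3.connect("subscriptions.sqlite")
#   update_channel(db, fetch_xml("channel_id", channel_id))
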
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get(f"https://www.youtube.com/get_video_info"+
                         f"?video_id={video_id}"+
                         f"&eurl=https://youtube.googleapis.com/v/{video_id}"+
                         f"&el={el}"+
                         f"&sts={sts}"+
                         f"&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # TODO: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

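# Usage sketch (sts and algo come from the youtube player javascript and are
# obtained elsewhere; without them, scrambled signatures cannot be decoded):
#   url, metadata, error, detail = get_video_info("UxxajLWwzqY", sts=sts, algo=algo)
#   if error is None:
#       print(url)  # best-quality muxed stream
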
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

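# The algo string is a tiny reverse/slice/swap program extracted from the player
# javascript: "r" reverses the signature, "sN" drops its first N characters, and
# "wN" swaps characters 0 and N. Illustrative call with made-up values:
#   cipher = parse_qs("s=ABCDEF&sp=sig&url=https%3A%2F%2Fexample.invalid%2Fvideoplayback")
#   unscramble(cipher, "r w2 s1")
#   # -> "https://example.invalid/videoplayback&sig=EFCBA"
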
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists, prefer
    # infocards; for channels and websites, prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

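# Usage sketch (building on get_video_info above; only uses the player response
# when no error was reported):
#   url, metadata, error, detail = get_video_info(video_id, sts=sts, algo=algo)
#   if error is None:
#       meta = prepare_metadata(metadata)
#       print(meta['title'], meta['aspectr'], len(meta['subtitles']))
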
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (only for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time,     # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
                     headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'],
                     key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not a valid URL (it 404s), but it is seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

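# Usage sketch (the subreddit names are placeholders):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
#                                        sorted_by="top", time="week")
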
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))