import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m,
# but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

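# Example usage (a sketch; feed_type is passed through verbatim as the query
# parameter name, e.g. 'channel_id' or 'playlist_id'):
#   xmldata = fetch_xml('channel_id', 'UC...')  # 'UC...' is a placeholder id
#   title, author, videos = parse_xml(xmldata)
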
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns):
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume it's a
        # new video; otherwise it's just an update to an older video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else:
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0:  # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

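# For reference only: a minimal sketch of the table layout update_channel()
# assumes (the real schema is defined elsewhere in this repository):
#   CREATE TABLE videos   (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                          published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
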
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: (best-quality muxed video stream URL, player_response, error type, error message)
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None  # for 'exhausted'
    for el in ['embedded', 'detailpage']:  # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params:  # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue  # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue  # no urls

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # TODO: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

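# Example usage (a sketch; sts and algo come from the scraped player JS and are
# assumed to be supplied by the caller):
#   url, meta, error, errdetail = get_video_info('UxxajLWwzqY', sts=sts, algo=algo)
#   if error is None:
#       ...  # url is the best-quality muxed stream
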
def unscramble(cipher, algo):  # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

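# Worked example (hypothetical algo string; the real one is extracted from the
# player JS elsewhere): with cipher = {'url': ['https://host/videoplayback?x=1'],
# 's': ['abcdef']} and algo = "r s2 w3":
#   'r'  reverses the signature      -> 'fedcba'
#   's2' drops the first two chars   -> 'dcba'
#   'w3' swaps positions 0 and 3     -> 'acbd'
# yielding 'https://host/videoplayback?x=1&signature=acbd'.
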
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
        # if that's unavailable (e.g. on livestreams), fall back to
        # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
                          .get('playerCaptionsTracklistRenderer',{})
                          .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText']  # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'],  # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card)  # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'],  # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry more
    # information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = {  # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

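    # e.g.: if an infocard and an endcard both point at the same video_id, only
    # the infocard's entry survives in all_cards (and vice versa for channels/websites).
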
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit,  # 1..100 (default 25)
        't': time,       # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www\.|m\.)?(?:youtube\.com/watch\?(?:.*&)?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue  # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

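# Example usage (a sketch; the subreddit name is only illustrative):
#   videos, before, after = fetch_reddit(['videos'], sorted_by="top", time="week")
#   # 'before'/'after' can be passed back in to page through the listing.
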
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))