# app/common/common.py
import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response).
# a flask error_handler can then include the request data in the error log.
# since this module is also used outside the flask appcontext, the patch is
# wrapped in a try-except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass

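# Hedged sketch (not part of the original module): one way a flask frontend
# could consume g.api_requests from an error handler to log upstream api
# responses, as the comment above suggests. 'app' is any flask.Flask instance;
# the handler name, log format and truncation length are illustrative assumptions.
def _example_register_api_logging(app):
    @app.errorhandler(Exception)
    def _log_api_requests(e):
        for url, params, response_text in g.get('api_requests', []):
            app.logger.error("api request %s %r -> %s", url, params, response_text[:500])
        return "internal server error", 500
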
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

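# Hedged usage sketch (not part of the original module): fetch and parse one
# channel feed. The channel id below is only a placeholder; feed_type can also
# be 'playlist_id' or 'user', as accepted by the videos.xml endpoint.
def _example_fetch_and_parse():
    xmldata = fetch_xml('channel_id', 'UC_x5XG1OV2P6uZZ5FSM9Ttw')
    if xmldata is None:
        return
    title, author, videos = parse_xml(xmldata)
    for video in videos:
        print(video['published'], video['video_id'], video['title'])
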
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

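# Hedged sketch (not part of the original module): refresh a single channel.
# db_path and channel_id are placeholders; the database must already contain
# the videos/channels tables referenced by update_channel() above.
def _example_update_channel(db_path, channel_id):
    import sqlite3
    db = sqlite3.connect(db_path)
    try:
        if update_channel(db, fetch_xml('channel_id', channel_id)):
            print(f"updated {channel_id}")
    finally:
        db.close()
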
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else:
        return None, metadata, 'exhausted', player_error

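# Hedged sketch (not part of the original module): resolve a muxed stream url
# and branch on the error classes documented in get_video_info(). The default
# sts/algo only works for videos whose signatures are not scrambled.
def _example_get_video_info(video_id):
    url, metadata, error, errdetails = get_video_info(video_id)
    if error is not None:
        print(f"{video_id}: {error} ({errdetails})")
    else:
        print(f"{video_id}: {url}")
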
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

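# Hedged illustration (not part of the original module) of the algo string
# consumed by unscramble(): space-separated ops, where 'r' reverses the
# signature, 's<n>' drops the first n characters and 'w<n>' swaps position 0
# with position n. The cipher dict below is made up; real ones come from
# parse_qs() on a format's cipher/signatureCipher field.
def _example_unscramble():
    cipher = {
        'url': ['https://example.invalid/videoplayback?id=x'],
        's': ['abcdefg'],
        'sp': ['sig'],
    }
    # 'abcdefg' -> r -> 'gfedcba' -> s2 -> 'edcba' -> w3 -> 'bdcea'
    print(unscramble(cipher, "r s2 w3"))
    # prints: https://example.invalid/videoplayback?id=x&sig=bdcea
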
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

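# Hedged sketch (not part of the original module): chain get_video_info() and
# prepare_metadata() into one template-friendly dict. prepare_metadata() needs
# a full player_response, so player/malformed/exhausted errors are skipped here.
def _example_prepare_metadata(video_id):
    _, metadata, error, _ = get_video_info(video_id)
    if error in ('player', 'malformed', 'exhausted'):
        return None
    return prepare_metadata(metadata)
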
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

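# Hedged sketch (not part of the original module): pull the current top
# youtube submissions from two subreddits and page through the results.
# Subreddit names are placeholders; pagination reuses the returned 'after' token.
def _example_fetch_reddit():
    videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
        sorted_by='top', time='week')
    for v in videos:
        print(v['n_karma'], v['video_id'], v['title'])
    if after: # fetch the next page
        videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
            sorted_by='top', time='week', after=after)
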
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))