import os
import re
import json
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")

# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)

# for debugging purposes, monkey patch the requests Session to store each
# requests-request in a flask-request's g object (url, params and response).
# we can then use a flask error_handler to include the request data in the
# error log. since we also call this code from outside the flask appcontext,
# the patch is wrapped in a try-except block.
try:
    #raise Exception()
    from flask import g
    import requests
    from requests import Session as OriginalSession
    class _NSASession(OriginalSession):
        def request(self, method, url, params=None, data=None, **kwargs):
            response = super(_NSASession, self).request(
                method, url, params, data, **kwargs
            )
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
            return response
    requests.Session = requests.sessions.Session = _NSASession
except:
    pass

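# A minimal sketch of an error handler that could consume g.api_requests
# (assumption for illustration: a flask app object named `app`; not part of
# this module):
#   @app.errorhandler(Exception)
#   def log_with_api_requests(e):
#       for url, params, text in g.get('api_requests', []):
#           app.logger.error("%s %s -> %.200s", url, params, text)
#       return "internal server error", 500
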
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text

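# Hedged usage note: fetch_xml() takes the feed query parameter name and its
# value, e.g. fetch_xml("channel_id", "UC...") or fetch_xml("playlist_id",
# "PL...") (the ids shown are illustrative placeholders).
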
def parse_xml(xmldata):
    ns = {
        'atom':"http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media':"http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry',ns) is not None:
        author = feed.find('at:deleted-entry/at:by/name',ns).text
        ref = feed.find('at:deleted-entry',ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, None, []
    title = feed.find('atom:title',ns).text
    author = feed.find('atom:author/atom:name',ns).text \
        if feed.find('atom:author',ns) is not None else None
    videos = []
    for entry in feed.findall('atom:entry',ns):
        videos.append({
            'video_id': entry.find('yt:videoId',ns).text,
            'title': entry.find('atom:title',ns).text,
            'published': entry.find('atom:published',ns).text,
            'channel_id': entry.find('yt:channelId',ns).text,
            'author': entry.find('atom:author',ns).find('atom:name',ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated',ns).text,
        })

    return title, author, videos

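# update_channel() below assumes roughly this sqlite schema (sketch for
# orientation only; column types and constraints are guesses, not taken from
# this file):
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
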
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return a global author, hence we take it from the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if the updated and published times are near-identical, we assume the video is new.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True

def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error type, error message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no urls

        formats = metadata['streamingData']['formats']
        for (i,v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None
    else: # every 'el' value was tried without a usable result
        return None, metadata, 'exhausted', player_error

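# Rough usage sketch (hedged: sts/algo would normally be derived from the
# scraped player code; the defaults shown here are just the function's own):
#   url, meta, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error == 'geolocked': ...  # e.g. show a message to the user
#   elif url: ...                 # hand the muxed stream url to the client
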
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if not op: continue
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"

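# Illustration of the algo string format (hypothetical values; the real 's',
# 'sp' and 'url' come from the format's signatureCipher, the algo string from
# the player code):
#   unscramble({'url': ['https://example.invalid/videoplayback?x=1'],
#               's': ['abcdef']}, "w2 r s1")
#   # 'w2' swaps chars 0 and 2 -> "cbadef", 'r' reverses -> "fedabc",
#   # 's1' drops the first char -> "edabc"
#   # => 'https://example.invalid/videoplayback?x=1&signature=edabc'
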
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats',[]) + sd.get('formats',[]))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (which are only either 4:3 or 16:9).
    except:
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url':cc['baseUrl'],
         'code':cc['languageCode'],
         'autogenerated':cc.get('kind')=="asr",
         'name':cc['name']['simpleText']}
        for cc in metadata.get('captions',{})
            .get('playerCaptionsTracklistRenderer',{})
            .get('captionTracks',[])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q',[url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))

    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'],a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge',{}).get('liveBadgeRenderer',{})
            length = is_live.get('label',{}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}

    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            import pprint
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}

    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
        'POLL': 'question',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL','WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO','PLAYLIST']))

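    # e.g. a video that appears both as an infocard and as an endcard is kept
    # only as an infocard: its video_id ends up in the exclusion set built
    # from the infocards, so the duplicate endcard is dropped.
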
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries',[]))
    blacklisted = sorted(set(all_countries) - set(whitelisted))

    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }

class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k:v for k,v in {
        'count':count,
        'before':before,
        'after':after,
        'limit':limit, # 1..100 (default 25)
        't': time, # hour,day,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent':'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'], key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after

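# Rough usage sketch (the subreddit names are illustrative, not from this file):
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'],
#                                        sorted_by='top', time='week')
#   # paginate forwards with: fetch_reddit([...], after=after)
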
def pp(*args):
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))