app/browse/lib.py

   1 import re
   2 import requests
   3 from datetime import datetime, timezone
   4
   5 from ..common.common import fetch_xml, parse_xml
   6
   7 def fetch_searchresults(q=None, sp=None):
   8   for _ in range(2):
   9     today = datetime.now(timezone.utc).strftime("%Y%m%d")
  10     r = requests.get(f"https://www.youtube.com/results", {
  11         'search_query': q,
  12         'pbj': 1, # makes youtube return a json-response
  13         'hl': 'en', #'en_US',
  14         'sp': sp,
  15     }, headers={
  16         'x-youtube-client-name': '1',
  17         'x-youtube-client-version': f'2.{today}.01.01', # the version is parsed as a date, and if it's invalid (e.g. month>12 or even feb>=30), youtube throws an encrypted stacktrace :D (but any random date >= 20160323 as of 20200802 works (even year 3000)
  18     })
  19     if not r.ok:
  20         return None
  21
  22     # Sometimes, youtube throws an exception after the response already begun.
  23     # This can manifest in two ways:
  24     # 1) So the status code is 200, begins with JSON and switches to HTML half
  25     #    way through. WTF?! (This should be "fixed" by retrying, though)
  26     # 2) The response just stopping mid-way through like this: response.text ==
  27     #    '[\r\n{"page": "search","rootVe": "4724"},\r\n{"page": "search",'
  28     # hence, just try-catching the decoding step is the easiest way out.
  29     try:
  30         return r.json()
  31     except:
  32         continue # will return None once we break out of the loop
  33
  34 def fetch_ajax(params):
  35     """
  36     fetch data using a continuation protobuf
  37     """
  38     # TODO: handle auto_generated!
  39     today = datetime.now(timezone.utc).strftime("%Y%m%d")
  40
  41     # TODO: this is not cached any more! -> https://github.com/reclosedev/requests-cache/issues/154
  42     r = requests.post(f"https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={
  43         'continuation': params,
  44         'context': {'client': {
  45             'gl': 'US',
  46             'hl': 'en',
  47             'clientName': 'WEB',
  48             'clientVersion': f'2.{today}.01.01',
  49         }},
  50     })
  51
  52     if not r.ok:
  53         return None
  54
  55     return r.json()
  56
  57 def canonicalize_channel(name):
  58     if re.fullmatch(r"(UC[A-Za-z0-9_-]{22})", name):
  59         return name
  60
  61     # try /user/ (legacy URLs):
  62     xmlfeed = fetch_xml("user", name)
  63     if xmlfeed:
  64         _, _, _, channel_id, _ = parse_xml(xmlfeed)
  65         return channel_id
  66
  67     # get UCID of /c/ (vanity URLs):
  68     today = datetime.now(timezone.utc).strftime("%Y%m%d")
  69     r = requests.get(f'https://www.youtube.com/c/{name}/about?pbj=1&hl=en_US', headers={
  70         'x-youtube-client-name': '1',
  71         'x-youtube-client-version': f'2.{today}.01.01', # see fetch_searchresults()
  72     })
  73     try:
  74         return r.json()[1]['response']['metadata']['channelMetadataRenderer']['rssUrl'].split("=")[1]
  75     except:
  76         pass
  77
  78     # unable to extract:
  79     return None