]> git.gir.st - subscriptionfeed.git/blob - app/browse/lib.py
fix canonicalisation of legacy usernames
[subscriptionfeed.git] / app / browse / lib.py
1 import re
2 import requests
3 from datetime import datetime, timezone
4
5 from ..common.common import fetch_xml, parse_xml
6
def fetch_searchresults(q=None, sp=None):
    """
    Fetch YouTube search results as decoded JSON.

    q:  the search query string.
    sp: optional search-parameters token (filter/sort protobuf).
    Returns the decoded JSON response, or None if the HTTP request failed
    or the response body could not be decoded after two attempts.
    """
    for _ in range(2):
        today = datetime.now(timezone.utc).strftime("%Y%m%d")
        r = requests.get("https://www.youtube.com/results", {
            'search_query': q,
            'pbj': 1,  # makes youtube return a json-response
            'hl': 'en',  # 'en_US',
            'sp': sp,
        }, headers={
            'x-youtube-client-name': '1',
            # the version is parsed as a date, and if it's invalid (e.g.
            # month>12 or even feb>=30), youtube throws an encrypted
            # stacktrace :D (but any random date >= 20160323 as of 20200802
            # works (even year 3000))
            'x-youtube-client-version': f'2.{today}.01.01',
        })
        if not r.ok:
            return None

        # Sometimes, youtube throws an exception after the response already
        # begun. This can manifest in two ways:
        # 1) The status code is 200, begins with JSON and switches to HTML
        #    half way through. WTF?! (This should be "fixed" by retrying.)
        # 2) The response just stopping mid-way through like: response.text ==
        #    '[\r\n{"page": "search","rootVe": "4724"},\r\n{"page": "search",'
        # hence, just try-catching the decoding step is the easiest way out.
        try:
            return r.json()
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            continue  # retry; falls through to the explicit None below
    return None  # both attempts produced an undecodable body
33
def fetch_ajax(params):
    """
    Fetch a page of data using a continuation protobuf token.

    params: the 'continuation' token from a previous response.
    Returns the decoded JSON response, or None on an HTTP error.
    """
    # TODO: handle auto_generated!
    version_date = datetime.now(timezone.utc).strftime("%Y%m%d")

    response = requests.get(
        "https://www.youtube.com/browse_ajax",
        {
            'continuation': params,
            'gl': 'US',
            'hl': 'en',
        },
        headers={
            'x-youtube-client-name': '1',
            'x-youtube-client-version': f'2.{version_date}.01.01',  # see fetch_searchresults()
        },
    )

    return response.json() if response.ok else None
54
def canonicalize_channel(name):
    """
    Resolve a channel name to its canonical channel id (UCID).

    name: either a UCID already ("UC" + 22 url-safe-base64 chars), a legacy
          /user/ name, or a /c/ vanity name.
    Returns the channel's UCID, or None if it cannot be determined.
    """
    # already canonical?
    if re.fullmatch(r"(UC[A-Za-z0-9_-]{22})", name):
        return name

    # try /user/ (legacy URLs):
    xmlfeed = fetch_xml("user", name)
    if xmlfeed:
        _, _, _, channel_id, _ = parse_xml(xmlfeed)
        return channel_id

    # get UCID of /c/ (vanity URLs):
    today = datetime.now(timezone.utc).strftime("%Y%m%d")
    r = requests.get(f'https://www.youtube.com/c/{name}/about?pbj=1&hl=en_US', headers={
        'x-youtube-client-name': '1',
        'x-youtube-client-version': f'2.{today}.01.01',  # see fetch_searchresults()
    })
    try:
        # rssUrl looks like ".../feeds/videos.xml?channel_id=UC...", so the
        # UCID is everything after the '='.
        return r.json()[1]['response']['metadata']['channelMetadataRenderer']['rssUrl'].split("=")[1]
    except (ValueError, LookupError, TypeError, AttributeError):
        # best-effort: undecodable body or unexpected JSON shape — the
        # previously bare `except:` also swallowed KeyboardInterrupt etc.
        pass

    # unable to extract:
    return None
Imprint / Impressum