app/reddit/lib.py

   1 import re
   2 import html
   3 import requests
   4
   5 class RedditException(Exception): pass
   6 def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
   7         count=None, before=None, after=None):
   8     """
   9     fetches data from a subreddit (or a multireddit like gif+gifs) and
  10     filters/sorts results.
  11     sorted_by values: hot, new, rising, controversial, top
  12     time values: hour, day, week, month, year, all (for top and controversial)
  13     """
  14
  15     if not subreddits:
  16         return None
  17
  18     query = {k:v for k,v in {
  19         'count':count,
  20         'before':before,
  21         'after':after,
  22         'limit':limit, # 1..100 (default 25)
  23         't': time, # hour,week,month,year,all
  24     }.items() if v}
  25     multireddit = '+'.join(subreddits)
  26     r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
  27             query, headers={'User-Agent':'Mozilla/5.0'})
  28     if not r.ok or not 'data' in r.json():
  29         raise RedditException(r.text)
  30
  31     return r.json()
  32
  33 def fetch_reddit_post(post_id):
  34     # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
  35     r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
  36             headers={'User-Agent':'Mozilla/5.0'})
  37     if not r.ok or not 'data' in r.json():
  38         raise RedditException(r.text)
  39
  40     return r.json()
  41
  42 def parse_reddit_videos(data):
  43     videos = []
  44     data = data['data'] or {}
  45     entries = sorted(data.get('children',[]),
  46             key=lambda e: e['data']['score'] > 1,
  47             reverse=True)
  48     for entry in entries:
  49         e = entry['data']
  50         if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us', 'invidious.snopyta.org']:
  51             continue
  52         try:
  53             # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
  54             match = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&amp;)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)(?:[?&#]t=([0-9hms:]+))?', e['url'])
  55             video_id = match.group(1)
  56             timestamp = match.group(2)
  57             maybe_length = re.match(r'.*[\[(](?:00:)?(\d\d?(?::\d\d){1,2})[\])]', e['title'])  # .* to match last occurence in line (probably terrible for performance?)
  58             maybe_length = maybe_length.group(1) if maybe_length else None
  59             if maybe_length:
  60                 # 20:59:00 => 20:59 (we're assuming no video is >10h)
  61                 maybe_length = re.sub(r"([1-9]\d:\d\d):00", r"\1", maybe_length)
  62         except:
  63             continue # XXX: should we log that?
  64         if not video_id: continue
  65         videos.append({
  66             'video_id': video_id,
  67             'timestamp': timestamp,
  68             'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
  69             'url': e['permalink'],
  70             'n_comments': e['num_comments'],
  71             'n_karma': e['score'],
  72             'subreddit': e['subreddit'],
  73             'post_id': e['id'],
  74             'length': maybe_length,
  75         })
  76
  77     return videos