import re
import html
import requests

# NOTE(review): in the reviewed copy of this file the scheme/host part of both
# request URLs, the three accepted link domains and parts of the video URL
# pattern were blank, which left the module unparsable. Every literal marked
# "reconstructed" below is a best guess -- confirm before deploying.
REDDIT_BASE_URL = "https://www.reddit.com"  # reconstructed
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever

# Link domains (the post's 'domain' field) that are treated as videos.
YOUTUBE_DOMAINS = frozenset((  # reconstructed
    'youtube.com',
    'youtu.be',
    'youtube-nocookie.com',
))

# group(1): video id, group(2): optional start time taken from "t=...".
# Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
_VIDEO_URL_RE = re.compile(
    r'^https?://(?:www\.|m\.)?'
    r'(?:youtube\.com/watch\?(?:.*&)?v='
    r'|youtu\.be/'
    r'|youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/'
    r'|youtube\.com/)'
    r'([-_0-9A-Za-z]+)'
    r'(?:[?&#]t=([0-9hms:]+))?'
)

# A duration such as [12:34], (1:02:03) or [00:12:34] inside a post title.
# The leading greedy .* makes the LAST such occurrence in the title win.
_TITLE_LENGTH_RE = re.compile(r'.*[\[(](?:00:)?(\d\d?(?::\d\d){1,2})[\])]')

# 20:59:00 => 20:59 (we're assuming no video is >10h)
_TRAILING_ZEROES_RE = re.compile(r"([1-9]\d:\d\d):00")


class RedditException(Exception):
    """Raised when Reddit answers with an error status or an unusable body."""


def _get_listing(url, params=None):
    """
    GET `url` and return the decoded JSON object.

    The body is parsed only once. Raises RedditException (carrying the raw
    response text) if the status is not ok, the body is not valid JSON, or the
    decoded value is not an object with a 'data' key.
    """
    response = requests.get(url, params, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
    payload = None
    if response.ok:
        try:
            payload = response.json()
        except ValueError:
            # requests signals an undecodable body with a ValueError subclass
            payload = None
    if not isinstance(payload, dict) or 'data' not in payload:
        raise RedditException(response.text)
    return payload


def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.

    subreddits: iterable of subreddit names, joined with '+' into one
        multireddit; a single string is used as-is.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    limit, count, before, after: sent as query parameters when truthy.

    Returns the decoded JSON object, or None when `subreddits` is empty.
    Raises RedditException on an error status or an unusable body.
    """
    if not subreddits:
        return None
    if isinstance(subreddits, str):
        # joining a bare string would put a '+' between every character
        subreddits = [subreddits]

    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit,  # 1..100 (default 25)
        't': time,  # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    return _get_listing(
        f"{REDDIT_BASE_URL}/r/{multireddit}/{sorted_by}.json", query)


def fetch_reddit_post(post_id):
    """
    Fetch the listing for a single post.

    Returns the decoded JSON object; raises RedditException on an error
    status or an unusable body.
    """
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    # NOTE(review): path reconstructed from the note above; assumes `post_id`
    # is passed without the "t3_" prefix -- confirm against callers.
    return _get_listing(f"{REDDIT_BASE_URL}/by_id/t3_{post_id}.json")


def _parse_video_url(url):
    """Return (video_id, timestamp) for a video URL, or (None, None)."""
    match = _VIDEO_URL_RE.match(url)
    if match is None:
        return None, None
    return match.group(1), match.group(2)


def _parse_title_length(title):
    """Return the last bracketed duration found in `title`, or None."""
    match = _TITLE_LENGTH_RE.match(title)
    if match is None:
        return None
    return _TRAILING_ZEROES_RE.sub(r"\1", match.group(1))


def parse_reddit_videos(data):
    """
    Extract the video posts from a listing as returned by fetch_reddit().

    Posts whose domain is not in YOUTUBE_DOMAINS, whose url/title is missing
    or not a string, or whose url does not match the video URL pattern are
    skipped. Returns a list of dicts with the keys video_id, timestamp,
    title, url, n_comments, n_karma, subreddit, post_id and length.
    """
    videos = []
    listing = data['data'] or {}
    for entry in listing.get('children', []):
        e = entry['data']
        if e['domain'] not in YOUTUBE_DOMAINS:
            continue
        try:
            video_id, timestamp = _parse_video_url(e['url'])
            maybe_length = _parse_title_length(e['title'])
        except (KeyError, TypeError):
            continue  # XXX: should we log that?
        if not video_id:
            continue
        videos.append({
            'video_id': video_id,
            'timestamp': timestamp,
            # Note: we unescape and re-escape in the template
            'title': html.unescape(e['title']),
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
            'post_id': e['id'],
            'length': maybe_length,
        })

    return videos