import os
import requests
import dateutil.parser
from datetime import datetime, timezone
from xml.etree import ElementTree
from urllib.parse import parse_qs
from configparser import ConfigParser

# Module-wide config; the path can be overridden via the YT_CONFIG env var.
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)


def fetch_xml(feed_type, feed_id):
    """Fetch a YouTube Atom feed; returns the XML text, or None on HTTP error."""
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None
    return r.text


def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
    }
    feed = ElementTree.fromstring(xmldata)
    author = feed.find('atom:author', ns)
    author = author.find('atom:name', ns).text if author is not None else None
    if feed.find('yt:channelId', ns) is not None:
        channel_id = feed.find('yt:channelId', ns).text
    else:
        # TODO: clean this up (websub has no yt:channelId; this should be adapted for playlists)
        self_link = feed.find('atom:link[@rel="self"]', ns).get('href')
        channel_id = parse_qs(self_link.split('?')[1]).get('channel_id')[0]
    title = feed.find('atom:title', ns).text
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
            #'description': entry.find('media:group',ns).find('media:description',ns).text ##xxx: missing for websub
        })

    return title, author, channel_id, videos


def update_channel(db, xmldata):
    """
    Returns True on success, False on failure. Rigorous error checking is
    required, otherwise data will be lost! On failure, the caller MUST
    (as per RFC 2119) write (append) the xmlfeed into a file.
    """
    if not xmldata:
        return False

    # Note: websub does not return a global author.
    title, author, channel_id, videos = parse_xml(xmldata)  # xxx: perl-code had this eval'd for a die

    c = db.cursor()
    for video in videos:
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # If the updated and published times are near-identical, the video is
        # new; use the crawl time if it was published within the last week.
        # Otherwise it is just an update to an older video (uploaded before we
        # subscribed), so use the original upload time.
        if (updated - published).total_seconds() < 60 and (now - published).days < 7:
            timestamp = now
        else:
            timestamp = published
        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (video['video_id'], video['channel_id'], video['title'],
              video['published'], timestamp))  # XXX: errorcheck

        # update channel name (we don't fetch it on subscribing)
        author = video['author']  # XXX: doing this once per channel is enough (for pull-subs.pl)
        c.execute("""
            INSERT OR REPLACE INTO channels (id, name)
            VALUES (?, ?)
        """, (channel_id, author))  # XXX: errorcheck

    return True
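
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module). It assumes
# `db` is an sqlite3 connection whose schema already provides the `videos`
# and `channels` tables used above (the SQL is SQLite dialect); the channel
# id and file paths below are placeholders for illustration only. Per the
# update_channel docstring, the caller MUST append the raw feed to a file
# when the update fails, so the data can be replayed later.
if __name__ == '__main__':
    import sqlite3

    db = sqlite3.connect('subscriptions.db')  # placeholder database path
    xml = fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx')  # placeholder id

    if xml and update_channel(db, xml):
        db.commit()  # update_channel itself does not commit
    elif xml is not None:
        # update failed: append the raw feed so the data is not lost
        with open('failed-feeds.xml', 'a') as f:
            f.write(xml)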