app/common.py

   1 import os
   2 import requests
   3 import dateutil.parser
   4 from datetime import datetime, timezone
   5 from xml.etree import ElementTree
   6 from urllib.parse import parse_qs
   7 from configparser import ConfigParser
   8
   9 cf = ConfigParser()
  10 config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
  11 cf.read(config_filename)
  12
  13 def fetch_xml(feed_type, feed_id):
  14     r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
  15     if not r.ok:
  16         return None
  17
  18     return r.text
  19
  20 def parse_xml(xmldata):
  21     ns = {
  22         'atom':"http://www.w3.org/2005/Atom",
  23         'yt': "http://www.youtube.com/xml/schemas/2015",
  24         'media':"http://search.yahoo.com/mrss/"
  25     }
  26
  27     feed = ElementTree.fromstring(xmldata)
  28     author = feed.find('atom:author',ns).find('atom:name',ns).text if feed.find('atom:author',ns) else None
  29     if feed.find('yt:channelId',ns):
  30         channel_id = feed.find('yt:channelId',ns).text
  31     else: # TODO: clean this up (websub has no yt:channelId, this should be adapted for playlists)
  32         self = feed.find('atom:link[@rel="self"]',ns).get('href')
  33         channel_id = parse_qs(self.split('?')[1]).get('channel_id')[0]
  34     title = feed.find('atom:title',ns).text
  35     videos = []
  36     for entry in feed.findall('atom:entry',ns):
  37         videos.append({
  38             'video_id': entry.find('yt:videoId',ns).text,
  39             'title': entry.find('atom:title',ns).text,
  40             'published': entry.find('atom:published',ns).text,
  41             'channel_id': entry.find('yt:channelId',ns).text,
  42             'author': entry.find('atom:author',ns).find('atom:name',ns).text,
  43             # extra fields for pull_subs/webhook:
  44             'updated': entry.find('atom:updated',ns).text,
  45             #'description': entry.find('media:group',ns).find('media:description',ns).text ##xxx:missing for websub
  46         })
  47
  48     return title, author, channel_id, videos
  49
  50 def update_channel(db, xmldata):
  51     """
  52     returns True on success, False on failure. rigorous error checking is required, otherwise data will be lost!
  53     the caller MUST (as per RFC 2119) write (append) the xmlfeed into a file on error.
  54     """
  55     if not xmldata: return False
  56
  57     # Note: wbesub does not return global author
  58     title, author, channel_id, videos = parse_xml(xmldata) #xxx: perl-code had this eval'd for a die
  59
  60     c = db.cursor()
  61     for video in videos:
  62         now = datetime.now(timezone.utc)
  63         updated = dateutil.parser.parse(video['updated'])
  64         published = dateutil.parser.parse(video['updated'])
  65         # if update and published time are near-identical, it's new. use crawl time if it was published within a week.
  66         # else, it's just an update to an older video (before we subscribed, so use original upload time).
  67         if (updated - published).seconds < 60 and (now - published).days < 7:
  68             timestamp = now
  69         else:
  70             timestamp = published
  71
  72         c.execute("""
  73         INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
  74                        VALUES (?, ?, ?, datetime(?), datetime(?))
  75         """, (video['video_id'], video['channel_id'], video['title'], video['published'], timestamp)) #XXX:errorcheck
  76
  77     # update channel name (we don't fetch it on subscribing)
  78         author = video['author'] # XXX: doing this once per channel is enough (for pull-subs.pl)
  79         c.execute("""
  80             INSERT OR REPLACE INTO channels (id, name)
  81                             VALUES (?, ?)
  82         """, (channel_id, author)) #XXX:errorcheck
  83
  84     return True