From 6f38709a82171117d1f940877ba7ef0ceab1e603 Mon Sep 17 00:00:00 2001 From: girst Date: Fri, 7 Aug 2020 21:19:19 +0200 Subject: [PATCH] fetch length for new subscription videos in the future, we'll also detect whether it's a livestream --- app/common/common.py | 90 ++++++++++++++++++++++++-------------------- config/setup.sql | 1 + 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/app/common/common.py b/app/common/common.py index 3b9cc6b..80dda33 100644 --- a/app/common/common.py +++ b/app/common/common.py @@ -112,50 +112,60 @@ def update_channel(db, xmldata, from_webhook=False): #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],)) break - now = datetime.now(timezone.utc) - updated = dateutil.parser.parse(video['updated']) - published = dateutil.parser.parse(video['published']) - # if update and published time are near-identical, we assume it's new. - # checking if it was posted this week is necessary during xmlfeed pulling. - if (updated - published).seconds < 60 and (now - published).days < 7: - timestamp = now - if from_webhook: current_app.logger.warning(f"fresh video {video['video_id']}") # XXX: remove - else:#, it might just an update to an older video, or a previously unlisted one. - # first, assume it's an older video (correct when pulling xmlfeeds) - timestamp = published - # then, check if we don't know about it and if so, look up the real date. - + c.execute("SELECT 1 FROM videos WHERE id=?",(video['video_id'],)) + new_video = len(c.fetchall()) < 1 + if new_video: + if from_webhook:current_app.logger.warning(f"new video {video['video_id']}") + _, _, meta, _, _ = get_video_info(video['video_id']) # The 'published' timestamp sent in websub POSTs are often wrong (e.g.: # video gets uploaded as unlisted on day A and set to public on day B; # the webhook is sent on day B, but 'published' says A. The video - # therefore looks like it's just an update to an older video). 
If - # that's the case, we fetch get_video_info and double-check. - # We only need to do this to not-yet-in-the-database videos. - c.execute("SELECT 1 from videos where id = ?", (video['video_id'],)) - new_video = len(c.fetchall()) < 1 - if from_webhook: current_app.logger.warning(f"video {video['video_id']}") # XXX: remove - if from_webhook and new_video: - if from_webhook: current_app.logger.warning(f" is webhook and new") # XXX: remove - _, _, meta, _, _ = get_video_info(video['video_id']) - if meta: - meta = video_metadata(meta) - published = dateutil.parser.parse(meta['published']) - if from_webhook: current_app.logger.warning(f" uploaded {published}") # XXX: remove - if (now - published).days < 7: - timestamp = now - else:#, it's just an update to an older video. - timestamp = published + # therefore looks like it's just an update to an older video). + # g_v_i gives us the date the video was published to viewers, so we + # prefer that. But since g_v_i only returns the date without time, + # we still use xmlfeed's date if it's the same date. + published = dateutil.parser.parse(video['published']) + length = None + if meta: + meta = video_metadata(meta) + published2 = dateutil.parser.parse(meta['published']) + if from_webhook:current_app.logger.warning(f"published {published} / {published2}") + if published < published2: # g_v_i date is more accurate: + published = published2 + length = meta['length'] + + now = datetime.now(timezone.utc) + + # we pretend that all videos uploaded this week were uploaded just + # now, so the user sees it at the top of the feed, and it doesn't + # get inserted somewhere further down. + if (now - published).days < 7: + timestamp = now + else:#, it's just an update to an older video. 
+ timestamp = published - c.execute(""" - INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled) - VALUES (?, ?, ?, datetime(?), datetime(?)) - """, ( - video['video_id'], - video['channel_id'], - video['title'], - video['published'], - timestamp - )) + c.execute(""" + INSERT OR IGNORE INTO videos + (id, channel_id, title, length, published, crawled) + VALUES (?, ?, ?, ?, datetime(?), datetime(?)) + """, ( + video['video_id'], + video['channel_id'], + video['title'], + length, + video['published'], + timestamp + )) + else: + # update video title (everything else can't change) + c.execute(""" + UPDATE OR IGNORE videos + SET title = ? + WHERE id = ? + """, ( + video['title'], + video['video_id'], + )) # for channels, this is obviously always the same, but playlists can # consist of videos from different channels: diff --git a/config/setup.sql b/config/setup.sql index 41e3f75..923ca9d 100644 --- a/config/setup.sql +++ b/config/setup.sql @@ -18,6 +18,7 @@ CREATE TABLE videos( id STRING PRIMARY KEY, channel_id STRING, title STRING, + length INTEGER, published DATETIME, crawled DATETIME DEFAULT CURRENT_TIMESTAMP); CREATE TABLE playlist_videos( -- 2.39.3