From e320952430abad72362837ef493291cca89aeea7 Mon Sep 17 00:00:00 2001 From: girst Date: Mon, 24 Apr 2023 21:59:30 +0000 Subject: [PATCH 01/16] [DATABASE CHANGE: Migration below] store is-shorts flag in subscriptions this will allow us to filter shorts from the subscription feed. ALTER TABLE videos ADD COLUMN shorts BOOLEAN DEFAULT NULL; --- app/common/common.py | 12 ++++++++---- config/setup.sql | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/app/common/common.py b/app/common/common.py index fc5c0b2..32b6b86 100644 --- a/app/common/common.py +++ b/app/common/common.py @@ -129,6 +129,7 @@ def update_channel(db, xmldata, from_webhook=False): length = None livestream = None premiere = None + shorts = None if meta: meta = video_metadata(meta) published2 = dateutil.parser.parse(meta['published']) @@ -137,6 +138,7 @@ def update_channel(db, xmldata, from_webhook=False): length = meta['length'] livestream = meta['livestream'] premiere = meta['premiere'] + shorts = meta['shorts'] now = datetime.now(timezone.utc) @@ -150,8 +152,8 @@ def update_channel(db, xmldata, from_webhook=False): c.execute(""" INSERT OR IGNORE INTO videos - (id, channel_id, title, length, livestream, premiere, published, crawled) - VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?)) + (id, channel_id, title, length, livestream, premiere, shorts, published, crawled) + VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?)) """, ( video['video_id'], video['channel_id'], @@ -159,6 +161,7 @@ def update_channel(db, xmldata, from_webhook=False): length, livestream, premiere, + shorts, published, timestamp )) @@ -374,8 +377,8 @@ def store_video_metadata(video_id): if meta: meta = video_metadata(meta) c.execute(""" - INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, published, crawled) - VALUES (?, ?, ?, ?, ?, ?, datetime(?), datetime(?)) + INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled) + VALUES 
(?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?)) """, ( video_id, meta['channel_id'], @@ -383,6 +386,7 @@ def store_video_metadata(video_id): meta['length'], meta['livestream'], meta['premiere'], + meta['shorts'], meta['published'], meta['published'], )) diff --git a/config/setup.sql b/config/setup.sql index 4514e83..1f76da4 100644 --- a/config/setup.sql +++ b/config/setup.sql @@ -21,6 +21,7 @@ CREATE TABLE IF NOT EXISTS videos( length INTEGER, livestream BOOLEAN DEFAULT 0, premiere BOOLEAN DEFAULT 0, + shorts BOOLEAN DEFAULT NULL, published DATETIME, crawled DATETIME DEFAULT CURRENT_TIMESTAMP); CREATE TABLE IF NOT EXISTS playlist_videos( -- 2.39.3 From 87b0fc1556707f76cd30bee6207ef1187becf40e Mon Sep 17 00:00:00 2001 From: girst Date: Tue, 25 Apr 2023 16:08:34 +0000 Subject: [PATCH 02/16] improve shorts detection if only one of length>60 or aspect>1 is available we can rule out a shorts video. previously, we marked this state as undeterminable (NULL). --- app/common/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/common/common.py b/app/common/common.py index 32b6b86..8f8ac42 100644 --- a/app/common/common.py +++ b/app/common/common.py @@ -346,9 +346,9 @@ def video_metadata(metadata): # shorts are <= 60 seconds and vertical or square. if we were unable to # determine it, we set it to None. is_short = ( - None if length is None or aspect_ratio is None else - True if length <= 60 and aspect_ratio <= 1 else - False + None if length is None and aspect_ratio is None else + True if ((length or 61) <= 60) and ((aspect_ratio or 2) <= 1) else + False # length > 60 or aspect_ratio > 1 ) # Note: 'premiere' videos have livestream=False and published= will be the -- 2.39.3 From a80f6344c34a916878d6a3c337ef7bb3177d36b1 Mon Sep 17 00:00:00 2001 From: girst Date: Tue, 25 Apr 2023 18:06:48 +0000 Subject: [PATCH 03/16] fix dismissing multiple flash()es previously, dismissing one also dismissed all below. 
--- app/static/style.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/static/style.css b/app/static/style.css index 18a8f50..ef73d0d 100644 --- a/app/static/style.css +++ b/app/static/style.css @@ -43,7 +43,7 @@ article { box-sizing: border-box; } -.flashes>.flash-radio:checked ~ li { +.flashes>.flash-radio:checked + li { display: none; } .flashes>li .flash-close { -- 2.39.3 From 272386ad5bf8a58d36d4639bc3da76fe5df67133 Mon Sep 17 00:00:00 2001 From: girst Date: Wed, 26 Apr 2023 17:07:23 +0000 Subject: [PATCH 04/16] video metadata: prefer videoDetails over microformat for length this short was registered as 61 seconds long, so our is_short detection didn't catch it: b2cy9BvaaY4 --- app/common/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/app/common/common.py b/app/common/common.py index 8f8ac42..5a3639f 100644 --- a/app/common/common.py +++ b/app/common/common.py @@ -317,8 +317,9 @@ def video_metadata(metadata): meta2 = metadata.get('microformat',{}).get('playerMicroformatRenderer',{}) # sometimes, we receive the notification so early that the length is not - # yet populated. Nothing we can do about it. - length = int(meta2.get('lengthSeconds',0)) or int(meta1.get('lengthSeconds',0)) or None + # yet populated. Nothing we can do about it. meta1 and meta2 use a + # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer. 
+ length = int(meta1.get('lengthSeconds',0)) or int(meta2.get('lengthSeconds',0)) or None scheduled_time = metadata.get('playabilityStatus',{}) \ .get('liveStreamability',{}).get('liveStreamabilityRenderer',{}) \ -- 2.39.3 From 7820b9fcc883a70b9b85a74e0bcb7b4dd14535ae Mon Sep 17 00:00:00 2001 From: girst Date: Wed, 26 Apr 2023 17:19:02 +0000 Subject: [PATCH 05/16] [DATABASE CHANGE: Migration below] allow setting user settings from profile page CREATE TABLE IF NOT EXISTS user_settings( user_id INTEGER, setting TEXT NOT NULL, value TEXT NOT NULL, PRIMARY KEY(user_id, setting), FOREIGN KEY(user_id) REFERENCES users(id)); --- app/common/user.py | 22 +++++++++++++++++++++- app/templates/account_mgmt.html.j2 | 10 ++++++++++ config/setup.sql | 6 ++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/app/common/user.py b/app/common/user.py index 7b62e7c..d935bf0 100644 --- a/app/common/user.py +++ b/app/common/user.py @@ -2,6 +2,7 @@ from werkzeug.security import generate_password_hash, check_password_hash from .common import cf import sqlite3 import secrets +import json from flask_login import LoginManager, UserMixin, login_user, logout_user, login_required, current_user from flask import Blueprint, flash, redirect, render_template, url_for, request @@ -107,6 +108,16 @@ def init_login(app): def account_manager(): with sqlite3.connect(cf['global']['database']) as conn: c = conn.cursor() + c.execute(""" + SELECT setting, value + FROM user_settings + WHERE user_id = ? 
+ """, (current_user.id,)) + result = c.fetchall() + settings = { + setting: json.loads(value) + for setting, value in result + } c.execute(""" SELECT token FROM user_tokens @@ -117,7 +128,7 @@ def init_login(app): (login_token,) = result else: login_token = "" - return render_template('account_mgmt.html.j2', login_token=login_token, random_pwd=secrets.token_hex(16)) + return render_template('account_mgmt.html.j2', settings=settings, login_token=login_token, random_pwd=secrets.token_hex(16)) @usermgmt.route('/manage/account', methods=['POST']) @login_required @@ -139,6 +150,15 @@ def init_login(app): VALUES (?, ?) """, (current_user.id, new_token)) flash('new token generated.', 'info') + elif action == 'chset': + with sqlite3.connect(cf['global']['database']) as conn: + noshorts = request.form.get('noshorts') == 'yes' + c = conn.cursor() + c.execute(""" + INSERT OR REPLACE INTO user_settings (user_id, setting, value) + VALUES (?, ?, ?) + """, (current_user.id, "noshorts", json.dumps(noshorts))) + flash('settings saved.', 'info') elif action == 'addusr': if not current_user.admin: return "only admins may do that!", 403 diff --git a/app/templates/account_mgmt.html.j2 b/app/templates/account_mgmt.html.j2 index dce7455..db8928e 100644 --- a/app/templates/account_mgmt.html.j2 +++ b/app/templates/account_mgmt.html.j2 @@ -21,6 +21,16 @@ + +

Site settings

+
+
Subscription Feed +
+ + +
+
+ {% if current_user.admin %}

Administration

diff --git a/config/setup.sql b/config/setup.sql index 1f76da4..c94469a 100644 --- a/config/setup.sql +++ b/config/setup.sql @@ -76,3 +76,9 @@ CREATE TABLE IF NOT EXISTS users( CREATE TABLE IF NOT EXISTS user_tokens( -- stores revocable url tokens for feeds. user_id INTEGER PRIMARY KEY NOT NULL, token TEXT NOT NULL); +CREATE TABLE IF NOT EXISTS user_settings( -- stores per-user settings as a vertical table. + user_id INTEGER, + setting TEXT NOT NULL, + value TEXT NOT NULL, + PRIMARY KEY(user_id, setting), + FOREIGN KEY(user_id) REFERENCES users(id)); -- 2.39.3 From 3730d4e8241cc393a8c4b1d7fb8dd47d3a443def Mon Sep 17 00:00:00 2001 From: girst Date: Wed, 26 Apr 2023 17:12:21 +0000 Subject: [PATCH 06/16] subscription feed: filter shorts if the user enabled the 'noshorts' setting videos are displayed iff either noshorts config is false or not a shorts video, but pinning overrides hiding. if shorts are shown, they are marked 'shorts' instead of the length. --- app/youtube/__init__.py | 21 ++++++++++++++++++--- app/youtube/templates/index.html.j2 | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/app/youtube/__init__.py b/app/youtube/__init__.py index d0f187c..6acff52 100644 --- a/app/youtube/__init__.py +++ b/app/youtube/__init__.py @@ -33,8 +33,21 @@ def feed(): page = request.args.get('page', 0, type=int) with sqlite3.connect(cf['global']['database']) as conn: c = conn.cursor() + + settings = {} # fallback for guest user + if current_user.is_authenticated: + c.execute(""" + SELECT setting, value + FROM user_settings + WHERE user_id = ? 
+ """, (current_user.id,)) + settings = { + setting: json.loads(value) + for setting, value in c.fetchall() + } + c.execute(""" - SELECT videos.id, channel_id, name, title, length, livestream, premiere, published, playlist_videos.playlist_id, display + SELECT videos.id, channel_id, name, title, length, livestream, premiere, shorts, published, playlist_videos.playlist_id, display FROM videos JOIN channels ON videos.channel_id = channels.id LEFT JOIN playlist_videos ON (videos.id = playlist_videos.video_id) @@ -43,9 +56,10 @@ def feed(): OR playlist_videos.playlist_id IN (SELECT channel_id FROM subscriptions WHERE user=? AND type = 'playlist') OR flags.display = 'pinned') AND flags.display IS NOT 'hidden' + AND (flags.display = 'pinned' OR not ? or not shorts) ORDER BY (display = 'pinned') DESC, crawled DESC LIMIT 36 - OFFSET 36*?""", (token, token, token, page)) + OFFSET 36*?""", (token, token, token, settings.get('noshorts', False), page)) rows = [{ 'video_id': video_id, 'channel_id': channel_id, @@ -56,10 +70,11 @@ def feed(): 'premiere': premiere and (# only if it hasn't yet premiered: datetime.strptime(published+'+0000', "%Y-%m-%d %H:%M:%S%z")>datetime.now(tz=timezone.utc) ), + 'shorts': shorts, 'published': published, 'playlist': playlist, 'pinned': display == 'pinned', - } for (video_id, channel_id, author, title, length, livestream, premiere, published, playlist, display) in c.fetchall()] + } for (video_id, channel_id, author, title, length, livestream, premiere, shorts, published, playlist, display) in c.fetchall()] return render_template('index.html.j2', rows=rows, page=page) @frontend.route('/watch') diff --git a/app/youtube/templates/index.html.j2 b/app/youtube/templates/index.html.j2 index fbcb3a6..9347427 100644 --- a/app/youtube/templates/index.html.j2 +++ b/app/youtube/templates/index.html.j2 @@ -7,7 +7,7 @@ {{ super() }}
{% for row in rows %} - {% set badge = 'LIVE' if row.livestream else 'SOON' if row.premiere else row.length|format_time %} + {% set badge = 'shorts' if row.shorts else 'LIVE' if row.livestream else 'SOON' if row.premiere else row.length|format_time %} {% call macros.card(row.video_id, row.title, row.published|format_date, row.pinned, badge=badge) %} {{ macros.infobar_subscriptions(row.video_id, row.channel_id, row.author) }} {% endcall %} -- 2.39.3 From 6d74ea6802998f0db0075b1778f830d49be019e8 Mon Sep 17 00:00:00 2001 From: girst Date: Sat, 29 Apr 2023 12:44:11 +0000 Subject: [PATCH 07/16] use resolve_url endpoint for channel canonicalisation one less separate place we call into youtube's frontend apis we now support: - /channel/ucid - /c/vanity - /user/username - /@handle - /brandname according to https://support.google.com/youtube/answer/6180214?hl=en vanity urls and usernames are legacy, brand urls aren't documented at all. --- app/browse/__init__.py | 15 ++++++++------- app/browse/lib.py | 26 ++++++++------------------ 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/app/browse/__init__.py b/app/browse/__init__.py index 6a6b1aa..4673dd6 100644 --- a/app/browse/__init__.py +++ b/app/browse/__init__.py @@ -118,6 +118,7 @@ def channel(channel_id, subpage="videos"): is_subscribed=is_subscribed, continuation=continuation) +@frontend.route('//') @frontend.route('/user//') @frontend.route('/user//') @frontend.route('/c//') @@ -127,15 +128,17 @@ def channel_redirect(user, subpage=None): The browse_ajax 'API' needs the UCID. 
""" - typ = request.path.split("/")[1] # 'c' or 'user' - # inverse of the test in /channel/: if re.match(r"(UC[A-Za-z0-9_-]{22})", user): return redirect(url_for('.channel', channel_id=user)) - channel_id = canonicalize_channel(user, typ) + if subpage not in (None, "home", "videos", "shorts", "streams", "playlists", "community", "channels", "about"): + raise NotFound("not a valid channel subpage") + + channel_id = canonicalize_channel(request.path) if not channel_id: - raise NotFound("channel appears to not exist") + raise NotFound("channel does not exist") + return redirect( url_for('.channel', channel_id=channel_id, subpage=subpage), 308 ) @@ -177,9 +180,7 @@ def plain_user_or_video(something): # prevent a lot of false-positives (and reduce youtube api calls) raise NotFound - # possible channel names: need to distinguish /name from /@name - typ = "c" if something[0] != "@" else "" - channel_id = canonicalize_channel(something, typ) + channel_id = canonicalize_channel(something) # /vanity or /@handle if channel_id: return redirect(url_for('.channel', channel_id=channel_id)) elif re.match(r"^[-_0-9A-Za-z]{11}$", something): # looks like a video id diff --git a/app/browse/lib.py b/app/browse/lib.py index 5d46b05..a4becf7 100644 --- a/app/browse/lib.py +++ b/app/browse/lib.py @@ -12,7 +12,7 @@ def fetch_ajax(endpoint, **kwargs): today = datetime.now(timezone.utc).strftime("%Y%m%d") # TODO: this is not cached any more! -> https://github.com/reclosedev/requests-cache/issues/154 - # Note: this 'innertube' API key exists since at least 2015: https://stackoverflow.com/q/33511165 + # TODO: replace host with youtubei.googleapis.com (used by android)? 
r = requests.post(f"https://www.youtube.com/youtubei/v1/{endpoint}?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", json={ **kwargs, 'context': {'client': { @@ -28,24 +28,14 @@ def fetch_ajax(endpoint, **kwargs): return r.json() -def canonicalize_channel(name, typ="c"): - if re.fullmatch(r"(UC[A-Za-z0-9_-]{22})", name): - return name +def canonicalize_channel(path): + if re.fullmatch(r"(UC[A-Za-z0-9_-]{22})", path): + return path - # get UCID of /c/ (vanity URLs): - today = datetime.now(timezone.utc).strftime("%Y%m%d") - typ += "/" if typ != "@" else "" - r = requests.get(f'https://www.youtube.com/{typ}{name}/about?pbj=1&hl=en_US', headers={ - 'x-youtube-client-name': '1', - 'x-youtube-client-version': f'2.{today}.01.01', # see fetch_searchresults() - }) - try: - return r.json()[1]['response']['metadata']['channelMetadataRenderer']['rssUrl'].split("=")[1] - except: - pass - - # unable to extract: - return None + # Note: for /watch, append query string, then return .endpoint.watchEndpoint.videoId + resolved = fetch_ajax("navigation/resolve_url", url=f"https://www.youtube.com/{path}") + channel_id = resolved.get('endpoint',{}).get('browseEndpoint',{}).get('browseId') + return channel_id def find_and_parse_error(result): error_obj = ( -- 2.39.3 From df351c6694ccb0b3874ae424ba5988699c68cea5 Mon Sep 17 00:00:00 2001 From: girst Date: Sat, 29 Apr 2023 13:04:46 +0000 Subject: [PATCH 08/16] remove vertical white space after closing all flashes --- app/static/style.css | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/app/static/style.css b/app/static/style.css index ef73d0d..1586add 100644 --- a/app/static/style.css +++ b/app/static/style.css @@ -29,13 +29,14 @@ article { } .flashes { - margin: 0 auto; - padding: .5em 1.5em; + margin: 1.5em auto 0; + padding: 0; box-sizing: border-box; max-width: 1200px; /* same as .articles */ } .flashes>li { + margin: .75em 0; display: block; border-radius: 5px; width: 100%; -- 2.39.3 From 
d99f64c52b1f5d786b05d76ab01b221c21e0ae41 Mon Sep 17 00:00:00 2001 From: girst Date: Sat, 29 Apr 2023 13:19:21 +0000 Subject: [PATCH 09/16] support attribution_link redirects --- app/browse/lib.py | 2 +- app/youtube/__init__.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/app/browse/lib.py b/app/browse/lib.py index a4becf7..e4e36b6 100644 --- a/app/browse/lib.py +++ b/app/browse/lib.py @@ -34,7 +34,7 @@ def canonicalize_channel(path): # Note: for /watch, append query string, then return .endpoint.watchEndpoint.videoId resolved = fetch_ajax("navigation/resolve_url", url=f"https://www.youtube.com/{path}") - channel_id = resolved.get('endpoint',{}).get('browseEndpoint',{}).get('browseId') + channel_id = (resolved or {}).get('endpoint',{}).get('browseEndpoint',{}).get('browseId') return channel_id def find_and_parse_error(result): diff --git a/app/youtube/__init__.py b/app/youtube/__init__.py index 6acff52..48f6a5f 100644 --- a/app/youtube/__init__.py +++ b/app/youtube/__init__.py @@ -236,6 +236,12 @@ def plain_user_or_video(something): # XXX: something == 'thethoughtemporium' -> 404s raise NotFound("Note: some usernames not recognized; try searching it") +@frontend.route('/attribution_link', strict_slashes=False) +def attribution_link(): + # /attribution_link?a=anything&u=/channel/UCZYTClx2T1of7BRZ86-8fow + # /attribution_link?a=JdfC0C9V6ZI&u=%2Fwatch%3Fv%3DEhxJLojIE_o%26feature%3Dshare + return redirect(request.args.get('u') or '/') + @frontend.route('/c//') @frontend.route('/c//') @frontend.route('/user//') -- 2.39.3 From 4d6ac61835207c3922d85870d9ce58a8c7051d5c Mon Sep 17 00:00:00 2001 From: girst Date: Sat, 29 Apr 2023 13:59:58 +0000 Subject: [PATCH 10/16] split common.innertube into youtube.cards and browse.innertube a few things that were used in both places (G, mkthumbs, log_unknown_card) now live in common.common. 
--- app/browse/__init__.py | 3 +- app/{common => browse}/innertube.py | 155 +--------------------------- app/browse/lib.py | 2 +- app/common/common.py | 38 ++++++- app/youtube/cards.py | 121 ++++++++++++++++++++++ app/youtube/lib.py | 3 +- 6 files changed, 161 insertions(+), 161 deletions(-) rename app/{common => browse}/innertube.py (70%) create mode 100644 app/youtube/cards.py diff --git a/app/browse/__init__.py b/app/browse/__init__.py index 4673dd6..af06fee 100644 --- a/app/browse/__init__.py +++ b/app/browse/__init__.py @@ -1,11 +1,12 @@ +import re import requests from flask import Blueprint, render_template, request, flash, g, url_for, redirect from flask_login import current_user from werkzeug.exceptions import BadRequest, NotFound from ..common.common import * -from ..common.innertube import * from .lib import * +from .innertube import prepare_searchresults, prepare_channel, prepare_playlist from .protobuf import make_sp, make_channel_params, make_playlist_params, Filters frontend = Blueprint('browse', __name__, diff --git a/app/common/innertube.py b/app/browse/innertube.py similarity index 70% rename from app/common/innertube.py rename to app/browse/innertube.py index 49d53ae..b47b6a6 100644 --- a/app/common/innertube.py +++ b/app/browse/innertube.py @@ -1,27 +1,7 @@ # functions that deal with parsing data from youtube's internal API ("innertube") -from urllib.parse import parse_qs, urlparse -import re +from ..common.common import mkthumbs, log_unknown_card, G -class G: - """ - null-coalescing version of dict.get() that also works on lists. - - the | operator is overloaded to achieve similar looking code to jq(1) filters. - the first found key is used: dict(foo=1)|G('bar','foo') returns 1. 
- """ - def __init__(self, *keys): - self.keys = keys - def __ror__(self, other): - for key in self.keys: - try: return other[key] - except: continue - return None - class _Text: - """ parses youtube's .runs[].text and .simpleText variants """ - def __ror__(self, other): # Note: only returning runs[0], not concat'ing all! - return other|G('simpleText') or other|G('runs')|G(0)|G('text') - text = _Text() class Select: """ |Select('foo') returns the first foo in list, |Select(all='foo') returns all foos. """ def __init__(self, key=None, *, all=None): @@ -66,14 +46,6 @@ def prepare_searchresults(yt_results): return items, extra, more -def prepare_infocards(metadata): - cards = metadata.get('cards',{}).get('cardCollectionRenderer',{}).get('cards',[]) - return list(filter(None, map(parse_infocard, cards))) - -def prepare_endcards(metadata): - endsc = metadata.get('endscreen',{}).get('endscreenRenderer',{}).get('elements',[]) - return list(filter(None, map(parse_endcard, endsc))) - def prepare_channel(response, channel_id, channel_name): meta1 = response|G('metadata')|G('channelMetadataRenderer') meta2 = response|G('microformat')|G('microformatDataRenderer') @@ -132,27 +104,6 @@ def prepare_playlist(result): return title, author, channel_id, list(filter(None, map(parse_playlist, unparsed))), more -def mkthumbs(thumbs): - output = {str(e['height']): e['url'] for e in thumbs} - largest=next(iter(sorted(output.keys(),reverse=True,key=int)),None) - return {**output, 'largest': largest} - -def clean_url(url): - # externals URLs are redirected through youtube.com/redirect, but we - # may encounter internal URLs, too - return parse_qs(urlparse(url).query).get('q',[url])[0] - -def toInt(s, fallback=0): - if s is None: - return fallback - try: - return int(''.join(filter(str.isdigit, s))) - except ValueError: - return fallback - -# Remove left-/rightmost word from string: -delL = lambda s: s.partition(' ')[2] - def age(s): if s is None: # missing from autogen'd music, some 
livestreams return None @@ -167,16 +118,6 @@ def age(s): return f"{value}{suffix}" -def log_unknown_card(data): - import json - try: - from flask import request - source = request.url - except: source = "unknown" - with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f: - f.write(f"\n/***** {source} *****/\n") - json.dump(data, f, indent=2) - def parse_result_items(items): # TODO: use .get() for most non-essential attributes """ @@ -264,100 +205,6 @@ def parse_result_items(items): log_unknown_card(item) return results, extras -def parse_infocard(card): - """ - parses a single infocard into a format that's easier to handle. - """ - card = card['cardRenderer'] - if not 'content' in card: - return None # probably the "View corrections" card, ignore. - ctype = list(card['content'].keys())[0] - content = card['content'][ctype] - if ctype == "pollRenderer": - return {'type': "POLL", 'content': { - 'question': content['question']['simpleText'], - 'answers': [(a['text']['simpleText'],a['numVotes']) \ - for a in content['choices']], - }} - elif ctype == "videoInfoCardContentRenderer": - is_live = content.get('badge',{}).get('liveBadgeRenderer') is not None - return {'type': "VIDEO", 'content': { - 'video_id': content['action']['watchEndpoint']['videoId'], - 'title': content['videoTitle']['simpleText'], - 'author': delL(content['channelName']['simpleText']), - 'length': content.get('lengthString',{}).get('simpleText') \ - if not is_live else "LIVE", # "23:03" - 'views': toInt(content.get('viewCountText',{}).get('simpleText')), - # XXX: views sometimes "Starts: July 31, 2020 at 1:30 PM" - }} - elif ctype == "playlistInfoCardContentRenderer": - return {'type': "PLAYLIST", 'content': { - 'playlist_id': content['action']['watchEndpoint']['playlistId'], - 'video_id': content['action']['watchEndpoint']['videoId'], - 'title': content['playlistTitle']['simpleText'], - 'author': delL(content['channelName']['simpleText']), - 'n_videos': 
toInt(content['playlistVideoCount']['simpleText']), - }} - elif ctype == "simpleCardContentRenderer" and \ - 'urlEndpoint' in content['command']: - return {'type': "WEBSITE", 'content': { - 'url': clean_url(content['command']['urlEndpoint']['url']), - 'domain': content['displayDomain']['simpleText'], - 'title': content['title']['simpleText'], - # XXX: no thumbnails for infocards - }} - elif ctype == "collaboratorInfoCardContentRenderer": - return {'type': "CHANNEL", 'content': { - 'channel_id': content['endpoint']['browseEndpoint']['browseId'], - 'title': content['channelName']['simpleText'], - 'icons': mkthumbs(content['channelAvatar']['thumbnails']), - 'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers" - }} - else: - log_unknown_card(card) - return None - -def parse_endcard(card): - """ - parses a single endcard into a format that's easier to handle. - """ - card = card.get('endscreenElementRenderer', card) #only sometimes nested - ctype = card['style'] - if ctype == "CHANNEL": - return {'type': ctype, 'content': { - 'channel_id': card['endpoint']['browseEndpoint']['browseId'], - 'title': card['title']|G.text, - 'icons': mkthumbs(card['image']['thumbnails']), - }} - elif ctype == "VIDEO": - if not 'endpoint' in card: return None # title == "This video is unavailable." 
- return {'type': ctype, 'content': { - 'video_id': card['endpoint']['watchEndpoint']['videoId'], - 'title': card['title']|G.text, - 'length': card|G('videoDuration')|G.text, # '12:21' - 'views': toInt(card['metadata']|G.text), - # XXX: no channel name - }} - elif ctype == "PLAYLIST": - return {'type': ctype, 'content': { - 'playlist_id': card['endpoint']['watchEndpoint']['playlistId'], - 'video_id': card['endpoint']['watchEndpoint']['videoId'], - 'title': card['title']|G.text, - 'author': delL(card['metadata']|G.text), - 'n_videos': toInt(card['playlistLength']|G.text), - }} - elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE": - url = clean_url(card['endpoint']['urlEndpoint']['url']) - return {'type': "WEBSITE", 'content': { - 'url': url, - 'domain': urlparse(url).netloc, - 'title': card['title']|G.text, - 'icons': mkthumbs(card['image']['thumbnails']), - }} - else: - log_unknown_card(card) - return None - def parse_channel_items(items, channel_id, author): result = [] extra = [] diff --git a/app/browse/lib.py b/app/browse/lib.py index e4e36b6..c389d38 100644 --- a/app/browse/lib.py +++ b/app/browse/lib.py @@ -2,7 +2,7 @@ import re import requests from datetime import datetime, timezone -from ..common.innertube import G +from ..common.common import G def fetch_ajax(endpoint, **kwargs): """ diff --git a/app/common/common.py b/app/common/common.py index 5a3639f..1045174 100644 --- a/app/common/common.py +++ b/app/common/common.py @@ -50,6 +50,26 @@ class _NSASession(OriginalSession): return response requests.Session = requests.sessions.Session = _NSASession +class G: + """ + null-coalescing version of dict.get() that also works on lists. + + the | operator is overloaded to achieve similar looking code to jq(1) filters. + the first found key is used: dict(foo=1)|G('bar','foo') returns 1. 
+ """ + def __init__(self, *keys): + self.keys = keys + def __ror__(self, other): + for key in self.keys: + try: return other[key] + except: continue + return None + class _Text: + """ parses youtube's .runs[].text and .simpleText variants """ + def __ror__(self, other): # Note: only returning runs[0], not concat'ing all! + return other|G('simpleText') or other|G('runs')|G(0)|G('text') + text = _Text() + def fetch_xml(feed_type, feed_id): # TODO: handle requests.exceptions.ConnectionError r = requests.get("https://www.youtube.com/feeds/videos.xml", { @@ -367,6 +387,11 @@ def video_metadata(metadata): 'shorts': is_short, } +def mkthumbs(thumbs): + output = {str(e['height']): e['url'] for e in thumbs} + largest=next(iter(sorted(output.keys(),reverse=True,key=int)),None) + return {**output, 'largest': largest} + def store_video_metadata(video_id): # check if we know about it, and if not, fetch and store video metadata with sqlite3.connect(cf['global']['database']) as conn: @@ -467,7 +492,12 @@ def flask_logger(msg, level="warning"): except: pass -def pp(*args): - from pprint import pprint - import sys, codecs - pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer)) +def log_unknown_card(data): + import json + try: + from flask import request + source = request.url + except: source = "unknown" + with open("/tmp/innertube.err", "a", encoding="utf-8", errors="backslashreplace") as f: + f.write(f"\n/***** {source} *****/\n") + json.dump(data, f, indent=2) diff --git a/app/youtube/cards.py b/app/youtube/cards.py new file mode 100644 index 0000000..2377a8f --- /dev/null +++ b/app/youtube/cards.py @@ -0,0 +1,121 @@ +from urllib.parse import parse_qs, urlparse + +from ..common.common import mkthumbs, log_unknown_card, G # TODO: temporary, will move to somewhere else in common + +def prepare_infocards(metadata): + cards = metadata.get('cards',{}).get('cardCollectionRenderer',{}).get('cards',[]) + return list(filter(None, map(parse_infocard, cards))) + +def 
prepare_endcards(metadata): + endsc = metadata.get('endscreen',{}).get('endscreenRenderer',{}).get('elements',[]) + return list(filter(None, map(parse_endcard, endsc))) + +def clean_url(url): + # externals URLs are redirected through youtube.com/redirect, but we + # may encounter internal URLs, too + return parse_qs(urlparse(url).query).get('q',[url])[0] + +def toInt(s, fallback=0): + if s is None: + return fallback + try: + return int(''.join(filter(str.isdigit, s))) + except ValueError: + return fallback + +# Remove left-/rightmost word from string: +delL = lambda s: s.partition(' ')[2] + +def parse_infocard(card): + """ + parses a single infocard into a format that's easier to handle. + """ + card = card['cardRenderer'] + if not 'content' in card: + return None # probably the "View corrections" card, ignore. + ctype = list(card['content'].keys())[0] + content = card['content'][ctype] + if ctype == "pollRenderer": + return {'type': "POLL", 'content': { + 'question': content['question']['simpleText'], + 'answers': [(a['text']['simpleText'],a['numVotes']) \ + for a in content['choices']], + }} + elif ctype == "videoInfoCardContentRenderer": + is_live = content.get('badge',{}).get('liveBadgeRenderer') is not None + return {'type': "VIDEO", 'content': { + 'video_id': content['action']['watchEndpoint']['videoId'], + 'title': content['videoTitle']['simpleText'], + 'author': delL(content['channelName']['simpleText']), + 'length': content.get('lengthString',{}).get('simpleText') \ + if not is_live else "LIVE", # "23:03" + 'views': toInt(content.get('viewCountText',{}).get('simpleText')), + # XXX: views sometimes "Starts: July 31, 2020 at 1:30 PM" + }} + elif ctype == "playlistInfoCardContentRenderer": + return {'type': "PLAYLIST", 'content': { + 'playlist_id': content['action']['watchEndpoint']['playlistId'], + 'video_id': content['action']['watchEndpoint']['videoId'], + 'title': content['playlistTitle']['simpleText'], + 'author': 
delL(content['channelName']['simpleText']), + 'n_videos': toInt(content['playlistVideoCount']['simpleText']), + }} + elif ctype == "simpleCardContentRenderer" and \ + 'urlEndpoint' in content['command']: + return {'type': "WEBSITE", 'content': { + 'url': clean_url(content['command']['urlEndpoint']['url']), + 'domain': content['displayDomain']['simpleText'], + 'title': content['title']['simpleText'], + # XXX: no thumbnails for infocards + }} + elif ctype == "collaboratorInfoCardContentRenderer": + return {'type': "CHANNEL", 'content': { + 'channel_id': content['endpoint']['browseEndpoint']['browseId'], + 'title': content['channelName']['simpleText'], + 'icons': mkthumbs(content['channelAvatar']['thumbnails']), + 'subscribers': content.get('subscriberCountText',{}).get('simpleText',''), # "545K subscribers" + }} + else: + log_unknown_card(card) + return None + +def parse_endcard(card): + """ + parses a single endcard into a format that's easier to handle. + """ + card = card.get('endscreenElementRenderer', card) #only sometimes nested + ctype = card['style'] + if ctype == "CHANNEL": + return {'type': ctype, 'content': { + 'channel_id': card['endpoint']['browseEndpoint']['browseId'], + 'title': card['title']|G.text, + 'icons': mkthumbs(card['image']['thumbnails']), + }} + elif ctype == "VIDEO": + if not 'endpoint' in card: return None # title == "This video is unavailable." 
+ return {'type': ctype, 'content': { + 'video_id': card['endpoint']['watchEndpoint']['videoId'], + 'title': card['title']|G.text, + 'length': card|G('videoDuration')|G.text, # '12:21' + 'views': toInt(card['metadata']|G.text), + # XXX: no channel name + }} + elif ctype == "PLAYLIST": + return {'type': ctype, 'content': { + 'playlist_id': card['endpoint']['watchEndpoint']['playlistId'], + 'video_id': card['endpoint']['watchEndpoint']['videoId'], + 'title': card['title']|G.text, + 'author': delL(card['metadata']|G.text), + 'n_videos': toInt(card['playlistLength']|G.text), + }} + elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE": + url = clean_url(card['endpoint']['urlEndpoint']['url']) + return {'type': "WEBSITE", 'content': { + 'url': url, + 'domain': urlparse(url).netloc, + 'title': card['title']|G.text, + 'icons': mkthumbs(card['image']['thumbnails']), + }} + else: + log_unknown_card(card) + return None diff --git a/app/youtube/lib.py b/app/youtube/lib.py index 9d42320..e9fe869 100644 --- a/app/youtube/lib.py +++ b/app/youtube/lib.py @@ -2,8 +2,9 @@ import re import requests from urllib.parse import urlparse +from .cards import prepare_infocards, prepare_endcards from ..common.common import video_metadata -from ..common.innertube import prepare_infocards, prepare_endcards, G +from ..common.common import G def prepare_metadata(metadata): meta = metadata['videoDetails'] -- 2.39.3 From 887dc64d607b369be5a2e213fe28be266f15f3dd Mon Sep 17 00:00:00 2001 From: girst Date: Sat, 29 Apr 2023 14:33:54 +0000 Subject: [PATCH 11/16] remove invidious redirection from ?show=raw not very useful (not understood by