]>
git.gir.st - subscriptionfeed.git/blob - app/common/common.py
10 from xml
. etree
import ElementTree
11 from configparser
import ConfigParser
12 from datetime
import datetime
, timezone
13 from urllib
. parse
import parse_qs
, urlparse
16 config_filename
= os
. environ
. get ( 'YT_CONFIG' , '/etc/yt/config.ini' )
17 cf
. read ( config_filename
)
18 if not 'global' in cf
: # todo: full config check
19 raise Exception ( "Configuration file not found or empty" )
21 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: exipre when video is livestream/premiere/etc
22 requests_cache
. install_cache ( backend
= 'memory' , expire_after
= 10 * 60 , allowable_codes
=( 200 ,), allowable_methods
=( 'GET' , 'HEAD' , 'POST' ))
24 # Note: requests-cache doesn't use redis expiry, so we need this in all backends:
25 # https://github.com/reclosedev/requests-cache/issues/58#issuecomment-164537971
26 # TODO: only run for long-running processes, i.e. the frontend
27 from threading
import Timer
29 requests_cache
. remove_expired_responses ()
30 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
35 # for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
36 # since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
39 from requests
import Session
as OriginalSession
40 class _NSASession ( OriginalSession
):
41 def request ( self
, method
, url
, params
= None , data
= None , json
= None , ** kwargs
):
42 response
= super ( _NSASession
, self
). request (
43 method
, url
, params
= params
, data
= data
, json
= json
, ** kwargs
46 if 'api_requests' not in g
:
48 g
. api_requests
. append (( url
, params
, json
, response
. text
))
49 except RuntimeError : pass # not within flask (e.g. utils.py)
51 requests
. Session
= requests
. sessions
. Session
= _NSASession
55 null-coalescing version of dict.get() that also works on lists.
57 the | operator is overloaded to achieve similar looking code to jq(1) filters.
58 the first found key is used: dict(foo=1)|G('bar','foo') returns 1.
60 def __init__ ( self
, * keys
):
62 def __ror__ ( self
, other
):
64 try : return other
[ key
]
    """ parses youtube's .runs[].text and .simpleText variants """
    def __ror__(self, other): # Note: only returning runs[0], not concat'ing all!
        # Prefer the flat .simpleText field when present; otherwise fall back
        # to .runs[0].text. Every lookup null-coalesces through G, so a missing
        # key yields a falsy value instead of raising.
        return other|G('simpleText') or other|G('runs')|G(0)|G('text')
73 def fetch_xml ( feed_type
, feed_id
):
74 # TODO: handle requests.exceptions.ConnectionError
75 r
= requests
. get ( "https://www.youtube.com/feeds/videos.xml" , {
83 def parse_xml ( xmldata
):
85 'atom' : "http://www.w3.org/2005/Atom" ,
86 'yt' : "http://www.youtube.com/xml/schemas/2015" ,
87 'media' : "http://search.yahoo.com/mrss/" ,
88 'at' : "http://purl.org/atompub/tombstones/1.0" ,
91 feed
= ElementTree
. fromstring ( xmldata
)
93 if feed
. find ( 'at:deleted-entry' , ns
):
94 del_entry
= feed
. find ( 'at:deleted-entry' , ns
)
95 del_author
= del_entry
. find ( 'at:by' , ns
)
96 _
, _
, vid
= del_entry
. get ( 'ref' ). rpartition ( ':' )
97 _
, _
, channel_id
= del_author
. find ( 'atom:uri' , ns
). text
. rpartition ( '/' )
98 author
= del_author
. find ( 'atom:name' , ns
). text
102 'channel_id' : channel_id
,
105 return None , None , entry
, None , None
107 title
= feed
. find ( 'atom:title' , ns
). text
108 author
= feed
. find ( 'atom:author/atom:name' , ns
). text \
109 if feed
. find ( 'atom:author' , ns
) else None
110 # for /user/<> endpoint: find out UC-id:
111 # for playlists: this is who created the playlist:
112 try : channel_id
= feed
. find ( 'yt:channelId' , ns
). text
113 except : channel_id
= None
114 # for pullsub: if this exists, we're looking at a playlist:
115 try : playlist_id
= feed
. find ( 'yt:playlistId' , ns
). text
116 except : playlist_id
= None
118 for entry
in feed
. findall ( 'atom:entry' , ns
):
120 'video_id' : entry
. find ( 'yt:videoId' , ns
). text
,
121 'title' : entry
. find ( 'atom:title' , ns
). text
,
122 'published' : entry
. find ( 'atom:published' , ns
). text
,
123 'channel_id' : entry
. find ( 'yt:channelId' , ns
). text
,
124 'author' : entry
. find ( 'atom:author' , ns
). find ( 'atom:name' , ns
). text
,
125 # extra fields for pull_subs/webhook:
126 'updated' : entry
. find ( 'atom:updated' , ns
). text
,
129 return title
, author
, videos
, channel_id
, playlist_id
131 def update_channel ( db
, xmldata
, from_webhook
= False ):
132 if not xmldata
: return False
134 # Note: websub does not return global author, hence taking from first video
135 title
, author
, videos
, channel
, playlist
= parse_xml ( xmldata
)
138 for i
, video
in enumerate ( videos
):
139 if video
. get ( 'deleted' ):
140 # Note: Deletion events are not just fired for actual deletions,
141 # but also for unlisting videos and livestreams that just ended
142 # (even postLiveDVR ones). Hence, we don't follow it.
143 flask_logger ( f
"ignoring deleted/unlisted video or ended livestream {video['video_id']} by {video['channel_id']} ({video['author']})" )
146 c
. execute ( "SELECT 1 FROM videos WHERE id=?" ,( video
[ 'video_id' ],))
147 new_video
= len ( c
. fetchall ()) < 1
149 # TODO: call store_video_metadata(video_id) here instead and pass video-fallback-metadata to it
150 _
, _
, meta
, _
, _
= get_video_info ( video
[ 'video_id' ], metaOnly
= True )
151 # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
152 # video gets uploaded as unlisted on day A and set to public on day B;
153 # the webhook is sent on day B, but 'published' says A. The video
154 # therefore looks like it's just an update to an older video).
155 # g_v_i gives is the date the video was published to viewers, so we
156 # prefer that. But since g_v_i only returns the date without time,
157 # we still use xmlfeed's date if it's the same date.
158 published
= dateutil
. parser
. parse ( video
[ 'published' ])
164 meta
= video_metadata ( meta
)
165 published2
= dateutil
. parser
. parse ( meta
[ 'published' ])
166 if published
< published2
: # g_v_i date is more accurate:
167 published
= published2
168 length
= meta
[ 'length' ]
169 livestream
= meta
[ 'livestream' ]
170 premiere
= meta
[ 'premiere' ]
171 shorts
= meta
[ 'shorts' ]
173 now
= datetime
. now ( timezone
. utc
)
175 # we pretend that all videos uploaded this week were uploaded just
176 # now, so the user sees it at the top of the feed, and it doesn't
177 # get inserted somewhere further down.
178 if ( now
- published
). days
< 7 :
180 else : #, it's just an update to an older video.
181 timestamp
= published
184 INSERT OR IGNORE INTO videos
185 (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
186 VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
199 # update video title (everything else can't change)
201 UPDATE OR IGNORE videos
209 # for channels, this is obviously always the same, but playlists can
210 # consist of videos from different channels:
211 if i
== 0 or playlist
:
213 INSERT OR REPLACE INTO channels (id, name)
215 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
217 # keep track of which videos are in a playlist, so we can show the user
218 # why a video is in their feed:
221 INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
223 """ , ( video
[ 'video_id' ], playlist
))
225 if playlist
and not from_webhook
: # Note: playlists can't get updated via websub
227 INSERT OR REPLACE INTO playlists (id, name, author)
229 """ , ( playlist
, title
, channel
))
231 INSERT OR REPLACE INTO channels (id, name)
233 """ , ( channel
, author
))
239 def is_agegated ( metadata
):
240 playabilityStatus
= metadata
[ 'playabilityStatus' ]
242 playabilityStatus
. get ( "status" ) == "CONTENT_CHECK_REQUIRED"
243 or playabilityStatus
. get ( "desktopLegacyAgeGateReason" )
246 def get_video_info ( video_id
, *, metaOnly
= False , _agegate_bypass
= False ):
248 returns: best-quality muxed video stream, stream map, player_response, error-type/mesage
249 error types: player, malformed, livestream, geolocked, agegated, no-url, exhausted
251 player_error
, metadata
= None , None # for 'exhausted'
252 today
= datetime
. now ( timezone
. utc
). strftime ( "%Y%m %d " )
253 key
= "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" if metaOnly
or _agegate_bypass
else "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w"
254 # ANDROID returns streams that are not throttled or cipher-scambled, but less metadata than WEB.
255 # TVHTML5* returns throttled and possibly ciphered streams, but bypasses age-gate. atm, we don't decipher them.
256 # TODO: unscramble TVHTML5* streams (especially &n= throttling)
258 ( False , False ): { 'clientName' : 'ANDROID' , 'clientVersion' : '18.11.34' , 'androidSdkVersion' : 30 },
259 ( False , True ): { 'clientName' : 'TVHTML5_SIMPLY_EMBEDDED_PLAYER' , 'clientVersion' : '2.0' },
260 ( True , False ): { 'clientName' : 'WEB' , 'clientVersion' : f
'2. {today} .01.01' },
261 }[( metaOnly
, _agegate_bypass
)]
262 r
= requests
. post ( "https://youtubei.googleapis.com/youtubei/v1/player" , params
={ 'key' : key
}, json
={
270 'thirdParty' : { 'embedUrl' : 'https://www.youtube.com/' }
272 "racyCheckOk" : True , # seems to do nothing, cargo-culted
273 "contentCheckOk" : True , # fix "This video may be inappropriate for some users."
274 "params" : "CgIIAQ%3D%3D" , # otherwise googlevideo URLs become 403/Forbidden after a few accesses (breaks buffering/scrubbing) and ANDROID client gets blocked
275 }, headers
={ "User-Agent" : "com.google.android.youtube/18.11.34 (Linux; U; Android 11) gzip" })
277 if not r
or r
. status_code
== 429 :
278 return None , None , None , 'banned' , 'possible IP ban'
281 if "error" in metadata
:
282 return None , None , metadata
, "malformed" , metadata
. get ( "error" ,{}). get ( "message" , "" )
283 real_vid
= metadata
. get ( "videoDetails" , {}). get ( "videoId" )
284 if video_id
!= real_vid
and real_vid
in ( "M5t4UHllkUM" , "aQvGIIdgFDM" ):
285 # youtube redirected us to a clip called "Video Not Available". indicates a long-term ip ban.
286 return None , None , {}, "banned" , "instance is probably ip banned"
287 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
288 if playabilityStatus
!= "OK" :
289 playabilityReason
= metadata
[ 'playabilityStatus' ]. get ( 'reason' ,
290 '//' . join ( metadata
[ 'playabilityStatus' ]. get ( 'messages' ,[])))
291 player_error
= f
" {playabilityStatus} : {playabilityReason} "
292 if ( is_agegated ( metadata
)
293 and not metaOnly
# only need metadata (e.g. called from pubsubhubbub)
294 and not _agegate_bypass
296 _
, _
, metadata_embed
, error_embed
, errormsg_embed
= get_video_info ( video_id
, _agegate_bypass
= True )
297 if error_embed
== "player" : # agegate bypass failed?
298 return None , None , metadata
, 'agegated' , player_error
299 elif not error_embed
or error_embed
in ( 'livestream' , 'geolocked' , 'scrambled' , 'throttled' ):
300 metadata
= metadata_embed
302 return None , None , metadata
, error_embed
, errormsg_embed
304 # without videoDetails, there's only the error message
305 maybe_metadata
= metadata
if 'videoDetails' in metadata
else None
306 return None , None , maybe_metadata
, 'player' , player_error
308 # livestreams have no adaptive/muxed formats:
309 is_live
= metadata
[ 'videoDetails' ]. get ( 'isLive' , False )
311 if not 'formats' in metadata
[ 'streamingData' ] and not is_live
:
312 return None , None , metadata
, 'no-url' , player_error
314 formats
= metadata
[ 'streamingData' ]. get ( 'formats' ,[])
315 adaptive
= metadata
[ 'streamingData' ]. get ( 'adaptiveFormats' ,[])
317 'adaptive_video' : [ a
for a
in adaptive
if a
[ 'mimeType' ]. startswith ( 'video/' )],
318 'adaptive_audio' : [ a
for a
in adaptive
if a
[ 'mimeType' ]. startswith ( 'audio/' )],
320 'hlsManifestUrl' : metadata
[ 'streamingData' ]. get ( 'hlsManifestUrl' ),
324 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
326 query
= parse_qs ( urlparse ( url
). query
)
327 # ip-locked videos can be recovered if the proxy module is loaded:
328 is_geolocked
= 'gcr' in query
329 # "n-signature" requires javascript descrambling (not implemented):
330 is_throttled
= 'ns' in query
336 is_drm
= formats
and 'signatureCipher' in formats
[ 0 ]
338 nonfatal
= 'livestream' if is_live \
339 else 'geolocked' if is_geolocked \
340 else 'scrambled' if is_drm \
341 else 'throttled' if is_throttled \
344 return url
, stream_map
, metadata
, nonfatal
, None
346 def video_metadata ( metadata
):
350 meta1
= metadata
[ 'videoDetails' ]
351 # With ANDROID player API, we don't get microformat => no publishDate!
352 meta2
= metadata
. get ( 'microformat' ,{}). get ( 'playerMicroformatRenderer' ,{})
354 # sometimes, we receive the notification so early that the length is not
355 # yet populated. Nothing we can do about it. meta1 and meta2 use a
356 # different rounding strategy, meta2 is sometimes (incorrectly) 1s longer.
357 length
= int ( meta1
. get ( 'lengthSeconds' , 0 )) or int ( meta2
. get ( 'lengthSeconds' , 0 )) or None
359 views
= int ( meta1
[ 'viewCount' ]) if 'viewCount' in meta1
else None
361 scheduled_time
= metadata
. get ( 'playabilityStatus' ,{}) \
362 . get ( 'liveStreamability' ,{}). get ( 'liveStreamabilityRenderer' ,{}) \
363 . get ( 'offlineSlate' ,{}). get ( 'liveStreamOfflineSlateRenderer' ,{}) \
364 . get ( 'scheduledStartTime' )
366 scheduled_time
= datetime
. fromtimestamp ( int ( scheduled_time
)) \
367 . strftime ( "%Y-%m- %d T%H:%M:%SZ" )
369 meta2
. get ( 'liveBroadcastDetails' ,{}) . get ( 'startTimestamp' ) or
371 meta2
. get ( 'publishDate' , '1970-01-01T00:00:00Z' )
374 # the actual video streams have exact information:
375 # Note that we use x:1 (cinema style) aspect ratios, omitting the ':1' part.
377 sd
= metadata
[ 'streamingData' ]
378 some_stream
= ( sd
. get ( 'adaptiveFormats' ,[]) + sd
. get ( 'formats' ,[]))[ 0 ]
379 aspect_ratio
= some_stream
[ 'width' ] / some_stream
[ 'height' ]
380 # if that's unavailable (e.g. on livestreams), fall back to 16:9 (later)
384 is_livestream
= meta1
[ 'isLiveContent' ]
385 is_premiere
= meta1
. get ( 'isUpcoming' , False ) and not is_livestream
386 # shorts are <= 60 seconds and vertical or square. they can't be premiere
387 # or livestreams. if we were unable to determine it, we set it to None.
389 True if ( length
or 61 ) <= 60 and ( aspect_ratio
or 2 ) <= 1 else
390 False if ( length
or 0 ) > 60 or ( aspect_ratio
or 0 ) > 1 else
391 None if not is_premiere
and not is_livestream
else False
394 # Note: 'premiere' videos have livestream=False and published= will be the
395 # start of the premiere.
397 'title' : meta1
[ 'title' ],
398 'author' : meta1
[ 'author' ],
399 'channel_id' : meta1
[ 'channelId' ],
400 'published' : published_at
,
403 'aspect' : aspect_ratio
or 16 / 9 ,
404 'livestream' : is_livestream
,
405 'premiere' : is_premiere
,
def mkthumbs(thumbs):
    """Map thumbnail heights to URLs and tag the largest available size.

    thumbs: iterable of youtube thumbnail dicts ({'height': ..., 'url': ...}).
    Returns {str(height): url, ..., 'largest': str(height) or None}; the
    'largest' key is None when *thumbs* is empty.
    """
    output = {str(e['height']): e['url'] for e in thumbs}
    # max() with a default picks the tallest key in one O(n) pass instead of
    # sorting the whole key list just to take its first element.
    largest = max(output.keys(), key=int, default=None)
    return {**output, 'largest': largest}
414 def store_video_metadata ( video_id
):
415 # check if we know about it, and if not, fetch and store video metadata
416 with sqlite3
. connect ( cf
[ 'global' ][ 'database' ]) as conn
:
418 c
. execute ( "SELECT 1 from videos where id = ?" , ( video_id
,))
419 new_video
= len ( c
. fetchall ()) < 1
421 _
, _
, meta
, _
, _
= get_video_info ( video_id
, metaOnly
= True )
423 meta
= video_metadata ( meta
)
425 INSERT OR IGNORE INTO videos (id, channel_id, title, length, livestream, premiere, shorts, published, crawled)
426 VALUES (?, ?, ?, ?, ?, ?, ?, datetime(?), datetime(?))
439 INSERT OR REPLACE INTO channels (id, name)
441 """ , ( meta
[ 'channel_id' ], meta
[ 'author' ]))
443 def fetch_video_flags ( token
, video_ids
):
444 with sqlite3
. connect ( cf
[ 'global' ][ 'database' ]) as conn
:
447 SELECT video_id,display
450 AND display IS NOT NULL
452 -- AND display = 'pinned'
453 """ . format ( "," . join ([ "?" ]* len ( video_ids
))), ( token
,* video_ids
))
455 pinned
= [ video
for video
, disp
in flags
if disp
== 'pinned' ]
456 hidden
= [ video
for video
, disp
in flags
if disp
== 'hidden' ]
458 return pinned
, hidden
460 def apply_video_flags ( token
, rows
, settings
={}):
461 video_ids
= [ card
[ 'content' ][ 'video_id' ] for card
in rows
if 'video_id' in card
[ 'content' ]]
462 pinned
, hidden
= fetch_video_flags ( token
, video_ids
)
463 noshorts
= settings
. get ( 'noshorts' ) or False
465 { 'type' : v
[ 'type' ], 'content' :{** v
[ 'content' ], 'pinned' : v
[ 'content' ][ 'video_id' ] in pinned
if 'video_id' in v
[ 'content' ] else False }}
468 'video_id' not in v
[ 'content' ] or v
[ 'content' ][ 'video_id' ] not in hidden
470 not ( noshorts
and v
[ 'content' ]. get ( 'shorts' ))
472 ], key
= lambda v
: v
[ 'content' ][ 'pinned' ], reverse
= True )
474 from werkzeug
. exceptions
import NotFound
# Raised by fallback_route() when no further endpoint matches the current url
# rule; subclasses werkzeug's NotFound so it surfaces as a 404 by default.
class NoFallbackException(NotFound): pass
476 def fallback_route (* args
, ** kwargs
): # TODO: worthy as a flask-extension?
478 finds the next route that matches the current url rule, and executes it.
479 args, kwargs: pass all arguments of the current route
481 from flask
import current_app
, request
, g
483 # build a list of endpoints that match the current request's url rule:
486 for rule
in current_app
. url_map
. iter_rules ()
487 if rule
. rule
== request
. url_rule
. rule
489 current
= matching
. index ( request
. endpoint
)
491 # since we can't change request.endpoint, we always get the original
492 # endpoint back. so for repeated fall throughs, we use the g object to
493 # increment how often we want to fall through.
494 if not '_fallback_next' in g
:
496 g
._ fallback
_ next
+= 1
498 next_ep
= current
+ g
._ fallback
_ next
500 if next_ep
< len ( matching
):
501 return current_app
. view_functions
[ matching
[ next_ep
]](* args
, ** kwargs
)
503 raise NoFallbackException
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """Sign "{feed_id}:{timestamp}:{nonce}" with a SHA-1 HMAC, as required
    by websub/pubsubhubbub. Returns the hex digest."""
    message = "{}:{}:{}".format(feed_id, timestamp, nonce).encode('ascii')
    mac = hmac.new(key.encode('ascii'), message, hashlib.sha1)
    return mac.hexdigest()
def websub_body_hmac(key, body):
    """Return the hex SHA-1 HMAC of a raw request *body* (bytes), keyed by
    the ascii-encoded *key* — used to verify websub POST payloads."""
    mac = hmac.new(key.encode('ascii'), body, hashlib.sha1)
    return mac.hexdigest()
513 def flask_logger ( msg
, level
= "warning" ):
521 ). get ( level
. upper (), 0 )
523 from flask
import current_app
524 current_app
. logger
. log ( level
, msg
)
528 def log_unknown_card ( data
):
531 from flask
import request
533 except : source
= "unknown"
534 with
open ( "/tmp/innertube.err" , "a" , encoding
= "utf-8" , errors
= "backslashreplace" ) as f
:
535 f
. write ( f
" \n /***** {source} *****/ \n " )
536 json
. dump ( data
, f
, indent
= 2 )