]>
git.gir.st - subscriptionfeed.git/blob - app/common/common.py
8 from xml
. etree
import ElementTree
9 from configparser
import ConfigParser
10 from datetime
import datetime
, timezone
11 from urllib
. parse
import parse_qs
, urlparse
14 config_filename
= os
. environ
. get ( 'YT_CONFIG' , '/etc/yt/config.ini' )
15 cf
. read ( config_filename
)
16 if not 'global' in cf
: # todo: full config check
17 raise Exception ( "Configuration file not found or empty" )
19 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: exipre when video is livestream/premiere/etc
20 requests_cache
. install_cache ( backend
= 'memory' , expire_after
= 10 * 60 , allowable_codes
=( 200 ,))
22 # Note: this should only be required for the 'memory' backed cache.
23 # TODO: only run for long-running processes, i.e. the frontend
24 from threading
import Timer
26 requests_cache
. remove_expired_responses ()
27 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
32 # for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
33 # since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
38 from requests
import Session
as OriginalSession
39 class _NSASession ( OriginalSession
):
40 def request ( self
, method
, url
, params
= None , data
= None , ** kwargs
):
41 response
= super ( _NSASession
, self
). request (
42 method
, url
, params
, data
, ** kwargs
44 if 'api_requests' not in g
:
46 g
. api_requests
. append (( url
, params
, response
. text
))
48 requests
. Session
= requests
. sessions
. Session
= _NSASession
52 def fetch_xml ( feed_type
, feed_id
):
53 # TODO: handle requests.exceptions.ConnectionError
54 r
= requests
. get ( "https://www.youtube.com/feeds/videos.xml" , {
62 def parse_xml ( xmldata
):
64 'atom' : "http://www.w3.org/2005/Atom" ,
65 'yt' : "http://www.youtube.com/xml/schemas/2015" ,
66 'media' : "http://search.yahoo.com/mrss/" ,
67 'at' : "http://purl.org/atompub/tombstones/1.0" ,
70 feed
= ElementTree
. fromstring ( xmldata
)
71 if feed
. find ( 'at:deleted-entry' , ns
):
72 author
= feed
. find ( 'at:deleted-entry/at:by/name' , ns
). text
73 ref
= feed
. find ( 'at:deleted-entry' , ns
). get ( 'ref' )
74 ( _
, _
, video_id
) = ref
. rpartition ( ':' )
76 title
= feed
. find ( 'atom:title' , ns
). text
77 author
= feed
. find ( 'atom:author/atom:name' , ns
). text \
78 if feed
. find ( 'atom:author' , ns
) else None
80 for entry
in feed
. findall ( 'atom:entry' , ns
):
82 'video_id' : entry
. find ( 'yt:videoId' , ns
). text
,
83 'title' : entry
. find ( 'atom:title' , ns
). text
,
84 'published' : entry
. find ( 'atom:published' , ns
). text
,
85 'channel_id' : entry
. find ( 'yt:channelId' , ns
). text
,
86 'author' : entry
. find ( 'atom:author' , ns
). find ( 'atom:name' , ns
). text
,
87 # extra fields for pull_subs/webhook:
88 'updated' : entry
. find ( 'atom:updated' , ns
). text
,
91 return title
, author
, videos
93 def update_channel ( db
, xmldata
):
94 if not xmldata
: return False
96 # Note: websub does not return global author, hence taking from first video
97 title
, _
, videos
= parse_xml ( xmldata
)
99 # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)
102 for i
, video
in enumerate ( videos
):
103 now
= datetime
. now ( timezone
. utc
)
104 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
105 published
= dateutil
. parser
. parse ( video
[ 'published' ])
106 # if update and published time are near-identical, we assume it's new.
107 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
109 else : #, it's just an update to an older video.
110 timestamp
= published
113 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
114 VALUES (?, ?, ?, datetime(?), datetime(?))
123 if i
== 0 : # only required once per feed
125 INSERT OR REPLACE INTO channels (id, name)
127 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
132 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
134 returns: best-quality muxed video stream, player_response, error-type/mesage
135 error types: player, malformed, livestream, geolocked, exhausted
137 player_error
= None # for 'exhausted'
138 for el
in [ 'embedded' , 'detailpage' ]: #sometimes, only one or the other works
139 r
= requests
. get ( "https://www.youtube.com/get_video_info" , {
140 "video_id" : video_id
,
141 "eurl" : f
"https://youtube.googleapis.com/v/ {video_id} " ,
146 params
= parse_qs ( r
. text
)
147 if 'errorcode' in params
: # status=fail
148 return None , None , 'malformed' , params
[ 'reason' ][ 0 ]
150 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
151 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
152 if playabilityStatus
!= "OK" :
153 playabilityReason
= metadata
[ 'playabilityStatus' ][ 'reason' ]
154 player_error
= f
" {playabilityStatus} : {playabilityReason} "
155 if playabilityStatus
== "UNPLAYABLE" :
156 continue # try again with next el value (or fail as exhausted)
157 # without videoDetails, there's only the error message
158 maybe_metadata
= metadata
if 'videoDetails' in metadata
else None
159 return None , maybe_metadata
, 'player' , player_error
160 if metadata
[ 'videoDetails' ][ 'isLiveContent' ] and \
161 ( metadata
[ 'videoDetails' ]. get ( 'isLive' , False ) or \
162 metadata
[ 'videoDetails' ]. get ( 'isPostLiveDvr' , False )):
163 return None , metadata
, 'livestream' , None
165 if not 'formats' in metadata
[ 'streamingData' ]:
168 formats
= metadata
[ 'streamingData' ][ 'formats' ]
169 for ( i
, v
) in enumerate ( formats
):
170 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
171 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
172 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
174 # todo: check if we have urls or try again
175 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
177 if 'gcr' in parse_qs ( url
):
178 return None , metadata
, 'geolocked' , None
180 return url
, metadata
, None , None
182 return None , metadata
, 'exhausted' , player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Descramble a googlevideo stream signature and build the final URL.

    cipher: parse_qs()-style dict (values are lists) holding at least
            's' (scrambled signature) and 'url'; optionally 'sp' and 'sig'.
    algo:   space-separated ops extracted from the player JS, each one of
            r (reverse), sN (slice off first N chars), wN (swap char 0 and N).
    returns: the stream URL with the signature parameter appended.
    """
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        # ops without a number (plain 'r') get index 0; indices wrap.
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r':
            signature = list(reversed(signature))
        elif op == 's':
            signature = signature[ix:]
        elif op == 'w':
            signature[0], signature[ix] = signature[ix], signature[0]
    # parameter name defaults to 'signature'; a pre-descrambled 'sig'
    # (status=ok responses) wins over our own computation.
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
197 def prepare_metadata ( metadata
):
198 meta1
= metadata
[ 'videoDetails' ]
199 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
200 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] \
201 if 'cards' in metadata
else []
202 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] \
203 if 'endscreen' in metadata
else []
205 # the actual video streams have exact information:
207 sd
= metadata
[ 'streamingData' ]
208 some_stream
= ( sd
. get ( 'adaptiveFormats' ,[]) + sd
. get ( 'formats' ,[]))[ 0 ]
209 aspect_ratio
= some_stream
[ 'width' ] / some_stream
[ 'height' ]
210 # if that's unavailable (e.g. on livestreams), fall back to
211 # thumbnails (only either 4:3 or 16:9).
213 some_img
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ]
214 aspect_ratio
= some_img
[ 'width' ] / some_img
[ 'height' ]
217 { 'url' : cc
[ 'baseUrl' ],
218 'code' : cc
[ 'languageCode' ],
219 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
220 'name' : cc
[ 'name' ][ 'simpleText' ]}
221 for cc
in metadata
. get ( 'captions' ,{})
222 . get ( 'playerCaptionsTracklistRenderer' ,{})
223 . get ( 'captionTracks' ,[])
224 ], key
= lambda cc
: cc
[ 'autogenerated' ])
227 # externals URLs are redirected through youtube.com/redirect, but we
228 # may encounter internal URLs, too
229 return parse_qs ( urlparse ( url
). query
). get ( 'q' ,[ url
])[ 0 ]
# Remove left-/rightmost word from string:
def delL(s):
    """Drop the first space-separated word ('' if there is no space)."""
    return s.partition(' ')[2]

def delR(s):
    """Drop the last space-separated word ('' if there is no space)."""
    return s.rpartition(' ')[0]

# Thousands separator aware int():
def intT(s):
    """int() that tolerates ',' thousands separators, e.g. '1,234'."""
    return int(s.replace(',', ''))
236 def parse_infocard ( card
):
237 card
= card
[ 'cardRenderer' ]
238 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
239 content
= card
[ 'content' ][ ctype
]
240 if ctype
== "pollRenderer" :
243 'question' : content
[ 'question' ][ 'simpleText' ],
244 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) \
245 for a
in content
[ 'choices' ]],
247 elif ctype
== "videoInfoCardContentRenderer" :
249 # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
250 # TODO: this is ugly; cleanup.
251 is_live
= content
. get ( 'badge' ,{}). get ( 'liveBadgeRenderer' ,{})
252 length
= is_live
. get ( 'label' ,{}). get ( 'simpleText' ) or content
[ 'lengthString' ][ 'simpleText' ] # '23:03'
254 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
255 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
256 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
258 'views' : intT ( delR ( content
[ 'viewCountText' ][ 'simpleText' ])),
260 elif ctype
== "playlistInfoCardContentRenderer" :
263 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
264 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
265 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
266 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
267 'n_videos' : intT ( content
[ 'playlistVideoCount' ][ 'simpleText' ]),
269 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
[ 'command' ]:
272 'url' : clean_url ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]),
273 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
274 'title' : content
[ 'title' ][ 'simpleText' ],
275 # XXX: no thumbnails for infocards
277 elif ctype
== "collaboratorInfoCardContentRenderer" :
280 'channel_id' : content
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
281 'title' : content
[ 'channelName' ][ 'simpleText' ],
282 'icons' : mkthumbs ( content
[ 'channelAvatar' ][ 'thumbnails' ]),
283 'subscribers' : content
[ 'subscriberCountText' ][ 'simpleText' ], # "545K subscribers"
287 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
289 return { 'type' : ctype
, 'content' : content
}
def mkthumbs(thumbs):
    """Map thumbnail height -> url from a list of thumbnail dicts."""
    return {e['height']: e['url'] for e in thumbs}
293 def parse_endcard ( card
):
294 card
= card
. get ( 'endscreenElementRenderer' , card
) #only sometimes nested
295 ctype
= card
[ 'style' ]
296 if ctype
== "CHANNEL" :
298 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
299 'title' : card
[ 'title' ][ 'simpleText' ],
300 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
302 elif ctype
== "VIDEO" :
304 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
305 'title' : card
[ 'title' ][ 'simpleText' ],
306 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
307 'views' : delR ( card
[ 'metadata' ][ 'simpleText' ]),
308 # XXX: no channel name
310 elif ctype
== "PLAYLIST" :
312 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
313 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
314 'title' : card
[ 'title' ][ 'simpleText' ],
315 'author' : delL ( card
[ 'metadata' ][ 'simpleText' ]),
316 'n_videos' : intT ( delR ( card
[ 'playlistLength' ][ 'simpleText' ])),
318 elif ctype
== "WEBSITE" or ctype
== "CREATOR_MERCHANDISE" :
320 url
= clean_url ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ])
323 'domain' : urlparse ( url
). netloc
,
324 'title' : card
[ 'title' ][ 'simpleText' ],
325 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
329 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
331 return { 'type' : ctype
, 'content' : content
}
333 infocards
= [ parse_infocard ( card
) for card
in cards
]
334 endcards
= [ parse_endcard ( card
) for card
in endsc
]
335 # combine cards to weed out duplicates. for videos and playlists prefer
336 # infocards, for channels and websites prefer endcards, as those have more
337 # information than the other.
338 # if the card type is not in ident, we use the whole card for comparison
339 # (otherwise they'd all replace each other)
340 ident
= { # ctype -> ident
342 'PLAYLIST' : 'playlist_id' ,
343 'CHANNEL' : 'channel_id' ,
347 getident
= lambda c
: c
[ 'content' ]. get ( ident
. get ( c
[ 'type' ]), c
)
348 mkexclude
= lambda cards
, types
: [ getident ( c
) for c
in cards
if c
[ 'type' ] in types
]
349 exclude
= lambda cards
, without
: [ c
for c
in cards
if getident ( c
) not in without
]
351 allcards
= exclude ( infocards
, mkexclude ( endcards
, [ 'CHANNEL' , 'WEBSITE' ])) + \
352 exclude ( endcards
, mkexclude ( infocards
, [ 'VIDEO' , 'PLAYLIST' ]))
354 all_countries
= """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
355 BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
356 CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
357 ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
358 GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
359 KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
360 ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
361 NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
362 RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
363 SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
364 VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""" . split ()
365 whitelisted
= sorted ( meta2
. get ( 'availableCountries' ,[]))
366 blacklisted
= sorted ( set ( all_countries
) - set ( whitelisted
))
369 'title' : meta1
[ 'title' ],
370 'author' : meta1
[ 'author' ],
371 'channel_id' : meta1
[ 'channelId' ],
372 'description' : meta1
[ 'shortDescription' ],
373 'published' : meta2
[ 'publishDate' ],
374 'views' : meta1
[ 'viewCount' ],
375 'length' : int ( meta1
[ 'lengthSeconds' ]),
376 'rating' : meta1
[ 'averageRating' ],
377 'category' : meta2
[ 'category' ],
378 'aspectr' : aspect_ratio
,
379 'unlisted' : meta2
[ 'isUnlisted' ],
380 'countries' : whitelisted
,
381 'blacklisted' : blacklisted
,
382 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
383 'infocards' : infocards
,
384 'endcards' : endcards
,
385 'all_cards' : allcards
,
386 'subtitles' : subtitles
,
class RedditException(Exception):
    """Raised when the reddit listing endpoint returns a non-ok or malformed response."""
    pass
390 def fetch_reddit ( subreddits
, sorted_by
= "hot" , time
= None , *, limit
= 36 ,
391 count
= None , before
= None , after
= None ):
393 fetches data from a subreddit (or a multireddit like gif+gifs) and
394 filters/sorts results.
395 sorted_by values: hot, new, rising, controversial, top
396 time values: hour, day, week, month, year, all (for top and controversial)
397 returns a tuple of ([ {video} ],before,after)
399 # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
402 return [], None , None
404 query
= { k
: v
for k
, v
in {
408 'limit' : limit
, # 1..100 (default 25)
409 't' : time
, # hour,week,month,year,all
411 multireddit
= '+' . join ( subreddits
)
412 r
= requests
. get ( f
"https://old.reddit.com/r/ {multireddit} / {sorted_by} .json" ,
413 query
, headers
={ 'User-Agent' : 'Mozilla/5.0' })
414 if not r
. ok
or not 'data' in r
. json ():
415 raise RedditException ( r
. text
)
418 entries
= sorted ( r
. json ()[ 'data' ][ 'children' ], key
= lambda e
: e
[ 'data' ][ 'score' ] > 1 , reverse
= True )
419 for entry
in entries
:
421 if e
[ 'domain' ] not in [ 'youtube.com' , 'youtu.be' , 'invidio.us' ]:
424 # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
425 video_id
= re
. match ( r
'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)' , e
[ 'url' ]). group ( 1 )
427 continue # XXX: should we log that?
428 if not video_id
: continue
430 'video_id' : video_id
,
431 'title' : html
. unescape ( e
[ 'title' ]), # Note: we unescape and re-escape in the template
432 'url' : e
[ 'permalink' ],
433 'n_comments' : e
[ 'num_comments' ],
434 'n_karma' : e
[ 'score' ],
435 'subreddit' : e
[ 'subreddit' ],
438 before
= r
. json ()[ 'data' ][ 'before' ]
439 after
= r
. json ()[ 'data' ][ 'after' ]
441 return videos
, before
, after
class NoFallbackException(Exception):
    """Raised by fallback_route() when no further endpoint matches the current url rule."""
    pass
444 def fallback_route (* args
, ** kwargs
): # TODO: worthy as a flask-extension?
446 finds the next route that matches the current url rule, and executes it.
447 args, kwargs: pass all arguments of the current route
449 from flask
import current_app
, request
, g
450 from werkzeug
. exceptions
import NotFound
452 # build a list of endpoints that match the current request's url rule:
455 for rule
in current_app
. url_map
. iter_rules ()
456 if rule
. rule
== request
. url_rule
. rule
458 current
= matching
. index ( request
. endpoint
)
460 # since we can't change request.endpoint, we always get the original
461 # endpoint back. so for repeated fall throughs, we use the g object to
462 # increment how often we want to fall through.
463 if not '_fallback_next' in g
:
465 g
._ fallback
_ next
+= 1
467 next_ep
= current
+ g
._ fallback
_ next
469 if next_ep
< len ( matching
):
470 return current_app
. view_functions
[ matching
[ next_ep
]](* args
, ** kwargs
)
472 raise NoFallbackException
475 from pprint
import pprint
477 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))