# git.gir.st - subscriptionfeed.git: app/common/common.py
import os
import re
import json
import html
import pprint
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but that makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # don't block process exit (assumed detail)
    t.start()
purge_cache(10*60)
# for debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since this also gets called from outside the flask appcontext, the access
# to g is wrapped in a try-except block.
from flask import g

from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: # not executing within a flask request context
            pass
        return response
requests.Session = requests.sessions.Session = _NSASession
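# sketch of a matching error handler (assumed; `app` and the handler name are
# not part of this module):
#@app.errorhandler(Exception)
#def internal_error(e):
#    for url, params, response_text in g.get('api_requests', []):
#        app.logger.error(f"{url} {params}: {response_text[:200]}")
#    return "Internal Server Error", 500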
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry', ns):
        author = feed.find('at:deleted-entry/at:by/name', ns).text
        ref = feed.find('at:deleted-entry', ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        # tombstone: the video was deleted; there is no feed title.
        # (exact return shape assumed; callers check `title` for None.)
        return None, author, []
    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented
    # after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (video['video_id'], video['channel_id'], video['title'],
              video['published'], timestamp))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
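# illustrative: an entry whose 'updated' is within 60s of 'published' and whose
# 'published' is less than 7 days old counts as newly published (crawled = now);
# anything else is a metadata edit and keeps its publish date as crawl time.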
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no muxed streams; try the next el value

        formats = metadata['streamingData']['formats']
        for i, v in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
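# caller sketch (assumed usage; sts and algo come from the youtube player JS):
#   url, meta, error, detail = get_video_info("UxxajLWwzqY", sts=sts, algo=algo)
#   if error is None: the caller can stream `url`; otherwise dispatch on error.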
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated'])

    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
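    # illustrative examples for the helpers above (hypothetical inputs):
    #   delL("by Channel Name")  -> "Channel Name"
    #   delR("1,234 views")      -> "1,234"
    #   intT("1,234")            -> 1234
    #   clean_url("https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com")
    #                            -> "https://example.com"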
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes']) \
                    for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
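    # e.g. mkthumbs([{'width': 120, 'height': 90, 'url': '/img/90.jpg'}])
    #      -> {90: '/img/90.jpg'} (illustrative input)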
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE" # normalize merchandise cards, too
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
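    # illustrative: a video referenced by both an infocard and an endcard shares
    # its 'video_id' ident, so the endcard copy is dropped from allcards while
    # the richer infocard copy is kept.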
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
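# assumed usage:
#   data = fetch_reddit(['videos', 'youtubehaiku'], sorted_by="top", time="week")
#   videos = parse_reddit_videos(data)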
def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError: # regex did not match
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })

    return videos
class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
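# usage sketch (assumed; route and helper names are hypothetical):
#   @app.route('/watch')
#   def watch_extractor():
#       if not can_handle(request.args): # hypothetical predicate
#           return fallback_route() # fall through to the next /watch endpoint
#       ...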
def pp(*args): # note: helper name and signature are assumed from the call below
    """ dump args to stderr for debugging """
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))