# app/common/common.py (git.gir.st - subscriptionfeed.git)
import re
import os
import sys
import json
import html
import codecs
import pprint
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if not 'global' in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
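# illustrative sketch (not part of the original file): with the cache
# installed, a second identical GET within the expiry window is answered from
# memory; requests_cache marks such responses with a `from_cache` attribute.
#   r1 = requests.get("https://example.com/")  # hits the network
#   r2 = requests.get("https://example.com/")  # r2.from_cache == True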
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.start()
# for debugging purposes, monkey-patch the requests session to store each
# request (url and response) in the flask request's g object. we can then use
# a flask error_handler to include the request data in the error log.
# since this module is also used outside the flask appcontext (e.g. utils.py),
# the g access is wrapped in a try/except block.
from flask import g
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
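# illustrative sketch (not part of the original file): an error handler that
# reads the captured data; `app` is an assumed flask application object.
#   @app.errorhandler(Exception)
#   def internal_error(e):
#       for url, params, text in getattr(g, 'api_requests', []):
#           app.logger.error("api request: %s %r -> %.200s", url, params, text)
#       return "internal server error", 500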
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None
    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry', ns):
        author = feed.find('at:deleted-entry/at:by/name', ns).text
        ref = feed.find('at:deleted-entry', ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        # assumption: hand the tombstone back to the caller in the same shape
        return None, author, [{'video_id': video_id, 'deleted': True}]
    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
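# illustrative usage (not part of the original file; the channel id is just an
# example value):
#   xmldata = fetch_xml('channel_id', 'UCsXVk37bltHxD1rDPwtNM8Q')
#   if xmldata:
#       title, author, videos = parse_xml(xmldata)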
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if updated and published times are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp,
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))

    db.commit()
    return True
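# illustrative sketch (not part of the original file): the statements above
# assume a sqlite schema roughly like the following; the actual schema lives
# elsewhere in the repo.
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);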
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if not 'formats' in metadata['streamingData']:
            continue # no muxed streams under this el value; try the next one

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
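# illustrative usage (not part of the original file):
#   url, meta, error, errdetail = get_video_info("UxxajLWwzqY")
#   if error:  # one of: player, malformed, livestream, geolocked, exhausted
#       ...    # handle per the docstring; otherwise `url` is playable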
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated'])
    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]
    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
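    # examples (not part of the original file):
    #   delL("by Some Channel")  -> "Some Channel"
    #   delR("1,234,567 views")  -> "1,234,567"
    #   intT("1,234,567")        -> 1234567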
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes']) \
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those have more
    # information than the other.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
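    # example (not part of the original file): given
    #   infocards == [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    #   endcards  == [{'type': 'VIDEO', 'content': {'video_id': 'abc', ...}}]
    # getident() yields 'abc' for both, so the endcard copy is excluded and
    # allcards keeps only the (richer) infocard.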
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    return r.json()
def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    return r.json()
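# illustrative usage (not part of the original file; subreddit names are
# examples):
#   listing = fetch_reddit(['videos', 'youtube'], sorted_by="top", time="week")
#   post = fetch_reddit_post("h7mjes")  # id without the 't3_' prefix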
def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError: # re.match returned None: not a watchable url
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })

    return videos
class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if not '_fallback_next' in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
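# illustrative sketch (not part of the original file): two flask routes that
# share one url rule; when the first can't serve the request, it falls through
# to the second. `app` and both view names are assumptions.
#   @app.route('/watch')
#   def watch_local():
#       if not have_local_copy():  # hypothetical predicate
#           return fallback_route()
#       ...
#   @app.route('/watch')
#   def watch_proxy():
#       ...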
def pp(*args):
    # pretty-print to stderr for debugging.
    # (note: the wrapper name 'pp' is an assumption; the enclosing 'def' line
    # was lost in this listing.)
    from pprint import pprint
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))