# app/common/common.py (from subscriptionfeed.git @ git.gir.st)
import os
import re
import sys
import json
import html
import hmac
import codecs
import hashlib
import pprint
import requests
import requests_cache
import dateutil.parser
from flask import g
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse
cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but caching that long makes reddit very stale and premiere videos
# won't start. TODO: expire based on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # don't keep the process alive just for the purge timer
    t.start()
purge_cache(10*60)
# For debugging purposes, monkey patch the requests session to store each
# requests-request in a flask-request's g object (url, params and response
# text). We can then use a flask error_handler to include the request data
# in the error log. Since requests are also made from outside the flask
# appcontext, the access to g is wrapped in a try-except block.
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
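# A minimal sketch of the error handler mentioned above (hypothetical names;
# the real handler would live in the frontend, not in this module): it reads
# g.api_requests to dump every upstream call made during the failing request.
#   @app.errorhandler(Exception)
#   def log_with_api_requests(e):
#       for url, params, response_text in g.get('api_requests', []):
#           app.logger.error(f"{url} {params}: {response_text[:200]}")
#       return "internal server error", 500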
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None
    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    # Note: websub sends an at:deleted-entry tombstone instead of a feed.
    # (explicit None checks: Element truthiness depends on having children)
    if feed.find('at:deleted-entry', ns) is not None:
        (_, _, vid) = feed.find('at:deleted-entry', ns).get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}]

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None

    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
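# Shape sketch of the return value (illustrative): a regular feed yields
# (title, author, [video-dicts]); a websub tombstone yields
# (None, None, [{'deleted': True, 'video_id': ...}]).
#   title, author, videos = parse_xml(xmldata)
#   # videos[0] -> {'video_id': ..., 'title': ..., 'published': ...,
#   #               'channel_id': ..., 'author': ..., 'updated': ...}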
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    _, _, videos = parse_xml(xmldata)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            from flask import current_app # XXX: remove
            current_app.logger.info(f"ignoring deleted video {video['video_id']}") # XXX: remove
            # TODO: enable once we enforce hmac validation:
            #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
            continue

        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # If updated and published times are near-identical, we assume the
        # video is new; otherwise it's just an update to an older video.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else:
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp,
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()
    return True
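# Usage sketch (assuming a sqlite3 connection with the videos/channels schema
# used above; the database path is illustrative):
#   import sqlite3
#   db = sqlite3.connect('/var/lib/yt/subscriptions.db')
#   update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx'))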
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no muxed streams returned; try the other el value

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated'])
    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]

    # Remove left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator aware int():
    intT = lambda s: int(s.replace(',', ''))
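    # Examples (pure helpers, illustrative inputs):
    #   delL('by Some Channel') == 'Some Channel'
    #   delR('1,234 views') == '1,234'
    #   intT('1,234') == 1234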
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a
            # "LIVE NOW" badge. TODO: this is ugly; clean up.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') \
                or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE" # normalize merch cards to plain websites
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # Combine cards to weed out duplicates. For videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # If the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other).
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
        exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
    BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
    CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
    ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
    GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
    KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
    ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
    NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
    RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
    SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
    VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'whitelisted': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
        count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, day, week, month, year, all (for top and controversial)
    """
    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time, # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
        query, headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
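# Usage sketch (subreddit names illustrative): fetch the 'top' listing of a
# multireddit for the past week, then extract the youtube links with
# parse_reddit_videos (defined below):
#   data = fetch_reddit(['videos', 'mealtimevideos'], sorted_by='top', time='week')
#   videos = parse_reddit_videos(data)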
def fetch_reddit_post(post_id):
    # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
    r = requests.get(f"https://old.reddit.com/by_id/t3_{post_id}.json",
        headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    return r.json()
def parse_reddit_videos(data):
    videos = []
    entries = sorted(data['data']['children'],
        key=lambda e: e['data']['score'] > 1,
        reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError: # regex did not match
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': html.unescape(e['title']), # Note: we unescape and re-escape in the template
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })

    return videos
class NoFallbackException(Exception): pass
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g
    from werkzeug.exceptions import NotFound

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall-throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
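# Usage sketch (route and helper names are illustrative): two views are
# registered on the same rule; if the first can't serve the request, it
# delegates to the next matching endpoint:
#   @app.route('/channel/<cid>', endpoint='channel_cached')
#   def channel_cached(cid):
#       if not in_cache(cid): # hypothetical helper
#           return fallback_route(cid) # falls through to the next endpoint
#       ...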
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    sig_input = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    return hmac.new(key.encode('ascii'), sig_input, hashlib.sha1).hexdigest()

def websub_body_hmac(key, body):
    """ generate sha1 hmac of a raw request body (websub X-Hub-Signature) """
    return hmac.new(key.encode('ascii'), body, hashlib.sha1).hexdigest()
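# Usage sketch: the callback URL embeds timestamp/nonce plus the url hmac;
# on each websub POST, the X-Hub-Signature header ("sha1=<hexdigest>") is
# checked against the body hmac. (How callers verify it is an assumption:)
#   expected = websub_body_hmac(secret, request.data)
#   ok = hmac.compare_digest(f"sha1={expected}",
#                            request.headers.get('X-Hub-Signature', ''))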
def pp(*args): # debug helper: pretty-print any arguments to stderr (utf-8 safe)
    from pprint import pprint
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))