]>
git.gir.st - subscriptionfeed.git/blob - app/common/common.py
10 from xml
. etree
import ElementTree
11 from configparser
import ConfigParser
12 from datetime
import datetime
, timezone
13 from urllib
. parse
import parse_qs
, urlparse
16 config_filename
= os
. environ
. get ( 'YT_CONFIG' , '/etc/yt/config.ini' )
17 cf
. read ( config_filename
)
18 if not 'global' in cf
: # todo: full config check
19 raise Exception ( "Configuration file not found or empty" )
21 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: exipre when video is livestream/premiere/etc
22 requests_cache
. install_cache ( backend
= 'memory' , expire_after
= 10 * 60 , allowable_codes
=( 200 ,))
24 # Note: this should only be required for the 'memory' backed cache.
25 # TODO: only run for long-running processes, i.e. the frontend
26 from threading
import Timer
28 requests_cache
. remove_expired_responses ()
29 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
34 # for debugging purposes, monkey patch requests session to store each requests-request in a flask-request's g object (url and response). we can then use a flask error_handler to include the request data in the error log.
35 # since we also call config from outside the flask appcontext, it is wrapped in a try-catch block.
38 from requests
import Session
as OriginalSession
39 class _NSASession ( OriginalSession
):
40 def request ( self
, method
, url
, params
= None , data
= None , ** kwargs
):
41 response
= super ( _NSASession
, self
). request (
42 method
, url
, params
, data
, ** kwargs
45 if 'api_requests' not in g
:
47 g
. api_requests
. append (( url
, params
, response
. text
))
48 except RuntimeError : pass # not within flask (e.g. utils.py)
50 requests
. Session
= requests
. sessions
. Session
= _NSASession
52 def fetch_xml ( feed_type
, feed_id
):
53 # TODO: handle requests.exceptions.ConnectionError
54 r
= requests
. get ( "https://www.youtube.com/feeds/videos.xml" , {
62 def parse_xml ( xmldata
):
64 'atom' : "http://www.w3.org/2005/Atom" ,
65 'yt' : "http://www.youtube.com/xml/schemas/2015" ,
66 'media' : "http://search.yahoo.com/mrss/" ,
67 'at' : "http://purl.org/atompub/tombstones/1.0" ,
70 feed
= ElementTree
. fromstring ( xmldata
)
71 if feed
. find ( 'at:deleted-entry' , ns
):
72 ( _
, _
, vid
) = feed
. find ( 'at:deleted-entry' , ns
). get ( 'ref' ). rpartition ( ':' )
73 return None , None , [{ 'deleted' : True , 'video_id' : vid
}]
74 #author = feed.find('at:deleted-entry/at:by/atom:name',ns).text
75 #channel_url = feed.find('at:deleted-entry/at:by/atom:uri',ns).text
76 #match = re.search(r"(UC[A-Za-z0-9_-]{22})", channel_url)
77 #channel_id = match.group(1) if match else None
78 #ref = feed.find('at:deleted-entry',ns).get('ref')
79 #(_, _, video_id) = ref.rpartition(':')
80 #return None, None, [{
81 # 'video_id': video_id,
83 # 'channel_id': channel_id,
86 title
= feed
. find ( 'atom:title' , ns
). text
87 author
= feed
. find ( 'atom:author/atom:name' , ns
). text \
88 if feed
. find ( 'atom:author' , ns
) else None
90 for entry
in feed
. findall ( 'atom:entry' , ns
):
92 'video_id' : entry
. find ( 'yt:videoId' , ns
). text
,
93 'title' : entry
. find ( 'atom:title' , ns
). text
,
94 'published' : entry
. find ( 'atom:published' , ns
). text
,
95 'channel_id' : entry
. find ( 'yt:channelId' , ns
). text
,
96 'author' : entry
. find ( 'atom:author' , ns
). find ( 'atom:name' , ns
). text
,
97 # extra fields for pull_subs/webhook:
98 'updated' : entry
. find ( 'atom:updated' , ns
). text
,
101 return title
, author
, videos
103 def update_channel ( db
, xmldata
):
104 if not xmldata
: return False
106 # Note: websub does not return global author, hence taking from first video
107 _
, _
, videos
= parse_xml ( xmldata
)
110 for i
, video
in enumerate ( videos
):
111 if video
. get ( 'deleted' ):
112 from flask
import current_app
113 current_app
. logger
. info ( f
"ignoring deleted video {video_id} from {channel_id} " ) # XXX: remove
114 # TODO: enable once we enforce hmac validation:
115 #c.execute("DELETE FROM videos WHERE id = ?", (video['video_id'],))
117 now
= datetime
. now ( timezone
. utc
)
118 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
119 published
= dateutil
. parser
. parse ( video
[ 'published' ])
120 # if update and published time are near-identical, we assume it's new.
121 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
123 else : #, it's just an update to an older video.
124 timestamp
= published
127 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
128 VALUES (?, ?, ?, datetime(?), datetime(?))
137 if i
== 0 : # only required once per feed
139 INSERT OR REPLACE INTO channels (id, name)
141 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
146 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
148 returns: best-quality muxed video stream, player_response, error-type/mesage
149 error types: player, malformed, livestream, geolocked, exhausted
151 player_error
= None # for 'exhausted'
152 for el
in [ 'embedded' , 'detailpage' ]: #sometimes, only one or the other works
153 r
= requests
. get ( "https://www.youtube.com/get_video_info" , {
154 "video_id" : video_id
,
155 "eurl" : f
"https://youtube.googleapis.com/v/ {video_id} " ,
160 params
= parse_qs ( r
. text
)
161 if 'errorcode' in params
: # status=fail
162 return None , None , 'malformed' , params
[ 'reason' ][ 0 ]
164 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
165 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
166 if playabilityStatus
!= "OK" :
167 playabilityReason
= metadata
[ 'playabilityStatus' ]. get ( 'reason' ,
168 '//' . join ( metadata
[ 'playabilityStatus' ]. get ( 'messages' ,[])))
169 player_error
= f
" {playabilityStatus} : {playabilityReason} "
170 if playabilityStatus
== "UNPLAYABLE" :
171 continue # try again with next el value (or fail as exhausted)
172 # without videoDetails, there's only the error message
173 maybe_metadata
= metadata
if 'videoDetails' in metadata
else None
174 return None , maybe_metadata
, 'player' , player_error
175 if metadata
[ 'videoDetails' ][ 'isLiveContent' ] and \
176 ( metadata
[ 'videoDetails' ]. get ( 'isLive' , False ) or \
177 metadata
[ 'videoDetails' ]. get ( 'isPostLiveDvr' , False )):
178 return None , metadata
, 'livestream' , None
180 if not 'formats' in metadata
[ 'streamingData' ]:
183 formats
= metadata
[ 'streamingData' ][ 'formats' ]
184 for ( i
, v
) in enumerate ( formats
):
185 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
186 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
187 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
189 # todo: check if we have urls or try again
190 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
192 if 'gcr' in parse_qs ( url
):
193 return None , metadata
, 'geolocked' , None
195 return url
, metadata
, None , None
197 return None , metadata
, 'exhausted' , player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Descramble a stream signature and append it to the stream URL.

    cipher: parse_qs-style dict ('s', 'url', optional 'sp'/'sig' keys,
            each mapping to a one-element list).
    algo:   space-separated op list; each op is r (reverse), sN (slice
            off the first N chars) or wN (swap char 0 with char N).
    Returns the stream URL with '&<param>=<signature>' appended.
    """
    chars = list(cipher['s'][0])
    for step in algo.split():
        op, num = re.match(r"([rsw])(\d+)?", step).groups()
        # ops without a number (plain 'r') default to index 0;
        # indices are taken modulo the current signature length.
        idx = int(num) % len(chars) if num else 0
        if op == 'r':
            chars.reverse()
        elif op == 's':
            chars = chars[idx:]
        elif op == 'w':
            chars[0], chars[idx] = chars[idx], chars[0]
    # a pre-descrambled 'sig' in the cipher wins over our computation;
    # parameter name defaults to 'signature' when 'sp' is absent.
    param = cipher.get('sp', ['signature'])[0]
    value = cipher.get('sig', [''.join(chars)])[0]
    return f"{cipher['url'][0]}&{param}={value}"
212 def prepare_metadata ( metadata
):
213 meta1
= metadata
[ 'videoDetails' ]
214 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
215 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] \
216 if 'cards' in metadata
else []
217 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] \
218 if 'endscreen' in metadata
else []
220 # the actual video streams have exact information:
222 sd
= metadata
[ 'streamingData' ]
223 some_stream
= ( sd
. get ( 'adaptiveFormats' ,[]) + sd
. get ( 'formats' ,[]))[ 0 ]
224 aspect_ratio
= some_stream
[ 'width' ] / some_stream
[ 'height' ]
225 # if that's unavailable (e.g. on livestreams), fall back to
226 # thumbnails (only either 4:3 or 16:9).
228 some_img
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ]
229 aspect_ratio
= some_img
[ 'width' ] / some_img
[ 'height' ]
232 { 'url' : cc
[ 'baseUrl' ],
233 'code' : cc
[ 'languageCode' ],
234 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
235 'name' : cc
[ 'name' ][ 'simpleText' ]}
236 for cc
in metadata
. get ( 'captions' ,{})
237 . get ( 'playerCaptionsTracklistRenderer' ,{})
238 . get ( 'captionTracks' ,[])
239 ], key
= lambda cc
: cc
[ 'autogenerated' ])
242 # externals URLs are redirected through youtube.com/redirect, but we
243 # may encounter internal URLs, too
244 return parse_qs ( urlparse ( url
). query
). get ( 'q' ,[ url
])[ 0 ]
# Remove left-/rightmost word from string.
# (PEP 8 E731: plain defs instead of lambda assignments; call interface unchanged.)
def delL(s):
    """Drop the first space-separated word of *s* ('' if *s* has no space)."""
    return s.partition(' ')[2]

def delR(s):
    """Drop the last space-separated word of *s* ('' if *s* has no space)."""
    return s.rpartition(' ')[0]

# Thousands seperator aware int():
def intT(s):
    """Parse an int that may contain ',' thousands separators, e.g. '1,234'."""
    return int(s.replace(',', ''))
251 def parse_infocard ( card
):
252 card
= card
[ 'cardRenderer' ]
253 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
254 content
= card
[ 'content' ][ ctype
]
255 if ctype
== "pollRenderer" :
258 'question' : content
[ 'question' ][ 'simpleText' ],
259 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) \
260 for a
in content
[ 'choices' ]],
262 elif ctype
== "videoInfoCardContentRenderer" :
264 # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
265 # TODO: this is ugly; cleanup.
266 is_live
= content
. get ( 'badge' ,{}). get ( 'liveBadgeRenderer' ,{})
267 length
= is_live
. get ( 'label' ,{}). get ( 'simpleText' ) or content
[ 'lengthString' ][ 'simpleText' ] # '23:03'
269 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
270 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
271 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
273 'views' : intT ( delR ( content
[ 'viewCountText' ][ 'simpleText' ])),
275 elif ctype
== "playlistInfoCardContentRenderer" :
278 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
279 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
280 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
281 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
282 'n_videos' : intT ( content
[ 'playlistVideoCount' ][ 'simpleText' ]),
284 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
[ 'command' ]:
287 'url' : clean_url ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]),
288 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
289 'title' : content
[ 'title' ][ 'simpleText' ],
290 # XXX: no thumbnails for infocards
292 elif ctype
== "collaboratorInfoCardContentRenderer" :
295 'channel_id' : content
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
296 'title' : content
[ 'channelName' ][ 'simpleText' ],
297 'icons' : mkthumbs ( content
[ 'channelAvatar' ][ 'thumbnails' ]),
298 'subscribers' : content
[ 'subscriberCountText' ][ 'simpleText' ], # "545K subscribers"
302 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
304 return { 'type' : ctype
, 'content' : content
}
def mkthumbs(thumbs):
    """Index a list of thumbnail dicts by height: {height: url, ...}."""
    by_height = {}
    for thumb in thumbs:
        by_height[thumb['height']] = thumb['url']
    return by_height
308 def parse_endcard ( card
):
309 card
= card
. get ( 'endscreenElementRenderer' , card
) #only sometimes nested
310 ctype
= card
[ 'style' ]
311 if ctype
== "CHANNEL" :
313 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
314 'title' : card
[ 'title' ][ 'simpleText' ],
315 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
317 elif ctype
== "VIDEO" :
319 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ], # XXX: KeyError 'endpoint' exception (no idea which youtube video this was on)
320 'title' : card
[ 'title' ][ 'simpleText' ],
321 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
322 'views' : delR ( card
[ 'metadata' ][ 'simpleText' ]),
323 # XXX: no channel name
325 elif ctype
== "PLAYLIST" :
327 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
328 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
329 'title' : card
[ 'title' ][ 'simpleText' ],
330 'author' : delL ( card
[ 'metadata' ][ 'simpleText' ]),
331 'n_videos' : intT ( delR ( card
[ 'playlistLength' ][ 'simpleText' ])),
333 elif ctype
== "WEBSITE" or ctype
== "CREATOR_MERCHANDISE" :
335 url
= clean_url ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ])
338 'domain' : urlparse ( url
). netloc
,
339 'title' : card
[ 'title' ][ 'simpleText' ],
340 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
344 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
346 return { 'type' : ctype
, 'content' : content
}
348 infocards
= [ parse_infocard ( card
) for card
in cards
]
349 endcards
= [ parse_endcard ( card
) for card
in endsc
]
350 # combine cards to weed out duplicates. for videos and playlists prefer
351 # infocards, for channels and websites prefer endcards, as those have more
352 # information than the other.
353 # if the card type is not in ident, we use the whole card for comparison
354 # (otherwise they'd all replace each other)
355 ident
= { # ctype -> ident
357 'PLAYLIST' : 'playlist_id' ,
358 'CHANNEL' : 'channel_id' ,
362 getident
= lambda c
: c
[ 'content' ]. get ( ident
. get ( c
[ 'type' ]), c
)
363 mkexclude
= lambda cards
, types
: [ getident ( c
) for c
in cards
if c
[ 'type' ] in types
]
364 exclude
= lambda cards
, without
: [ c
for c
in cards
if getident ( c
) not in without
]
366 allcards
= exclude ( infocards
, mkexclude ( endcards
, [ 'CHANNEL' , 'WEBSITE' ])) + \
367 exclude ( endcards
, mkexclude ( infocards
, [ 'VIDEO' , 'PLAYLIST' ]))
369 all_countries
= """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
370 BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
371 CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
372 ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
373 GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
374 KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
375 ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
376 NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
377 RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
378 SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
379 VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""" . split ()
380 whitelisted
= sorted ( meta2
. get ( 'availableCountries' ,[]))
381 blacklisted
= sorted ( set ( all_countries
) - set ( whitelisted
))
384 'title' : meta1
[ 'title' ],
385 'author' : meta1
[ 'author' ],
386 'channel_id' : meta1
[ 'channelId' ],
387 'description' : meta1
[ 'shortDescription' ],
388 'published' : meta2
[ 'publishDate' ],
389 'views' : meta1
[ 'viewCount' ],
390 'length' : int ( meta1
[ 'lengthSeconds' ]),
391 'rating' : meta1
[ 'averageRating' ],
392 'category' : meta2
[ 'category' ],
393 'aspectr' : aspect_ratio
,
394 'unlisted' : meta2
[ 'isUnlisted' ],
395 'countries' : whitelisted
,
396 'blacklisted' : blacklisted
,
397 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
398 'infocards' : infocards
,
399 'endcards' : endcards
,
400 'all_cards' : allcards
,
401 'subtitles' : subtitles
,
class RedditException(Exception):
    """Raised when a reddit API response is not OK or lacks a 'data' field."""
405 def fetch_reddit ( subreddits
, sorted_by
= "hot" , time
= None , *, limit
= 36 ,
406 count
= None , before
= None , after
= None ):
408 fetches data from a subreddit (or a multireddit like gif+gifs) and
409 filters/sorts results.
410 sorted_by values: hot, new, rising, controversial, top
411 time values: hour, day, week, month, year, all (for top and controversial)
417 query
= { k
: v
for k
, v
in {
421 'limit' : limit
, # 1..100 (default 25)
422 't' : time
, # hour,week,month,year,all
424 multireddit
= '+' . join ( subreddits
)
425 r
= requests
. get ( f
"https://old.reddit.com/r/ {multireddit} / {sorted_by} .json" ,
426 query
, headers
={ 'User-Agent' : 'Mozilla/5.0' })
427 if not r
. ok
or not 'data' in r
. json ():
428 raise RedditException ( r
. text
)
432 def fetch_reddit_post ( post_id
):
433 # Note: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
434 r
= requests
. get ( f
"https://old.reddit.com/by_id/t3_ {post_id} .json" ,
435 headers
={ 'User-Agent' : 'Mozilla/5.0' })
436 if not r
. ok
or not 'data' in r
. json ():
437 raise RedditException ( r
. text
)
441 def parse_reddit_videos ( data
):
443 entries
= sorted ( data
[ 'data' ][ 'children' ],
444 key
= lambda e
: e
[ 'data' ][ 'score' ] > 1 ,
446 for entry
in entries
:
448 if e
[ 'domain' ] not in [ 'youtube.com' , 'youtu.be' , 'invidio.us' ]:
451 # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
452 video_id
= re
. match ( r
'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)' , e
[ 'url' ]). group ( 1 )
454 continue # XXX: should we log that?
455 if not video_id
: continue
457 'video_id' : video_id
,
458 'title' : html
. unescape ( e
[ 'title' ]), # Note: we unescape and re-escape in the template
459 'url' : e
[ 'permalink' ],
460 'n_comments' : e
[ 'num_comments' ],
461 'n_karma' : e
[ 'score' ],
462 'subreddit' : e
[ 'subreddit' ],
class NoFallbackException(Exception):
    """Raised when no further route matches the current url rule."""
469 def fallback_route (* args
, ** kwargs
): # TODO: worthy as a flask-extension?
471 finds the next route that matches the current url rule, and executes it.
472 args, kwargs: pass all arguments of the current route
474 from flask
import current_app
, request
, g
475 from werkzeug
. exceptions
import NotFound
477 # build a list of endpoints that match the current request's url rule:
480 for rule
in current_app
. url_map
. iter_rules ()
481 if rule
. rule
== request
. url_rule
. rule
483 current
= matching
. index ( request
. endpoint
)
485 # since we can't change request.endpoint, we always get the original
486 # endpoint back. so for repeated fall throughs, we use the g object to
487 # increment how often we want to fall through.
488 if not '_fallback_next' in g
:
490 g
._ fallback
_ next
+= 1
492 next_ep
= current
+ g
._ fallback
_ next
494 if next_ep
< len ( matching
):
495 return current_app
. view_functions
[ matching
[ next_ep
]](* args
, ** kwargs
)
497 raise NoFallbackException
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    message = f"{feed_id}:{timestamp}:{nonce}"
    mac = hmac.new(key.encode('ascii'), message.encode('ascii'), hashlib.sha1)
    return mac.hexdigest()
def websub_body_hmac(key, body):
    """Return the hex sha1 hmac of *body* (bytes) keyed by *key* (str)."""
    mac = hmac.new(key.encode('ascii'), body, hashlib.sha1)
    return mac.hexdigest()
508 from pprint
import pprint
510 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))