# app/common/common.py
10 from xml
. etree
import ElementTree
11 from configparser
import ConfigParser
12 from datetime
import datetime
, timezone
13 from urllib
. parse
import parse_qs
, urlparse
# Load the configuration; the YT_CONFIG environment variable overrides the
# default path. Fails hard at import time so misconfiguration is caught early.
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf = ConfigParser()
cf.read(config_filename)
if 'global' not in cf: # todo: full config check
    raise Exception("Configuration file not found or empty")
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(
    backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    """ drop expired cache entries, then re-arm a timer to do it again. """
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True  # NOTE(review): reconstructed; don't block interpreter exit
    t.start()
purge_cache(10*60)
# for debugging purposes, monkey patch requests session to store each
# requests-request in a flask-request's g object (url and response). we can
# then use a flask error_handler to include the request data in the error log.
# since we also call config from outside the flask appcontext, the g access
# is wrapped in a try-catch block.
from requests import Session as OriginalSession
class _NSASession(OriginalSession):
    def request(self, method, url, params=None, data=None, **kwargs):
        # perform the real HTTP request first:
        response = super(_NSASession, self).request(
            method, url, params, data, **kwargs
        )
        try:
            # record (url, params, body) on flask's per-request g object:
            if 'api_requests' not in g:
                g.api_requests = []
            g.api_requests.append((url, params, response.text))
        except RuntimeError: pass # not within flask (e.g. utils.py)
        return response
requests.Session = requests.sessions.Session = _NSASession
def fetch_xml(feed_type, feed_id):
    """
    fetch the atom feed for a channel/playlist from youtube.
    feed_type: 'channel_id' or 'playlist_id' (used as the query key).
    Returns the raw XML text, or None on a non-2xx response.
    """
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id,
    })
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    """
    Parse a youtube atom feed (or websub POST body) into
    (title, author, videos, channel_id, playlist_id).
    For tombstone feeds (deleted/unlisted videos) a single pseudo-video
    dict with {'deleted': True} is returned and the other fields are None.
    """
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)

    # Fix: truth-testing an Element is deprecated and wrong for childless
    # elements (a childless <at:deleted-entry/> is falsy) -- compare to None.
    deleted = feed.find('at:deleted-entry', ns)
    if deleted is not None:
        (_, _, vid) = deleted.get('ref').rpartition(':')
        return None, None, [{'deleted': True, 'video_id': vid}], None, None

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) is not None else None
    # for /user/<> endpoint: find out UC-id:
    # for playlists: this is who created the playlist:
    # Fix: only swallow AttributeError (find() returned None -> no .text),
    # instead of a bare except that hides real errors.
    try: channel_id = feed.find('yt:channelId', ns).text
    except AttributeError: channel_id = None
    # for pullsub: if this exists, we're looking at a playlist:
    try: playlist_id = feed.find('yt:playlistId', ns).text
    except AttributeError: playlist_id = None

    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos, channel_id, playlist_id
100 def update_channel ( db
, xmldata
, from_webhook
= False ):
101 if not xmldata
: return False
103 # Note: websub does not return global author, hence taking from first video
104 title
, author
, videos
, channel
, playlist
= parse_xml ( xmldata
)
107 for i
, video
in enumerate ( videos
):
108 if video
. get ( 'deleted' ):
109 # Note: Deletion events are not just fired for actual deletions,
110 # but also for unlisting videos and livestreams that just ended
111 # (even postLiveDVR ones). Hence, we don't follow it.
112 flask_logger ( f
"ignoring deleted/unlisted/ended video/stream {video['video_id']}" )
115 c
. execute ( "SELECT 1 FROM videos WHERE id=?" ,( video
[ 'video_id' ],))
116 new_video
= len ( c
. fetchall ()) < 1
118 flask_logger ( f
"new video {video['video_id']}" )
119 _
, _
, meta
, _
, _
= get_video_info ( video
[ 'video_id' ])
120 # The 'published' timestamp sent in websub POSTs are often wrong (e.g.:
121 # video gets uploaded as unlisted on day A and set to public on day B;
122 # the webhook is sent on day B, but 'published' says A. The video
123 # therefore looks like it's just an update to an older video).
124 # g_v_i gives is the date the video was published to viewers, so we
125 # prefer that. But since g_v_i only returns the date without time,
126 # we still use xmlfeed's date if it's the same date.
127 published
= dateutil
. parser
. parse ( video
[ 'published' ])
131 meta
= video_metadata ( meta
)
132 published2
= dateutil
. parser
. parse ( meta
[ 'published' ])
133 flask_logger ( f
"published {published} / {published2} " )
134 if published
< published2
: # g_v_i date is more accurate:
135 published
= published2
136 length
= meta
[ 'length' ]
137 livestream
= meta
[ 'livestream' ]
139 now
= datetime
. now ( timezone
. utc
)
141 # we pretend that all videos uploaded this week were uploaded just
142 # now, so the user sees it at the top of the feed, and it doesn't
143 # get inserted somewhere further down.
144 if ( now
- published
). days
< 7 :
146 else : #, it's just an update to an older video.
147 timestamp
= published
150 INSERT OR IGNORE INTO videos
151 (id, channel_id, title, length, livestream, published, crawled)
152 VALUES (?, ?, ?, ?, ?, datetime(?), datetime(?))
163 # update video title (everything else can't change)
165 UPDATE OR IGNORE videos
173 # for channels, this is obviously always the same, but playlists can
174 # consist of videos from different channels:
175 if i
== 0 or playlist
:
177 INSERT OR REPLACE INTO channels (id, name)
179 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
181 # keep track of which videos are in a playlist, so we can show the user
182 # why a video is in their feed:
185 INSERT OR IGNORE INTO playlist_videos (video_id, playlist_id)
187 """ , ( video
[ 'video_id' ], playlist
))
189 if playlist
and not from_webhook
: # Note: playlists can't get updated via websub
191 INSERT OR REPLACE INTO playlists (id, name, author)
193 """ , ( playlist
, title
, channel
))
195 INSERT OR REPLACE INTO channels (id, name)
197 """ , ( channel
, author
))
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, stream map, player_response,
    error-type/message.
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,
            "sts": sts,
            "hl": "en_US",  # NOTE(review): request params partially reconstructed
        })

        if r.status_code == 429:
            return None, None, None, 'banned', 'possible IP ban'

        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus'].get('reason',
                '//'.join(metadata['playabilityStatus'].get('messages', [])))
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, None, maybe_metadata, 'player', player_error
        if metadata['videoDetails'].get('isLive', False):
            return None, None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # no muxed urls with this el value; retry

        # descramble the signature of every stream that carries one:
        formats = metadata['streamingData'].get('formats', [])
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        adaptive = metadata['streamingData'].get('adaptiveFormats', [])
        for (i, v) in enumerate(adaptive):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            adaptive[i]['url'] = unscramble(cipher, algo)

        stream_map = {'adaptive': adaptive, 'muxed': formats}

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        # ip-locked videos can be recovered if the proxy module is loaded:
        is_geolocked = 'geolocked' if 'gcr' in parse_qs(urlparse(url).query) else None

        return url, stream_map, metadata, is_geolocked, None

    # every el value failed with UNPLAYABLE (or no formats):
    return None, None, metadata, 'exhausted', player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """
    Undo youtube's signature scrambling and rebuild the stream url.
    `algo` is a space-separated op list: 'r' reverses the signature,
    'sN' drops the first N chars, 'wN' swaps positions 0 and N.
    If the cipher already carries a plain 'sig', it is used unchanged.
    """
    sig_chars = list(cipher['s'][0])
    for step in algo.split():
        op, num = re.match(r"([rsw])(\d+)?", step).groups()
        idx = int(num) % len(sig_chars) if num else 0
        if op == 'r':
            sig_chars = sig_chars[::-1]
        elif op == 's':
            sig_chars = sig_chars[idx:]
        elif op == 'w':
            sig_chars[0], sig_chars[idx] = sig_chars[idx], sig_chars[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(sig_chars)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def video_metadata(metadata):
    """
    Flatten the interesting fields of a player_response object
    (videoDetails + microformat) into a single dict.
    """
    details = metadata['videoDetails']
    micro = metadata['microformat']['playerMicroformatRenderer']

    # livestreams/premieres carry an exact start time in liveBroadcastDetails;
    # plain uploads only expose a date, so midnight (Z) is assumed.
    published_at = micro.get('liveBroadcastDetails', {}) \
        .get('startTimestamp', f"{micro['publishDate']}T00:00:00Z")

    # Note: 'premiere' videos have livestream=False and published= will be the
    # start of the premiere.
    return {
        'title': details['title'],
        'author': details['author'],
        'channel_id': details['channelId'],
        'published': published_at,
        'views': int(details['viewCount']),
        'length': int(micro['lengthSeconds']) or int(details['lengthSeconds']),
        'livestream': details['isLiveContent'],
    }
301 def store_video_metadata ( video_id
):
302 # check if we know about it, and if not, fetch and store video metadata
303 with sqlite3
. connect ( cf
[ 'global' ][ 'database' ]) as conn
:
305 c
. execute ( "SELECT 1 from videos where id = ?" , ( video_id
,))
306 new_video
= len ( c
. fetchall ()) < 1
308 _
, _
, meta
, _
, _
= get_video_info ( video_id
)
310 meta
= video_metadata ( meta
)
312 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
313 VALUES (?, ?, ?, datetime(?), datetime(?))
322 INSERT OR REPLACE INTO channels (id, name)
324 """ , ( meta
[ 'channel_id' ], meta
[ 'author' ]))
from werkzeug.exceptions import NotFound

class NoFallbackException(NotFound):
    """404 raised by fallback_route() when no further matching route exists."""
def fallback_route(*args, **kwargs): # TODO: worthy as a flask-extension?
    """
    finds the next route that matches the current url rule, and executes it.
    args, kwargs: pass all arguments of the current route
    """
    from flask import current_app, request, g

    # build a list of endpoints that match the current request's url rule:
    matching = [
        rule.endpoint
        for rule in current_app.url_map.iter_rules()
        if rule.rule == request.url_rule.rule
    ]
    current = matching.index(request.endpoint)

    # since we can't change request.endpoint, we always get the original
    # endpoint back. so for repeated fall throughs, we use the g object to
    # increment how often we want to fall through.
    if '_fallback_next' not in g:
        g._fallback_next = 0
    g._fallback_next += 1

    next_ep = current + g._fallback_next

    if next_ep < len(matching):
        return current_app.view_functions[matching[next_ep]](*args, **kwargs)
    else:
        raise NoFallbackException
def websub_url_hmac(key, feed_id, timestamp, nonce):
    """ generate sha1 hmac, as required by websub/pubsubhubbub """
    message = f"{feed_id}:{timestamp}:{nonce}".encode('ascii')
    mac = hmac.new(key.encode('ascii'), message, hashlib.sha1)
    return mac.hexdigest()
def websub_body_hmac(key, body):
    """ sha1 hmac over the raw (bytes) request body, for websub verification """
    mac = hmac.new(key.encode('ascii'), msg=body, digestmod=hashlib.sha1)
    return mac.hexdigest()
def flask_logger(msg, level="warning"):
    """
    Log `msg` via the flask app logger; silently a no-op outside an
    app context. `level` is a logger method name: 'debug', 'info',
    'warning', 'error' or 'critical'.
    """
    try:
        from flask import current_app
        # Fix: Logger.log() requires a *numeric* level and raises TypeError
        # for strings like the "warning" default -- dispatch to the named
        # convenience method instead.
        getattr(current_app.logger, level)(msg)
    except RuntimeError:
        pass # not within a flask app context (e.g. utils.py)
def pp(*args):
    """ debugging helper: pretty-print all args to stderr (utf-8 safe). """
    from pprint import pprint
    import sys, codecs
    # wrap stderr's raw buffer so non-ascii values can't raise on print:
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))