# app/common/common.py
import os
import re
import json
import pprint
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but caching that long makes reddit very stale and premiere videos
# won't start. TODO: expire depending on whether the video is a
# livestream/premiere/etc.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
# Note: this should only be required for the 'memory'-backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer

def purge_cache(sec):
    requests_cache.remove_expired_responses()
    # reschedule ourselves:
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True # assumed: don't block process exit on the purge timer
    t.start()
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get("https://www.youtube.com/feeds/videos.xml", {
        feed_type: feed_id, # feed_type is the query key, e.g. 'channel_id'
    })
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry', ns):
        author = feed.find('at:deleted-entry/at:by/name', ns).text
        ref = feed.find('at:deleted-entry', ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        # tombstone: signal the deletion to the caller (return shape assumed)
        return None, author, [{'deleted': True, 'video_id': video_id}]

    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None

    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
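# For a regular (non-tombstone) feed, the return value has this shape
# (values are illustrative):
#   ('Uploads of Some Channel', 'Some Channel',
#    [{'video_id': 'UxxajLWwzqY', 'title': '...', 'published': '2020-...',
#      'channel_id': 'UC...', 'author': 'Some Channel', 'updated': '2020-...'}])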
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return the global author, hence taking it from
    # the first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented
    # after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # choose the 'crawled' timestamp for the INSERT below: if updated and
        # published times are near-identical, we assume the video is new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            crawled = now
        else: # otherwise, it's just an update to an older video.
            crawled = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (video['video_id'], video['channel_id'], video['title'],
              video['published'], crawled))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
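# Usage sketch (the database path and channel id are placeholders; assumes an
# sqlite3 connection whose schema has the videos/channels tables used above):
#   import sqlite3
#   db = sqlite3.connect('/var/lib/yt/db.sqlite3')
#   update_channel(db, fetch_xml('channel_id', 'UCxxxxxxxxxxxxxxxxxxxxxx'))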
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info", {
            "video_id": video_id,
            "eurl": f"https://youtube.googleapis.com/v/{video_id}",
            "el": el,   # passed through from the loop above
            "sts": sts, # signature timestamp from the player JS
        })
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if 'formats' not in metadata['streamingData']:
            continue # (assumed) no muxed streams here; retry with the next el

        formats = metadata['streamingData']['formats']
        for i, v in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # TODO: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
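# Callers dispatch on the returned error type. sts/algo come from the current
# player JS, so the defaults above only work for unscrambled signatures:
#   url, meta, err, msg = get_video_info("UxxajLWwzqY")
#   if err == 'livestream': ...  # no muxed url; needs different handling
#   elif err is not None: ...    # surface msg to the user
#   else: ...                    # redirect/proxy to url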
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact aspect-ratio information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to the
    # thumbnails (these are only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']
    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated']) # manually created tracks first
    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]
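    # e.g. a redirect wrapper unwraps to its 'q' parameter (parse_qs also
    # percent-decodes it):
    #   clean_url("https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com")
    #     -> 'https://example.com'
    # an internal URL without a 'q' parameter is returned unchanged.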
    # Remove the left-/rightmost word from a string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands-separator-aware int():
    intT = lambda s: int(s.replace(',', ''))
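    # e.g. delL("by Some Channel") -> 'Some Channel'
    #      delR("1,234,567 views") -> '1,234,567'
    #      intT("1,234,567")       -> 1234567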
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        # normalize each renderer name to a short card type (POLL/VIDEO/...):
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a
            # "LIVE NOW" badge. TODO: this is ugly; clean it up.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') \
                or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        # map thumbnail height -> url, e.g. {90: 'https://...', 180: ...}
        return {e['height']: e['url'] for e in thumbs}
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE" # treat merchandise like any other external link
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than the others.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> identifying field
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
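    # e.g. a video listed both as an infocard and an endcard has the same
    # 'video_id' ident, so the endcard copy lands in the exclusion list and
    # the infocard survives; a channel present in both is kept from the
    # endcard side instead.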
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
        BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
        CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
        ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
        GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
        KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
        ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
        NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
        RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
        SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
        VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
class RedditException(Exception): pass

def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts the results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = {k: v for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time,      # hour,week,month,year,all
    }.items() if v}
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json",
                     query, headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or 'data' not in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'],
                     key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but is seen
            # in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError: # no match
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })
    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after
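# Usage sketch (subreddit names are examples; performs a network request):
#   videos, before, after = fetch_reddit(['videos', 'youtubehaiku'],
#                                        sorted_by="top", time="week")
#   more, _, after = fetch_reddit(['videos', 'youtubehaiku'], after=after)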
if __name__ == '__main__':
    from pprint import pprint
    import sys, codecs
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))