# app/common/common.py
import os
import re
import sys
import json
import codecs
import pprint
import requests
import requests_cache
import dateutil.parser
from xml.etree import ElementTree
from configparser import ConfigParser
from datetime import datetime, timezone
from urllib.parse import parse_qs, urlparse

cf = ConfigParser()
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for
# 5h59m, but this makes reddit very stale and premiere videos won't start.
# TODO: expire when video is livestream/premiere/etc
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))

# Note: this should only be required for the 'memory' backed cache.
# TODO: only run for long-running processes, i.e. the frontend
from threading import Timer
def purge_cache(sec):
    requests_cache.remove_expired_responses()
    t = Timer(sec, purge_cache, args=(sec,))
    t.daemon = True
    t.start()
purge_cache(10*60)
def fetch_xml(feed_type, feed_id):
    # TODO: handle requests.exceptions.ConnectionError
    r = requests.get(f"https://www.youtube.com/feeds/videos.xml?{feed_type}={feed_id}")
    if not r.ok:
        return None

    return r.text
def parse_xml(xmldata):
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
        'at': "http://purl.org/atompub/tombstones/1.0",
    }

    feed = ElementTree.fromstring(xmldata)
    if feed.find('at:deleted-entry', ns):
        author = feed.find('at:deleted-entry/at:by/name', ns).text
        ref = feed.find('at:deleted-entry', ns).get('ref')
        (_, _, video_id) = ref.rpartition(':')
        return None, author, [{'video_id': video_id, 'deleted': True}]
    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })

    return title, author, videos
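# For a regular (non-tombstone) feed, the call returns roughly (values
# illustrative, mirroring the fields collected above):
#   title, author, videos = parse_xml(xmldata)
#   # videos[0] == {'video_id': '...', 'title': '...', 'published': '2020-...',
#   #               'channel_id': 'UC...', 'author': '...', 'updated': '2020-...'}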
def update_channel(db, xmldata):
    if not xmldata: return False

    # Note: websub does not return global author, hence taking from first video
    title, _, videos = parse_xml(xmldata)

    # TODO: if not title: delete from videos (this should only be implemented after webhook hmac validation!)

    c = db.cursor()
    for i, video in enumerate(videos):
        if video.get('deleted'):
            continue # deletion is deferred until hmac validation (see TODO above)
        now = datetime.now(timezone.utc)
        updated = dateutil.parser.parse(video['updated'])
        published = dateutil.parser.parse(video['published'])
        # if update and published time are near-identical, we assume it's new.
        if (updated - published).seconds < 60 and (now - published).days < 7:
            timestamp = now
        else: # otherwise, it's just an update to an older video.
            timestamp = published

        c.execute("""
            INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
            VALUES (?, ?, ?, datetime(?), datetime(?))
        """, (
            video['video_id'],
            video['channel_id'],
            video['title'],
            video['published'],
            timestamp
        ))

        if i == 0: # only required once per feed
            c.execute("""
                INSERT OR REPLACE INTO channels (id, name)
                VALUES (?, ?)
            """, (video['channel_id'], video['author']))
    db.commit()

    return True
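# The INSERTs above assume roughly this schema (the real DDL lives elsewhere
# in the repo; this sketch only mirrors the columns referenced here):
#   CREATE TABLE videos (id TEXT PRIMARY KEY, channel_id TEXT, title TEXT,
#                        published DATETIME, crawled DATETIME);
#   CREATE TABLE channels (id TEXT PRIMARY KEY, name TEXT);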
def get_video_info(video_id, sts=0, algo=""):
    """
    returns: best-quality muxed video stream, player_response, error-type/message
    error types: player, malformed, livestream, geolocked, exhausted
    """
    player_error = None # for 'exhausted'
    for el in ['embedded', 'detailpage']: # sometimes, only one or the other works
        r = requests.get("https://www.youtube.com/get_video_info" +
            f"?video_id={video_id}" +
            f"&eurl=https://youtube.googleapis.com/v/{video_id}" +
            f"&el={el}" +
            f"&sts={sts}" +
            "&hl=en_US") #"&hl=en&gl=US"
        params = parse_qs(r.text)
        if 'errorcode' in params: # status=fail
            return None, None, 'malformed', params['reason'][0]

        metadata = json.loads(params.get('player_response')[0])
        playabilityStatus = metadata['playabilityStatus']['status']
        if playabilityStatus != "OK":
            playabilityReason = metadata['playabilityStatus']['reason']
            player_error = f"{playabilityStatus}: {playabilityReason}"
            if playabilityStatus == "UNPLAYABLE":
                continue # try again with next el value (or fail as exhausted)
            # without videoDetails, there's only the error message
            maybe_metadata = metadata if 'videoDetails' in metadata else None
            return None, maybe_metadata, 'player', player_error
        if metadata['videoDetails']['isLiveContent'] and \
                (metadata['videoDetails'].get('isLive', False) or \
                 metadata['videoDetails'].get('isPostLiveDvr', False)):
            return None, metadata, 'livestream', None

        if not 'formats' in metadata['streamingData']:
            continue # no muxed streams returned; try the other el value

        formats = metadata['streamingData']['formats']
        for (i, v) in enumerate(formats):
            if not ('cipher' in v or 'signatureCipher' in v): continue
            cipher = parse_qs(v.get('cipher') or v.get('signatureCipher'))
            formats[i]['url'] = unscramble(cipher, algo)

        # todo: check if we have urls or try again
        url = sorted(formats, key=lambda k: k['height'], reverse=True)[0]['url']

        if 'gcr' in parse_qs(url):
            return None, metadata, 'geolocked', None

        return url, metadata, None, None

    return None, metadata, 'exhausted', player_error
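# Illustrative call (sts/algo must be scraped from the current player JS for
# ciphered streams; without them, only unciphered videos resolve):
#   url, metadata, error, errdetail = get_video_info('UxxajLWwzqY')
#   if error == 'livestream': ... # no muxed stream to hand out
#   elif error: ...               # 'player', 'malformed', 'geolocked', 'exhausted'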
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    signature = list(cipher['s'][0])
    for c in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", c).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
def prepare_metadata(metadata):
    meta1 = metadata['videoDetails']
    meta2 = metadata['microformat']['playerMicroformatRenderer']
    cards = metadata['cards']['cardCollectionRenderer']['cards'] \
        if 'cards' in metadata else []
    endsc = metadata['endscreen']['endscreenRenderer']['elements'] \
        if 'endscreen' in metadata else []

    # the actual video streams have exact information:
    try:
        sd = metadata['streamingData']
        some_stream = (sd.get('adaptiveFormats', []) + sd.get('formats', []))[0]
        aspect_ratio = some_stream['width'] / some_stream['height']
    # if that's unavailable (e.g. on livestreams), fall back to
    # thumbnails (only either 4:3 or 16:9).
    except (KeyError, IndexError):
        some_img = meta2['thumbnail']['thumbnails'][0]
        aspect_ratio = some_img['width'] / some_img['height']

    subtitles = sorted([
        {'url': cc['baseUrl'],
         'code': cc['languageCode'],
         'autogenerated': cc.get('kind') == "asr",
         'name': cc['name']['simpleText']}
        for cc in metadata.get('captions', {})
            .get('playerCaptionsTracklistRenderer', {})
            .get('captionTracks', [])
    ], key=lambda cc: cc['autogenerated'])
    def clean_url(url):
        # external URLs are redirected through youtube.com/redirect, but we
        # may encounter internal URLs, too
        return parse_qs(urlparse(url).query).get('q', [url])[0]

    # Remove left-/rightmost word from string:
    delL = lambda s: s.partition(' ')[2]
    delR = lambda s: s.rpartition(' ')[0]
    # Thousands separator aware int():
    intT = lambda s: int(s.replace(',', ''))
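    # e.g. delL("by SomeChannel") == "SomeChannel", delR("1,234 views") == "1,234",
    # so intT(delR("1,234 views")) == 1234 (inputs illustrative).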
    def parse_infocard(card):
        card = card['cardRenderer']
        ctype = list(card['content'].keys())[0]
        content = card['content'][ctype]
        if ctype == "pollRenderer":
            ctype = "POLL"
            content = {
                'question': content['question']['simpleText'],
                'answers': [(a['text']['simpleText'], a['numVotes'])
                            for a in content['choices']],
            }
        elif ctype == "videoInfoCardContentRenderer":
            ctype = "VIDEO"
            # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
            # TODO: this is ugly; cleanup.
            is_live = content.get('badge', {}).get('liveBadgeRenderer', {})
            length = is_live.get('label', {}).get('simpleText') or content['lengthString']['simpleText'] # '23:03'
            content = {
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['videoTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'length': length,
                'views': intT(delR(content['viewCountText']['simpleText'])),
            }
        elif ctype == "playlistInfoCardContentRenderer":
            ctype = "PLAYLIST"
            content = {
                'playlist_id': content['action']['watchEndpoint']['playlistId'],
                'video_id': content['action']['watchEndpoint']['videoId'],
                'title': content['playlistTitle']['simpleText'],
                'author': delL(content['channelName']['simpleText']),
                'n_videos': intT(content['playlistVideoCount']['simpleText']),
            }
        elif ctype == "simpleCardContentRenderer" and 'urlEndpoint' in content['command']:
            ctype = "WEBSITE"
            content = {
                'url': clean_url(content['command']['urlEndpoint']['url']),
                'domain': content['displayDomain']['simpleText'],
                'title': content['title']['simpleText'],
                # XXX: no thumbnails for infocards
            }
        elif ctype == "collaboratorInfoCardContentRenderer":
            ctype = "CHANNEL"
            content = {
                'channel_id': content['endpoint']['browseEndpoint']['browseId'],
                'title': content['channelName']['simpleText'],
                'icons': mkthumbs(content['channelAvatar']['thumbnails']),
                'subscribers': content['subscriberCountText']['simpleText'], # "545K subscribers"
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    def mkthumbs(thumbs):
        return {e['height']: e['url'] for e in thumbs}
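    # e.g. mkthumbs([{'height': 68, 'width': 120, 'url': '…/default.jpg'}])
    #      == {68: '…/default.jpg'} (input illustrative)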
    def parse_endcard(card):
        card = card.get('endscreenElementRenderer', card) # only sometimes nested
        ctype = card['style']
        if ctype == "CHANNEL":
            content = {
                'channel_id': card['endpoint']['browseEndpoint']['browseId'],
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        elif ctype == "VIDEO":
            content = {
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'length': card['videoDuration']['simpleText'], # '12:21'
                'views': delR(card['metadata']['simpleText']),
                # XXX: no channel name
            }
        elif ctype == "PLAYLIST":
            content = {
                'playlist_id': card['endpoint']['watchEndpoint']['playlistId'],
                'video_id': card['endpoint']['watchEndpoint']['videoId'],
                'title': card['title']['simpleText'],
                'author': delL(card['metadata']['simpleText']),
                'n_videos': intT(delR(card['playlistLength']['simpleText'])),
            }
        elif ctype == "WEBSITE" or ctype == "CREATOR_MERCHANDISE":
            ctype = "WEBSITE"
            url = clean_url(card['endpoint']['urlEndpoint']['url'])
            content = {
                'url': url,
                'domain': urlparse(url).netloc,
                'title': card['title']['simpleText'],
                'icons': mkthumbs(card['image']['thumbnails']),
            }
        else:
            content = {'error': f"{ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>"}

        return {'type': ctype, 'content': content}
    infocards = [parse_infocard(card) for card in cards]
    endcards = [parse_endcard(card) for card in endsc]
    # combine cards to weed out duplicates. for videos and playlists prefer
    # infocards, for channels and websites prefer endcards, as those carry
    # more information than their counterparts.
    # if the card type is not in ident, we use the whole card for comparison
    # (otherwise they'd all replace each other)
    ident = { # ctype -> ident
        'VIDEO': 'video_id',
        'PLAYLIST': 'playlist_id',
        'CHANNEL': 'channel_id',
        'WEBSITE': 'url',
    }
    getident = lambda c: c['content'].get(ident.get(c['type']), c)
    mkexclude = lambda cards, types: [getident(c) for c in cards if c['type'] in types]
    exclude = lambda cards, without: [c for c in cards if getident(c) not in without]

    allcards = exclude(infocards, mkexclude(endcards, ['CHANNEL', 'WEBSITE'])) + \
               exclude(endcards, mkexclude(infocards, ['VIDEO', 'PLAYLIST']))
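    # e.g. a video appearing both as infocard and endcard: its 'video_id' is
    # collected by mkexclude(infocards, ['VIDEO', 'PLAYLIST']) and the endcard
    # copy is then dropped by exclude(), so only the infocard survives.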
    all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
    BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
    CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
    ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
    GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
    KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
    ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
    NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
    RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
    SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
    VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
    whitelisted = sorted(meta2.get('availableCountries', []))
    blacklisted = sorted(set(all_countries) - set(whitelisted))
    return {
        'title': meta1['title'],
        'author': meta1['author'],
        'channel_id': meta1['channelId'],
        'description': meta1['shortDescription'],
        'published': meta2['publishDate'],
        'views': meta1['viewCount'],
        'length': int(meta1['lengthSeconds']),
        'rating': meta1['averageRating'],
        'category': meta2['category'],
        'aspectr': aspect_ratio,
        'unlisted': meta2['isUnlisted'],
        'countries': whitelisted,
        'blacklisted': blacklisted,
        'poster': meta2['thumbnail']['thumbnails'][0]['url'],
        'infocards': infocards,
        'endcards': endcards,
        'all_cards': allcards,
        'subtitles': subtitles,
    }
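# Putting the pieces together (sketch; error handling elided):
#   url, metadata, error, _ = get_video_info(video_id)
#   if not error:
#       meta = prepare_metadata(metadata)  # meta['title'], meta['subtitles'], ...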
class RedditException(Exception): pass
def fetch_reddit(subreddits, sorted_by="hot", time=None, *, limit=36,
                 count=None, before=None, after=None):
    """
    fetches data from a subreddit (or a multireddit like gif+gifs) and
    filters/sorts results.
    sorted_by values: hot, new, rising, controversial, top
    time values: hour, week, month, year, all (for top and controversial)
    returns a tuple of ([{video}], before, after)
    """
    # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json

    if not subreddits:
        return [], None, None

    query = '&'.join([f"{k}={v}" for k, v in {
        'count': count,
        'before': before,
        'after': after,
        'limit': limit, # 1..100 (default 25)
        't': time,      # hour,week,month,year,all
    }.items() if v])
    multireddit = '+'.join(subreddits)
    r = requests.get(f"https://old.reddit.com/r/{multireddit}/{sorted_by}.json?{query}",
                     headers={'User-Agent': 'Mozilla/5.0'})
    if not r.ok or not 'data' in r.json():
        raise RedditException(r.text)

    videos = []
    entries = sorted(r.json()['data']['children'],
                     key=lambda e: e['data']['score'] > 1, reverse=True)
    for entry in entries:
        e = entry['data']
        if e['domain'] not in ['youtube.com', 'youtu.be', 'invidio.us']:
            continue
        try:
            # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
            video_id = re.match(r'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)', e['url']).group(1)
        except AttributeError:
            continue # XXX: should we log that?
        if not video_id: continue
        videos.append({
            'video_id': video_id,
            'title': e['title'],
            'url': e['permalink'],
            'n_comments': e['num_comments'],
            'n_karma': e['score'],
            'subreddit': e['subreddit'],
        })

    before = r.json()['data']['before']
    after = r.json()['data']['after']

    return videos, before, after
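# Usage sketch (subreddit names illustrative); the returned 'after' token
# paginates forward:
#   videos, before, after = fetch_reddit(['videos', 'mealtimevideos'])
#   more, _, after = fetch_reddit(['videos', 'mealtimevideos'], after=after)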
def pp(*args):
    """ dump arguments to stderr for debugging (utf-8 safe). """
    from pprint import pprint
    pprint(args, stream=codecs.getwriter("utf-8")(sys.stderr.buffer))