# git.gir.st - subscriptionfeed.git - app/common/common.py
7 from xml
. etree
import ElementTree
8 from configparser
import ConfigParser
9 from datetime
import datetime
, timezone
10 from urllib
. parse
import parse_qs
, urlparse
# Location of the INI configuration file; overridable via the YT_CONFIG
# environment variable.
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
# NOTE(review): the ConfigParser instance `cf` is constructed on a line lost
# in this extraction (ConfigParser is imported above) — confirm upstream.
cf.read(config_filename)
# Cache every HTTP response in memory for 10 minutes; only status-200
# responses are cached. googlevideo-urls are valid for 5h59m, but a longer
# expiry makes reddit very stale and premiere videos won't start.
# TODO: expire based on whether the video is a livestream/premiere/etc.
requests_cache.install_cache(
    backend='memory',
    expire_after=10 * 60,
    allowable_codes=(200,),
)
# --- self-rescheduling cache purge ---
# NOTE(review): the enclosing `def purge_cache(sec):` header (original line
# 22) and the Timer start/bootstrap lines (25-28) were lost in this
# extraction — confirm against the upstream file.
19 # Note: this should only be required for the 'memory' backed cache.
20 # TODO: only run for long-running processes, i.e. the frontend
21 from threading
import Timer
# drop expired entries from the requests_cache backend
23 requests_cache
. remove_expired_responses ()
# re-arm: a Timer re-invokes purge_cache with the same interval `sec`
24 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
# NOTE(review): presumably followed by t.start() on a missing line — verify.
# fetch_xml(feed_type, feed_id): GET YouTube's Atom feed XML; per the URL
# template below, feed_type is the query key (e.g. channel_id) and feed_id
# its value.
# NOTE(review): original lines 32-36 (response handling / the return value)
# were lost in this extraction — only the request itself is visible here.
29 def fetch_xml ( feed_type
, feed_id
):
30 # TODO: handle requests.exceptions.ConnectionError
31 r
= requests
. get ( f
"https://www.youtube.com/feeds/videos.xml? {feed_type} = {feed_id} " )
def parse_xml(xmldata):
    """Parse a YouTube Atom feed document.

    Returns a (title, author, videos) tuple, where videos is a list of
    dicts with video_id/title/published/channel_id/author/updated keys.
    author is None when the feed carries no top-level <author> element
    (e.g. websub pushes).
    """
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/"
    }
    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title', ns).text
    author = feed.find('atom:author/atom:name', ns).text \
        if feed.find('atom:author', ns) else None
    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        })
    return title, author, videos
# update_channel(db, xmldata): parse an Atom feed and store its videos in the
# sqlite db — INSERT OR IGNORE for videos, INSERT OR REPLACE for the channel
# name (taken from the first video, since websub omits the global author).
# Returns False on empty input.
# NOTE(review): original lines 75/77-79/82-89/91/93/95-98 were lost in this
# extraction — i.e. the crawl-timestamp choice for the new/old branches, the
# actual db.execute(...) calls around the SQL fragments kept below, and the
# final return value. Confirm against upstream before editing.
62 def update_channel ( db
, xmldata
):
# guard: nothing to do without feed data
63 if not xmldata
: return False
65 # Note: websub does not return global author, hence taking from first video
66 title
, _
, videos
= parse_xml ( xmldata
)
69 for i
, video
in enumerate ( videos
):
70 now
= datetime
. now ( timezone
. utc
)
# feed timestamps are ISO-8601 strings; dateutil parses them tz-aware
71 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
72 published
= dateutil
. parser
. parse ( video
[ 'published' ])
73 # if update and published time are near-identical, we assume it's new.
74 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
76 else : #, it's just an update to an older video.
80 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
81 VALUES (?, ?, ?, datetime(?), datetime(?))
90 if i
== 0 : # only required once per feed
92 INSERT OR REPLACE INTO channels (id, name)
94 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
# get_video_info(video_id, sts=0, algo=""): resolve a playable muxed stream
# via /get_video_info, trying el=embedded then el=detailpage. Per the
# (garbled) docstring: returns (stream-url, player_response, error-type,
# error-message); error types: player, malformed, livestream, geolocked,
# exhausted.
# NOTE(review): original lines 103, 109-110 (presumably the el=/sts= query
# parameters), 115, 130, 132-133 (the branch taken when streamingData has no
# 'formats'), 139, 142, 145 and 147 were lost in this extraction — confirm
# against upstream.
99 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
101 returns: best-quality muxed video stream, player_response, error-type/mesage
102 error types: player, malformed, livestream, geolocked, exhausted
104 player_error
= None # for 'exhausted'
105 for el
in [ 'embedded' , 'detailpage' ]: #sometimes, only one or the other works
106 r
= requests
. get ( f
"https://www.youtube.com/get_video_info" +
107 f
"?video_id= {video_id} " +
108 f
"&eurl=https://youtube.googleapis.com/v/ {video_id} " +
111 f
"&hl=en_US" ) #"&hl=en&gl=US"
# the endpoint answers with urlencoded key/value pairs
112 params
= parse_qs ( r
. text
)
113 if 'errorcode' in params
: # status=fail
114 return None , None , 'malformed' , params
[ 'reason' ][ 0 ]
116 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
117 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
118 if playabilityStatus
!= "OK" :
119 playabilityReason
= metadata
[ 'playabilityStatus' ][ 'reason' ]
120 player_error
= f
" {playabilityStatus} : {playabilityReason} "
121 if playabilityStatus
== "UNPLAYABLE" :
122 continue # try again with next el value (or fail as exhausted)
123 # without videoDetails, there's only the error message
124 maybe_metadata
= metadata
if 'videoDetails' in metadata
else None
125 return None , maybe_metadata
, 'player' , player_error
# live / post-live-DVR content has no finished muxed stream to hand out
126 if metadata
[ 'videoDetails' ][ 'isLiveContent' ] and \
127 ( metadata
[ 'videoDetails' ]. get ( 'isLive' , False ) or \
128 metadata
[ 'videoDetails' ]. get ( 'isPostLiveDvr' , False )):
129 return None , metadata
, 'livestream' , None
131 if not 'formats' in metadata
[ 'streamingData' ]:
# descramble signature-ciphered stream urls in-place via unscramble()/algo
134 formats
= metadata
[ 'streamingData' ][ 'formats' ]
135 for ( i
, v
) in enumerate ( formats
):
136 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
137 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
138 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
140 # todo: check if we have urls or try again
# pick the highest-resolution muxed format
141 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
# a 'gcr' query parameter in the stream url marks geo-restricted content
143 if 'gcr' in parse_qs ( url
):
144 return None , metadata
, 'geolocked' , None
146 return url
, metadata
, None , None
148 return None , metadata
, 'exhausted' , player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Descramble a signature-ciphered stream URL.

    cipher is a parse_qs() dict holding 's' (scrambled signature), 'url',
    and optionally 'sp'/'sig'. algo is a space-separated list of steps:
    r (reverse), sN (slice off the first N chars), wN (swap char 0 with
    char N). Returns the stream URL with the signature parameter appended.
    """
    signature = list(cipher['s'][0])
    for step in algo.split():
        op, ix = re.match(r"([rsw])(\d+)?", step).groups()
        ix = int(ix) % len(signature) if ix else 0
        if op == 'r': signature = list(reversed(signature))
        if op == 's': signature = signature[ix:]
        if op == 'w': signature[0], signature[ix] = signature[ix], signature[0]
    # parameter name and value fall back to 'signature' / the descrambled 's'
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher.get('sig', [''.join(signature)])[0]
    return f"{cipher['url'][0]}&{sp}={sig}"
# prepare_metadata(metadata): flatten a player_response dict into the
# template-friendly dict assembled at the end of the function (original
# lines 333-352).
# NOTE(review): the try/except that falls back from stream dimensions to
# thumbnail dimensions (original lines ~172/178) and the `subtitles =
# sorted([` opener (lines ~181-182) were lost in this extraction.
163 def prepare_metadata ( metadata
):
# the two sub-dicts of player_response used throughout:
164 meta1
= metadata
[ 'videoDetails' ]
165 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
166 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] \
167 if 'cards' in metadata
else []
168 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] \
169 if 'endscreen' in metadata
else []
171 # the actual video streams have exact information:
173 sd
= metadata
[ 'streamingData' ]
174 some_stream
= ( sd
. get ( 'adaptiveFormats' ,[]) + sd
. get ( 'formats' ,[]))[ 0 ]
175 aspect_ratio
= some_stream
[ 'width' ] / some_stream
[ 'height' ]
176 # if that's unavailable (e.g. on livestreams), fall back to
177 # thumbnails (only either 4:3 or 16:9).
179 some_img
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ]
180 aspect_ratio
= some_img
[ 'width' ] / some_img
[ 'height' ]
# caption tracks; auto-generated ('asr') ones sort last
183 { 'url' : cc
[ 'baseUrl' ],
184 'code' : cc
[ 'languageCode' ],
185 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
186 'name' : cc
[ 'name' ][ 'simpleText' ]}
187 for cc
in metadata
. get ( 'captions' ,{})
188 . get ( 'playerCaptionsTracklistRenderer' ,{})
189 . get ( 'captionTracks' ,[])
190 ], key
= lambda cc
: cc
[ 'autogenerated' ])
def clean_url(url):
    """Resolve a youtube.com/redirect wrapper URL to its real target.

    External URLs are redirected through youtube.com/redirect with the
    target in the 'q' query parameter, but we may encounter internal URLs,
    too — those have no 'q' and are returned unchanged.
    """
    query = parse_qs(urlparse(url).query)
    return query.get('q', [url])[0]
# Remove the left-/rightmost space-delimited word from a string:
def delL(s):
    """Drop everything up to and including the first space of s."""
    return s.partition(' ')[2]
def delR(s):
    """Drop everything from the last space of s onwards."""
    return s.rpartition(' ')[0]
# Thousands-separator aware int():
def intT(s):
    """int() that tolerates ',' thousands separators."""
    return int(s.replace(',', ''))
# parse_infocard(card): normalize one infocard (poll / video / playlist /
# external link / collaborator) into {'type': ctype, 'content': {...}};
# unknown renderer types fall through to an 'error' payload.
# NOTE(review): each branch's `content = {` opener and the closing braces
# (original lines 207-208, 212, 214, 219, 223, 225, 227-228, 234, 236-237,
# 242, 244-245, 250-252, 254) were lost in this extraction.
202 def parse_infocard ( card
):
203 card
= card
[ 'cardRenderer' ]
# the single key of card['content'] names the renderer type
204 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
205 content
= card
[ 'content' ][ ctype
]
206 if ctype
== "pollRenderer" :
209 'question' : content
[ 'question' ][ 'simpleText' ],
210 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) \
211 for a
in content
[ 'choices' ]],
213 elif ctype
== "videoInfoCardContentRenderer" :
215 # if the card references a live stream, it has no length, but a "LIVE NOW" badge.
216 # TODO: this is ugly; cleanup.
217 is_live
= content
. get ( 'badge' ,{}). get ( 'liveBadgeRenderer' ,{})
218 length
= is_live
. get ( 'label' ,{}). get ( 'simpleText' ) or content
[ 'lengthString' ][ 'simpleText' ] # '23:03'
220 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
221 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
222 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
224 'views' : intT ( delR ( content
[ 'viewCountText' ][ 'simpleText' ])),
226 elif ctype
== "playlistInfoCardContentRenderer" :
229 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
230 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
231 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
232 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
233 'n_videos' : intT ( content
[ 'playlistVideoCount' ][ 'simpleText' ]),
235 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
[ 'command' ]:
238 'url' : clean_url ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]),
239 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
240 'title' : content
[ 'title' ][ 'simpleText' ],
241 # XXX: no thumbnails for infocards
243 elif ctype
== "collaboratorInfoCardContentRenderer" :
246 'channel_id' : content
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
247 'title' : content
[ 'channelName' ][ 'simpleText' ],
248 'icons' : mkthumbs ( content
[ 'channelAvatar' ][ 'thumbnails' ]),
249 'subscribers' : content
[ 'subscriberCountText' ][ 'simpleText' ], # "545K subscribers"
# fallback for unhandled card types: dump the card for debugging
253 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
255 return { 'type' : ctype
, 'content' : content
}
def mkthumbs(thumbs):
    """Index a list of thumbnail dicts by pixel height: {height: url}."""
    by_height = {}
    for thumb in thumbs:
        by_height[thumb['height']] = thumb['url']
    return by_height
# parse_endcard(card): normalize an endscreen element (CHANNEL / VIDEO /
# PLAYLIST / WEBSITE / CREATOR_MERCHANDISE) into
# {'type': ctype, 'content': {...}}; unknown styles yield an 'error' payload.
# NOTE(review): the `content = {` openers and closing braces (original lines
# 263, 267, 269, 275, 277, 283, 285, 287-288, 292-294, 296) were lost in
# this extraction.
259 def parse_endcard ( card
):
260 card
= card
. get ( 'endscreenElementRenderer' , card
) #only sometimes nested
261 ctype
= card
[ 'style' ]
262 if ctype
== "CHANNEL" :
264 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
265 'title' : card
[ 'title' ][ 'simpleText' ],
266 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
268 elif ctype
== "VIDEO" :
270 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
271 'title' : card
[ 'title' ][ 'simpleText' ],
272 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
273 'views' : delR ( card
[ 'metadata' ][ 'simpleText' ]),
274 # XXX: no channel name
276 elif ctype
== "PLAYLIST" :
278 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
279 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
280 'title' : card
[ 'title' ][ 'simpleText' ],
281 'author' : delL ( card
[ 'metadata' ][ 'simpleText' ]),
282 'n_videos' : intT ( delR ( card
[ 'playlistLength' ][ 'simpleText' ])),
284 elif ctype
== "WEBSITE" or ctype
== "CREATOR_MERCHANDISE" :
# external link: unwrap the youtube.com/redirect indirection first
286 url
= clean_url ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ])
289 'domain' : urlparse ( url
). netloc
,
290 'title' : card
[ 'title' ][ 'simpleText' ],
291 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
# fallback for unhandled card styles: dump the card for debugging
295 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
297 return { 'type' : ctype
, 'content' : content
}
# Build the normalized card lists and merge them, de-duplicating across the
# two sources (see the original comments kept below).
299 infocards
= [ parse_infocard ( card
) for card
in cards
]
300 endcards
= [ parse_endcard ( card
) for card
in endsc
]
301 # combine cards to weed out duplicates. for videos and playlists prefer
302 # infocards, for channels and websites prefer endcards, as those have more
303 # information than the other.
304 # if the card type is not in ident, we use the whole card for comparison
305 # (otherwise they'd all replace each other)
# NOTE(review): entries of this dict from original lines 307 and 310-312 are
# missing (presumably 'VIDEO'/'WEBSITE' mappings and the closing brace).
306 ident
= { # ctype -> ident
308 'PLAYLIST' : 'playlist_id' ,
309 'CHANNEL' : 'channel_id' ,
# getident: a card's de-dup key — its id field per `ident`, else the card itself
313 getident
= lambda c
: c
[ 'content' ]. get ( ident
. get ( c
[ 'type' ]), c
)
# mkexclude: keys of all cards of the given types
314 mkexclude
= lambda cards
, types
: [ getident ( c
) for c
in cards
if c
[ 'type' ] in types
]
# exclude: drop cards whose key appears in `without`
315 exclude
= lambda cards
, without
: [ c
for c
in cards
if getident ( c
) not in without
]
317 allcards
= exclude ( infocards
, mkexclude ( endcards
, [ 'CHANNEL' , 'WEBSITE' ])) + \
318 exclude ( endcards
, mkexclude ( infocards
, [ 'VIDEO' , 'PLAYLIST' ]))
# ISO-3166 alpha-2 codes of all countries; used to invert YouTube's
# availableCountries whitelist into a blacklist for display.
all_countries = """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""".split()
whitelisted = sorted(meta2.get('availableCountries', []))
blacklisted = sorted(set(all_countries) - set(whitelisted))
# The key/value pairs of prepare_metadata's returned dict; its `return {`
# opener (original lines 333-334) and the closing lines were lost in this
# extraction.
335 'title' : meta1
[ 'title' ],
336 'author' : meta1
[ 'author' ],
337 'channel_id' : meta1
[ 'channelId' ],
338 'description' : meta1
[ 'shortDescription' ],
339 'published' : meta2
[ 'publishDate' ],
340 'views' : meta1
[ 'viewCount' ],
# duration in whole seconds
341 'length' : int ( meta1
[ 'lengthSeconds' ]),
342 'rating' : meta1
[ 'averageRating' ],
343 'category' : meta2
[ 'category' ],
# width/height ratio computed earlier in the function
344 'aspectr' : aspect_ratio
,
345 'unlisted' : meta2
[ 'isUnlisted' ],
346 'countries' : whitelisted
,
347 'blacklisted' : blacklisted
,
348 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
349 'infocards' : infocards
,
350 'endcards' : endcards
,
351 'all_cards' : allcards
,
352 'subtitles' : subtitles
,
class RedditException(Exception):
    """Raised when the reddit listing endpoint answers non-ok or without 'data'."""
# fetch_reddit(...): query old.reddit.com for a (multi)reddit listing, keep
# only youtube.com/youtu.be/invidio.us links, and return (videos, before,
# after) for paging; raises RedditException on a non-ok response or one
# without 'data' (see the garbled docstring kept below).
# NOTE(review): this extraction lost the empty-subreddits guard context
# (original lines 366-369), parts of the query dict (371-373, 376),
# presumably an `e = entry['data']` unpacking near 386, the try/except
# around the video-id regex (388-392), and the videos.append({ opener
# (395-403) — confirm against upstream.
356 def fetch_reddit ( subreddits
, sorted_by
= "hot" , time
= None , *, limit
= 36 ,
357 count
= None , before
= None , after
= None ):
359 fetches data from a subreddit (or a multireddit like gif+gifs) and
360 filters/sorts results.
361 sorted_by values: hot, new, rising, controversial, top
362 time values: hour, week, month, year, all (for top and controversial)
363 returns a tuple of ([ {video} ],before,after)
365 # TODO: /api/info.json?id=t3_h7mjes == /by_id/t3_h7mjes.json
368 return [], None , None
# assemble the listing query string from the k/v pairs below
370 query
= '&' . join ([ f
" {k} = {v} " for k
, v
in {
374 'limit' : limit
, # 1..100 (default 25)
375 't' : time
, # hour,week,month,year,all
377 multireddit
= '+' . join ( subreddits
)
378 r
= requests
. get ( f
"https://old.reddit.com/r/ {multireddit} / {sorted_by} .json? {query} " ,
379 headers
={ 'User-Agent' : 'Mozilla/5.0' })
380 if not r
. ok
or not 'data' in r
. json ():
381 raise RedditException ( r
. text
)
# stable sort: entries with score > 1 first, reddit's order otherwise
384 entries
= sorted ( r
. json ()[ 'data' ][ 'children' ], key
= lambda e
: e
[ 'data' ][ 'score' ] > 1 , reverse
= True )
385 for entry
in entries
:
387 if e
[ 'domain' ] not in [ 'youtube.com' , 'youtu.be' , 'invidio.us' ]:
390 # Note: youtube.com/<video_id> is not valid (404s), but seen in the wild.
391 video_id
= re
. match ( r
'^https?://(?:www.|m.)?(?:youtube.com/watch\?(?:.*&)?v=|youtu.be/|youtube.com/embed/|youtube.com/)([-_0-9A-Za-z]+)' , e
[ 'url' ]). group ( 1 )
393 continue # XXX: should we log that?
394 if not video_id
: continue
396 'video_id' : video_id
,
398 'url' : e
[ 'permalink' ],
399 'n_comments' : e
[ 'num_comments' ],
400 'n_karma' : e
[ 'score' ],
401 'subreddit' : e
[ 'subreddit' ],
# paging cursors straight from the listing envelope
404 before
= r
. json ()[ 'data' ][ 'before' ]
405 after
= r
. json ()[ 'data' ][ 'after' ]
407 return videos
, before
, after
# Debug helper: pretty-print `args` to stderr, forcing UTF-8 output.
# NOTE(review): original lines 408-409 and 411 (presumably the enclosing
# def/guard that binds `args`) were lost in this extraction — confirm
# against upstream.
410 from pprint
import pprint
412 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))