]>
git.gir.st - subscriptionfeed.git/blob - app/common.py
7 from xml
. etree
import ElementTree
8 from configparser
import ConfigParser
9 from datetime
import datetime
, timezone
10 from urllib
. parse
import parse_qs
, urlparse
13 config_filename
= os
. environ
. get ( 'YT_CONFIG' , '/etc/yt/config.ini' )
14 cf
. read ( config_filename
)
16 # Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
17 requests_cache
. install_cache ( backend
= 'memory' , expire_after
= 10 * 60 , allowable_codes
=( 200 ,))
19 # Note: this should only be required for the 'memory'-backed cache.
20 # TODO: only run for long-running processes, i.e. the frontend
21 from threading
import Timer
23 requests_cache
. remove_expired_responses ()
24 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
29 def fetch_xml ( feed_type
, feed_id
):
30 r
= requests
. get ( f
"https://www.youtube.com/feeds/videos.xml? {feed_type} = {feed_id} " )
def parse_xml(xmldata):
    """Parse a YouTube Atom feed into its title, author and video entries.

    Args:
        xmldata: the feed document, as accepted by
            ``ElementTree.fromstring``.

    Returns:
        A ``(title, author, videos)`` tuple. ``author`` is the text of the
        feed-level ``atom:author/atom:name`` element, or ``None`` when the
        feed carries no such element. ``videos`` is a list with one dict per
        ``atom:entry``, holding the keys ``video_id``, ``title``,
        ``published``, ``channel_id``, ``author`` and ``updated``.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xmldata`` is not well-formed.
        AttributeError: if the feed title or one of the per-entry fields
            is missing.
    """
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/",
    }

    feed = ElementTree.fromstring(xmldata)
    title = feed.find('atom:title', ns).text

    # Compare against None explicitly: the truth value of an Element tells
    # whether it has children, not whether find() located it. Looking up the
    # name element directly also covers an <author> that has no <name>.
    author_name = feed.find('atom:author/atom:name', ns)
    author = author_name.text if author_name is not None else None

    videos = [
        {
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
        }
        for entry in feed.findall('atom:entry', ns)
    ]

    return title, author, videos
61 def update_channel ( db
, xmldata
):
62 if not xmldata
: return False
64 # Note: websub does not return global author, hence taking from first video
65 title
, _
, videos
= parse_xml ( xmldata
)
68 for i
, video
in enumerate ( videos
):
69 now
= datetime
. now ( timezone
. utc
)
70 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
71 published
= dateutil
. parser
. parse ( video
[ 'published' ])
72 # if update and published time are near-identical, we assume it's new.
73 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
75 else : #, it's just an update to an older video.
79 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
80 VALUES (?, ?, ?, datetime(?), datetime(?))
89 if i
== 0 : # only required once per feed
91 INSERT OR REPLACE INTO channels (id, name)
93 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
98 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
100 returns: best-quality muxed video stream, player_response, error-type/message
101 error types: player, malformed, livestream, geolocked, exhausted
103 player_error
= None # for 'exhausted'
104 for el
in [ 'embedded' , 'detailpage' ]: #sometimes, only one or the other works
105 r
= requests
. get ( f
"https://www.youtube.com/get_video_info" +
106 f
"?video_id= {video_id} " +
107 f
"&eurl=https://youtube.googleapis.com/v/ {video_id} " +
110 f
"&hl=en_US" ) #"&hl=en&gl=US"
111 params
= parse_qs ( r
. text
)
112 if 'errorcode' in params
: # status=fail
113 return None , None , 'malformed' , params
[ 'reason' ][ 0 ]
115 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
116 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
117 if playabilityStatus
!= "OK" :
118 playabilityReason
= metadata
[ 'playabilityStatus' ][ 'reason' ]
119 player_error
= f
" {playabilityStatus} : {playabilityReason} "
120 if playabilityStatus
== "UNPLAYABLE" :
121 continue # try again with next el value (or fail as exhausted)
122 # without videoDetails, there's only the error message (playabilityStatus,responseContext,trackingParams)
123 maybe_metadata
= metadata
if 'videoDetails' in metadata
else None
124 return None , maybe_metadata
, 'player' , player_error
125 if 'liveStreamability' in metadata
[ 'playabilityStatus' ]:
126 # can also check .microformat.liveBroadcastDetails.isLiveNow
127 return None , metadata
, 'livestream' , None
129 if not 'formats' in metadata
[ 'streamingData' ]:
130 #TODO: hls only video with those params (kAZCrtJJaAo):
132 # "isLiveDefaultBroadcast": true,
133 # "isLowLatencyLiveStream": true,
134 # "isLiveContent": true,
135 # "isPostLiveDvr": true
138 formats
= metadata
[ 'streamingData' ][ 'formats' ]
139 for ( i
, v
) in enumerate ( formats
):
140 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
141 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
142 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
144 # todo: check if we have urls or try again
145 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
147 if 'gcr' in parse_qs ( url
):
148 return None , metadata
, 'geolocked' , None
150 return url
, metadata
, None , None
152 return None , metadata
, 'exhausted' , player_error
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Build a stream URL with its signature parameter appended.

    Args:
        cipher: mapping of parameter name to list of values (the shape
            ``parse_qs`` returns). Reads ``url`` and either ``sig`` (a ready
            signature) or ``s`` (a scrambled one); ``sp`` optionally names
            the query parameter and defaults to ``signature``.
        algo: whitespace-separated operations applied to ``s`` in order:
            ``r`` reverses, ``sN`` drops the first N characters, ``wN``
            swaps the first character with the one at index N. N is taken
            modulo the current signature length.

    Returns:
        ``url`` followed by ``&<sp>=<signature>``.

    Raises:
        KeyError: if ``url`` is missing, or if neither ``sig`` nor ``s`` is
            present.
    """
    sp = cipher.get('sp', ['signature'])[0]

    # A ready-made signature takes precedence and needs no unscrambling;
    # checking it first also makes this path work when 's' is absent.
    if 'sig' in cipher:
        return f"{cipher['url'][0]}&{sp}={cipher['sig'][0]}"

    signature = list(cipher['s'][0])
    for c in algo.split():
        m = re.match(r"([rsw])(\d+)?", c)
        if m is None:
            continue  # not a known operation; skip instead of crashing
        op, ix = m.groups()
        # Guard the modulo: the signature may be (or have become) empty.
        ix = int(ix) % len(signature) if ix and signature else 0

        if op == 'r':
            signature.reverse()
        elif op == 's':
            signature = signature[ix:]
        elif op == 'w' and signature:
            signature[0], signature[ix] = signature[ix], signature[0]

    sig = ''.join(signature)
    return f"{cipher['url'][0]}&{sp}={sig}"
167 def prepare_metadata ( metadata
):
168 meta1
= metadata
[ 'videoDetails' ]
169 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
170 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] \
171 if 'cards' in metadata
else []
172 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] \
173 if 'endscreen' in metadata
else []
175 # thumbnails are either 4:3 or 16:9
176 some_img
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ]
177 aspect_ratio
= some_img
[ 'width' ] / some_img
[ 'height' ]
178 # the actual video streams have exact information:
179 if 'streamingData' in metadata
:
180 sd
= metadata
[ 'streamingData' ]
181 some_stream
= ( sd
. get ( 'adaptiveFormats' ,[]) + sd
. get ( 'formats' ,[]))[ 0 ]
182 aspect_ratio
= some_stream
[ 'width' ] / some_stream
[ 'height' ]
185 { 'url' : cc
[ 'baseUrl' ],
186 'code' : cc
[ 'languageCode' ],
187 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
188 'name' : cc
[ 'name' ][ 'simpleText' ]}
189 for cc
in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ][ 'captionTracks' ]
190 ], key
= lambda cc
: cc
[ 'autogenerated' ]) if 'captions' in metadata
and 'captionTracks' in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ] else [] # TODO<,^: cleanup
193 # external URLs are redirected through youtube.com/redirect, but we
194 # may encounter internal URLs, too
195 return parse_qs ( urlparse ( url
). query
). get ( 'q' ,[ url
])[ 0 ]
# Remove left-/rightmost word from string:
def delL(s):
    """Return ``s`` without its leftmost space-delimited word.

    Everything up to and including the first space is dropped; a string
    containing no space yields ``''``.
    """
    return s.partition(' ')[2]

def delR(s):
    """Return ``s`` without its rightmost space-delimited word.

    Everything from the last space onward is dropped; a string containing
    no space yields ``''``.
    """
    return s.rpartition(' ')[0]

# Thousands separator aware int():
def intT(s):
    """Convert ``s`` to ``int`` after removing every ``,`` from it.

    Raises:
        ValueError: if what remains is not a valid integer literal.
    """
    return int(s.replace(',', ''))
202 def parse_infocard ( card
):
203 card
= card
[ 'cardRenderer' ]
204 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
205 content
= card
[ 'content' ][ ctype
]
206 if ctype
== "pollRenderer" :
209 'question' : content
[ 'question' ][ 'simpleText' ],
210 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) \
211 for a
in content
[ 'choices' ]],
213 elif ctype
== "videoInfoCardContentRenderer" :
216 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
217 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
218 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
219 'length' : content
[ 'lengthString' ][ 'simpleText' ], # '23:03'
220 'views' : intT ( delR ( content
[ 'viewCountText' ][ 'simpleText' ])),
222 elif ctype
== "playlistInfoCardContentRenderer" :
225 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
226 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
227 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
228 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
229 'n_videos' : intT ( content
[ 'playlistVideoCount' ][ 'simpleText' ]),
231 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
. get ( 'command' ,{}). keys (): # <TODO: cleanup
234 'url' : clean_url ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]),
235 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
236 'title' : content
[ 'title' ][ 'simpleText' ],
237 # XXX: no thumbnails for infocards
241 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
243 return { 'type' : ctype
, 'content' : content
}
def mkthumbs(thumbs):
    """Index thumbnail URLs by their height.

    Args:
        thumbs: iterable of mappings, each providing 'height' and 'url'.

    Returns:
        A dict from each entry's 'height' to its 'url'. When several
        entries share a height, the last one wins.
    """
    by_height = {}
    for thumb in thumbs:
        by_height[thumb['height']] = thumb['url']
    return by_height
247 def parse_endcard ( card
):
248 card
= card
. get ( 'endscreenElementRenderer' , card
) #only sometimes nested
249 ctype
= card
[ 'style' ]
250 if ctype
== "CHANNEL" :
252 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
253 'title' : card
[ 'title' ][ 'simpleText' ],
254 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
256 elif ctype
== "VIDEO" :
258 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
259 'title' : card
[ 'title' ][ 'simpleText' ],
260 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
261 'views' : delR ( card
[ 'metadata' ][ 'simpleText' ]),
262 # XXX: no channel name
264 elif ctype
== "PLAYLIST" :
266 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
267 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
268 'title' : card
[ 'title' ][ 'simpleText' ],
269 'author' : delL ( card
[ 'metadata' ][ 'simpleText' ]),
270 'n_videos' : intT ( delR ( card
[ 'playlistLength' ][ 'simpleText' ])),
272 elif ctype
== "WEBSITE" or ctype
== "CREATOR_MERCHANDISE" :
275 'url' : clean_url ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ]),
276 'domain' : urlparse ( url
). netloc
, # TODO: remove .domain
277 'title' : card
[ 'title' ][ 'simpleText' ],
278 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
282 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
284 return { 'type' : ctype
, 'content' : content
}
286 all_countries
= """AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA BB BD
287 BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW BY BZ CA CC CD CF CG CH
288 CI CK CL CM CN CO CR CU CV CW CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER
289 ES ET FI FJ FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR GS GT
290 GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ IR IS IT JE JM JO JP KE
291 KG KH KI KM KN KP KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD
292 ME MF MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC NE NF
293 NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PN PR PS PT PW PY QA
294 RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR SS ST SV SX
295 SY SZ TC TD TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US UY UZ
296 VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW""" . split ()
297 whitelisted
= sorted ( meta2
[ 'availableCountries' ])
298 blacklisted
= sorted ( set ( all_countries
) - set ( whitelisted
))
301 'title' : meta1
[ 'title' ],
302 'author' : meta1
[ 'author' ],
303 'channel_id' : meta1
[ 'channelId' ],
304 'description' : meta1
[ 'shortDescription' ],
305 'published' : meta2
[ 'publishDate' ],
306 'views' : meta1
[ 'viewCount' ],
307 'length' : int ( meta1
[ 'lengthSeconds' ]),
308 'rating' : meta1
[ 'averageRating' ],
309 'category' : meta2
[ 'category' ],
310 'aspectr' : aspect_ratio
,
311 'unlisted' : meta2
[ 'isUnlisted' ],
312 'countries' : whitelisted
,
313 'blacklisted' : blacklisted
,
314 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
315 'infocards' : [ parse_infocard ( card
) for card
in cards
],
316 'endcards' : [ parse_endcard ( card
) for card
in endsc
],
317 'subtitles' : subtitles
,
321 from pprint
import pprint
323 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))