# app/common.py — from git.gir.st, subscriptionfeed.git
7 from xml
. etree
import ElementTree
8 from configparser
import ConfigParser
9 from datetime
import datetime
, timezone
10 from urllib
. parse
import parse_qs
, urlparse
# Path of the INI configuration file; overridable through the YT_CONFIG
# environment variable, falling back to the system-wide default.
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
# 'cf' is the module-level ConfigParser instance (created above, outside this
# excerpt — presumably `cf = ConfigParser()`; TODO confirm). Note that
# ConfigParser.read() silently skips files that cannot be opened.
cf.read(config_filename)
# Note: currently expiring after 10 minutes. googlevideo-urls are valid for 5h59m, but this makes reddit very stale and premiere videos won't start. TODO: expire when video is livestream/premiere/etc
# Transparently cache all HTTP requests made via the 'requests' library in an
# in-process memory backend; only successful (HTTP 200) responses are stored.
requests_cache.install_cache(backend='memory', expire_after=10*60, allowable_codes=(200,))
19 # Note: this should only be required for the 'memory' backed cache.
20 # TODO: only run for long-running processes, i.e. the frontend
21 from threading
import Timer
23 requests_cache
. remove_expired_responses ()
24 t
= Timer ( sec
, purge_cache
, args
=( sec
,))
29 def fetch_xml ( feed_type
, feed_id
):
30 r
= requests
. get ( f
"https://www.youtube.com/feeds/videos.xml? {feed_type} = {feed_id} " )
36 def parse_xml ( xmldata
):
38 'atom' : "http://www.w3.org/2005/Atom" ,
39 'yt' : "http://www.youtube.com/xml/schemas/2015" ,
40 'media' : "http://search.yahoo.com/mrss/"
43 feed
= ElementTree
. fromstring ( xmldata
)
44 title
= feed
. find ( 'atom:title' , ns
). text
45 author
= feed
. find ( 'atom:author/atom:name' , ns
). text \
46 if feed
. find ( 'atom:author' , ns
) else None
48 for entry
in feed
. findall ( 'atom:entry' , ns
):
50 'video_id' : entry
. find ( 'yt:videoId' , ns
). text
,
51 'title' : entry
. find ( 'atom:title' , ns
). text
,
52 'published' : entry
. find ( 'atom:published' , ns
). text
,
53 'channel_id' : entry
. find ( 'yt:channelId' , ns
). text
,
54 'author' : entry
. find ( 'atom:author' , ns
). find ( 'atom:name' , ns
). text
,
55 # extra fields for pull_subs/webhook:
56 'updated' : entry
. find ( 'atom:updated' , ns
). text
,
59 return title
, author
, videos
61 def update_channel ( db
, xmldata
):
62 if not xmldata
: return False
64 # Note: websub does not return global author, hence taking from first video
65 title
, _
, videos
= parse_xml ( xmldata
)
68 for i
, video
in enumerate ( videos
):
69 now
= datetime
. now ( timezone
. utc
)
70 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
71 published
= dateutil
. parser
. parse ( video
[ 'published' ])
72 # if update and published time are near-identical, we assume it's new.
73 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
75 else : #, it's just an update to an older video.
79 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
80 VALUES (?, ?, ?, datetime(?), datetime(?))
89 if i
== 0 : # only required once per feed
91 INSERT OR REPLACE INTO channels (id, name)
93 """ , ( video
[ 'channel_id' ], video
[ 'author' ]))
98 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
100 returns: best-quality muxed video stream, player_response, error-type/mesage
101 error types: 'initial': the request to get_video_info was malformed
102 'player': playabilityStatus != OK
103 'internal': [livestream, geolocked, exhausted]
105 for el
in [ 'embedded' , 'detailpage' ]: #sometimes, only one or the other works
106 r
= requests
. get ( f
"https://www.youtube.com/get_video_info" +
107 f
"?video_id= {video_id} " +
108 f
"&eurl=https://youtube.googleapis.com/v/ {video_id} " +
111 f
"&hl=en_US" ) #"&hl=en&gl=US"
112 params
= parse_qs ( r
. text
)
113 if 'errorcode' in params
: # status=fail
114 return None , None , 'initial' , f
"MALFORMED: {params['reason'][0]}" # TODO: assuming we haven't fucked it up, this error comes up if the video id is garbage. give better error message
116 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
117 playabilityStatus
= metadata
[ 'playabilityStatus' ][ 'status' ]
118 if playabilityStatus
!= "OK" :
119 if playabilityStatus
== "UNPLAYABLE" :
120 continue # try again with next el value (or fail as exhausted)
121 reason
= metadata
[ 'playabilityStatus' ][ 'reason' ]
122 return None , None , 'player' , f
" {playabilityStatus} : {reason} "
123 if 'liveStreamability' in metadata
[ 'playabilityStatus' ]:
124 # can also check .microformat.liveBroadcastDetails.isLiveNow
125 return None , metadata
, 'internal' , "livestream"
127 if not 'formats' in metadata
[ 'streamingData' ]:
128 #TODO: hls only video with those params (kAZCrtJJaAo):
130 # "isLiveDefaultBroadcast": true,
131 # "isLowLatencyLiveStream": true,
132 # "isLiveContent": true,
133 # "isPostLiveDvr": true
136 formats
= metadata
[ 'streamingData' ][ 'formats' ]
137 for ( i
, v
) in enumerate ( formats
):
138 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
139 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
140 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
142 # todo: check if we have urls or try again
143 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
145 if 'gcr' in parse_qs ( url
):
146 return None , metadata
, 'internal' , "geolocked"
148 return url
, metadata
, None , None
150 return None , metadata
, 'internal' , "exhausted"
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Descramble a stream signature and build the playable URL.

    cipher: a parse_qs()-style mapping (values are single-element lists)
            containing 's' (scrambled signature), 'url', and optionally
            'sp' (signature parameter name) and 'sig' (ready signature).
    algo:   space-separated operations — "r" reverses the signature,
            "sN" splices off the first N characters, "wN" swaps
            character 0 with character N.
    Returns the stream URL with the signature query parameter appended.
    """
    chars = list(cipher['s'][0])
    for step in algo.split():
        kind, num = re.match(r"([rsw])(\d+)?", step).groups()
        pos = int(num) % len(chars) if num else 0
        if kind == 'r':
            chars.reverse()
        elif kind == 's':
            del chars[:pos]
        elif kind == 'w':
            chars[0], chars[pos] = chars[pos], chars[0]
    # An explicit 'sig' (and parameter name 'sp') from the cipher wins over
    # the locally descrambled signature.
    param = cipher.get('sp', ['signature'])[0]
    value = cipher.get('sig', [''.join(chars)])[0]
    return f"{cipher['url'][0]}&{param}={value}"
165 def prepare_metadata ( metadata
):
166 meta1
= metadata
[ 'videoDetails' ]
167 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
168 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] \
169 if 'cards' in metadata
else []
170 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] \
171 if 'endscreen' in metadata
else []
173 # TODO: wrong on non-4:3 and non-16:9 videos! (e.g. l06PlYNShpQ)
174 #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
175 aspect_ratio
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'width' ] / meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'height' ]
178 { 'url' : cc
[ 'baseUrl' ],
179 'code' : cc
[ 'languageCode' ],
180 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
181 'name' : cc
[ 'name' ][ 'simpleText' ]}
182 for cc
in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ][ 'captionTracks' ]
183 ], key
= lambda cc
: cc
[ 'autogenerated' ]) if 'captions' in metadata
and 'captionTracks' in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ] else [] # TODO<,^: cleanup
186 # externals URLs are redirected through youtube.com/redirect, but we
187 # may encounter internal URLs, too
188 url
= parse_qs ( urlparse ( url
). query
). get ( 'q' ,[ url
])[ 0 ]
# Remove left-/rightmost word from string (named defs instead of lambda
# assignments, per PEP 8 E731):
def delL(s):
    """Drop the first space-separated word of s ('' if s has no space)."""
    return s.partition(' ')[2]
def delR(s):
    """Drop the last space-separated word of s ('' if s has no space)."""
    return s.rpartition(' ')[0]
# Thousands separator aware int():
def intT(s):
    """Parse an int that may contain ',' thousands separators."""
    return int(s.replace(',', ''))
195 def parse_infocard ( card
):
196 card
= card
[ 'cardRenderer' ]
197 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
198 content
= card
[ 'content' ][ ctype
]
199 if ctype
== "pollRenderer" :
202 'question' : content
[ 'question' ][ 'simpleText' ],
203 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) \
204 for a
in content
[ 'choices' ]],
206 elif ctype
== "videoInfoCardContentRenderer" :
209 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
210 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
211 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
212 'length' : content
[ 'lengthString' ][ 'simpleText' ], # '23:03'
213 'views' : intT ( delR ( content
[ 'viewCountText' ][ 'simpleText' ])),
215 elif ctype
== "playlistInfoCardContentRenderer" :
218 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
219 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
220 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
221 'author' : delL ( content
[ 'channelName' ][ 'simpleText' ]),
222 'n_videos' : intT ( content
[ 'playlistVideoCount' ][ 'simpleText' ]),
224 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
. get ( 'command' ,{}). keys (): # <TODO: cleanup
227 'url' : clean_url ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]),
228 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
229 'title' : content
[ 'title' ][ 'simpleText' ],
230 # XXX: no thumbnails for infocards
234 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
236 return { 'type' : ctype
, 'content' : content
}
def mkthumbs(thumbs):
    """Index a list of thumbnail dicts by pixel height -> url."""
    by_height = {}
    for thumb in thumbs:
        by_height[thumb['height']] = thumb['url']
    return by_height
240 def parse_endcard ( card
):
241 card
= card
. get ( 'endscreenElementRenderer' , card
) #only sometimes nested
242 ctype
= card
[ 'style' ]
243 if ctype
== "CHANNEL" :
245 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
246 'title' : card
[ 'title' ][ 'simpleText' ],
247 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
249 elif ctype
== "VIDEO" :
251 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
252 'title' : card
[ 'title' ][ 'simpleText' ],
253 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
254 'views' : delR ( card
[ 'metadata' ][ 'simpleText' ]),
255 # XXX: no channel name
257 elif ctype
== "PLAYLIST" :
259 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
260 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
261 'title' : card
[ 'title' ][ 'simpleText' ],
262 'author' : delL ( card
[ 'metadata' ][ 'simpleText' ]),
263 'n_videos' : intT ( delR ( card
[ 'playlistLength' ][ 'simpleText' ])),
265 elif ctype
== "WEBSITE" or ctype
== "CREATOR_MERCHANDISE" :
268 'url' : clean_url ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ]),
269 'domain' : urlparse ( url
). netloc
, # TODO: remove .domain
270 'title' : card
[ 'title' ][ 'simpleText' ],
271 'icons' : mkthumbs ( card
[ 'image' ][ 'thumbnails' ]),
275 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
277 return { 'type' : ctype
, 'content' : content
}
280 'title' : meta1
[ 'title' ],
281 'author' : meta1
[ 'author' ],
282 'channel_id' : meta1
[ 'channelId' ],
283 'description' : meta1
[ 'shortDescription' ],
284 'published' : meta2
[ 'publishDate' ],
285 'views' : meta1
[ 'viewCount' ],
286 'length' : int ( meta1
[ 'lengthSeconds' ]),
287 'rating' : meta1
[ 'averageRating' ],
288 'category' : meta2
[ 'category' ],
289 'aspectr' : aspect_ratio
,
290 'unlisted' : meta2
[ 'isUnlisted' ],
291 'countries' : meta2
[ 'availableCountries' ],
292 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
293 'infocards' : [ parse_infocard ( card
) for card
in cards
],
294 'endcards' : [ parse_endcard ( card
) for card
in endsc
],
295 'subtitles' : subtitles
,
299 from pprint
import pprint
301 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))