# git.gir.st - subscriptionfeed.git - blob: app/common.py
6 from datetime
import datetime
, timezone
7 from xml
. etree
import ElementTree
8 from urllib
. parse
import parse_qs
9 from configparser
import ConfigParser
# Load the INI configuration. The YT_CONFIG environment variable overrides the
# default path. `cf` is presumably the module-level ConfigParser instance
# created in the (not visible here) import/setup section -- TODO confirm.
config_filename = os.environ.get('YT_CONFIG', '/etc/yt/config.ini')
cf.read(config_filename)
# fetch_xml(feed_type, feed_id): GET youtube's Atom feed at
# https://www.youtube.com/feeds/videos.xml?<feed_type>=<feed_id>.
# NOTE(review): this file is a mangled HTML extraction; original source lines
# 17-21 (response/error handling and the return value) are missing from this
# view, so the function body below is incomplete. The spaces inside the
# f-string placeholders look like extraction artifacts, not real URL spaces
# -- TODO confirm against upstream.
15 def fetch_xml ( feed_type
, feed_id
):
16 r
= requests
. get ( f
"https://www.youtube.com/feeds/videos.xml? {feed_type} = {feed_id} " )
def parse_xml(xmldata):
    """Parse a youtube Atom feed (channel, playlist or websub push).

    Returns (title, author, channel_id, videos); videos is a list of dicts
    with keys video_id/title/published/channel_id/author/updated, all taken
    verbatim (as strings) from the feed.
    """
    ns = {
        'atom': "http://www.w3.org/2005/Atom",
        'yt': "http://www.youtube.com/xml/schemas/2015",
        'media': "http://search.yahoo.com/mrss/"
    }
    feed = ElementTree.fromstring(xmldata)
    # BUGFIX: ElementTree Elements are falsy when they have no *children*, so
    # truth-testing a find() result silently takes the wrong branch (e.g.
    # <yt:channelId> has text but no children). The ElementTree docs mandate
    # an explicit `is not None` check.
    author_el = feed.find('atom:author', ns)
    author = author_el.find('atom:name', ns).text if author_el is not None else None
    channel_el = feed.find('yt:channelId', ns)
    if channel_el is not None:
        channel_id = channel_el.text
    else:  # TODO: clean this up (websub has no yt:channelId, this should be adapted for playlists)
        self_url = feed.find('atom:link[@rel="self"]', ns).get('href')
        # raises TypeError if the self link carries no channel_id parameter --
        # better to fail loudly than to file videos under a wrong channel.
        channel_id = parse_qs(self_url.split('?')[1]).get('channel_id')[0]
    title = feed.find('atom:title', ns).text

    videos = []
    for entry in feed.findall('atom:entry', ns):
        videos.append({
            'video_id': entry.find('yt:videoId', ns).text,
            'title': entry.find('atom:title', ns).text,
            'published': entry.find('atom:published', ns).text,
            'channel_id': entry.find('yt:channelId', ns).text,
            'author': entry.find('atom:author', ns).find('atom:name', ns).text,
            # extra fields for pull_subs/webhook:
            'updated': entry.find('atom:updated', ns).text,
            #'description': entry.find('media:group',ns).find('media:description',ns).text ##xxx:missing for websub
        })

    return title, author, channel_id, videos
# update_channel(db, xmldata): parse the feed via parse_xml() and upsert its
# videos into the `videos` table and the channel name into `channels`.
# NOTE(review): mangled HTML extraction -- the docstring delimiters, the
# `for video in videos:` loop header, the `timestamp` assignment (original
# lines 70-74), the cursor/execute() call sites and the final commit/return
# are all missing from this view; the block below is incomplete and is
# preserved verbatim.
52 def update_channel ( db
, xmldata
):
54 returns True on success, False on failure. rigorous error checking is required, otherwise data will be lost!
55 the caller MUST (as per RFC 2119) write (append) the xmlfeed into a file on error.
57 if not xmldata
: return False
59 # Note: websub does not return global author
60 title
, author
, channel_id
, videos
= parse_xml ( xmldata
) #xxx: perl-code had this eval'd for a die
64 now
= datetime
. now ( timezone
. utc
)
65 updated
= dateutil
. parser
. parse ( video
[ 'updated' ])
66 published
= dateutil
. parser
. parse ( video
[ 'published' ])
67 # if update and published time are near-identical, it's new. use crawl time if it was published within a week.
68 # else, it's just an update to an older video (before we subscribed, so use original upload time).
69 if ( updated
- published
). seconds
< 60 and ( now
- published
). days
< 7 :
75 INSERT OR IGNORE INTO videos (id, channel_id, title, published, crawled)
76 VALUES (?, ?, ?, datetime(?), datetime(?))
77 """ , ( video
[ 'video_id' ], video
[ 'channel_id' ], video
[ 'title' ], video
[ 'published' ], timestamp
)) #XXX:errorcheck
79 # update channel name (we don't fetch it on subscribing)
80 author
= video
[ 'author' ] # XXX: doing this once per channel is enough (for pull-subs.pl)
82 INSERT OR REPLACE INTO channels (id, name)
84 """ , ( channel_id
, author
)) #XXX:errorcheck
# get_video_info(video_id, sts=0, algo=""): query youtube's get_video_info
# endpoint under several 'el' origins until one yields a playable response;
# returns (stream_url, player_response, error_type, error_message) with
# error_type one of 'initial'/'player'/'internal'/None.
# NOTE(review): mangled HTML extraction -- original lines 89/94 (docstring
# quotes), 99, 105-106 (presumably the &el=/&sts= URL fragments), 111, 119,
# 122, 127-128, 134, 137, 140, 142 and 144 are missing from this view; the
# block below is incomplete and is preserved verbatim.
88 def get_video_info ( video_id
, sts
= 0 , algo
= "" ):
90 returns the best-quality muxed video stream, the player_response, error-type/-mesage
91 error types: 'initial': the request to get_video_info was malformed
92 'player': playabilityStatus != OK
93 'internal': [livestream, geolocked, exhausted]
95 # TODO: caching, e.g. beaker? need to not cache premiering-soon videos/livestreams/etc, though
96 # responses are apparently valid for 6h; maybe cache for (video_length - 2h)
97 # TODO: errro types? ["invalid parameters", playabilitystatus, own]
98 # todo: a bit messy; should return all unscrambled video urls in best->worst quality
100 # we try to fetch the video multiple times using different origins
101 for el
in [ 'embedded' , 'detailpage' ]: # ['el-completely-absent',info,leanback,editpage,adunit,previewpage,profilepage]
102 r
= requests
. get ( f
"https://www.youtube.com/get_video_info" +
103 f
"?video_id= {video_id} " +
104 f
"&eurl=https://youtube.googleapis.com/v/ {video_id} " +
107 f
"&hl=en_US" ) #"&hl=en&gl=US"
108 params
= parse_qs ( r
. text
)
109 if 'errorcode' in params
: # status=fail
110 return None , None , 'initial' , f
"MALFORMED: {params['reason'][0]}" # TODO: assuming we haven't fucked it up, this error comes up if the video id is garbage. give better error message
112 metadata
= json
. loads ( params
. get ( 'player_response' )[ 0 ])
113 if metadata
[ 'playabilityStatus' ][ 'status' ] != "OK" :
114 if metadata
[ 'playabilityStatus' ][ 'status' ] == "UNPLAYABLE" :
115 continue # try again with different 'el' value. if none succeeds, we fall into "exhausted" path, which returns last tried metadata, from which the playabilityStatus.reason can be extracted. according to jwz/youtubedown, the worst error message comes from embedded, which is tried first, so it should be overwritten by a better message.
116 return None , None , 'player' , f
"{metadata['playabilityStatus']['status']}: {metadata['playabilityStatus']['reason']}"
117 if 'liveStreamability' in metadata
[ 'playabilityStatus' ]:
118 return None , metadata
, 'internal' , "livestream" # can also check .microformat.liveBroadcastDetails.isLiveNow
120 if not 'formats' in metadata
[ 'streamingData' ]:
121 #TODO: hls only video with those params (kAZCrtJJaAo):
123 # "isLiveDefaultBroadcast": true,
124 # "isLowLatencyLiveStream": true,
125 # "isLiveContent": true,
126 # "isPostLiveDvr": true
129 formats
= metadata
[ 'streamingData' ][ 'formats' ]
130 for ( i
, v
) in enumerate ( formats
):
131 if not ( 'cipher' in v
or 'signatureCipher' in v
): continue
132 cipher
= parse_qs ( v
. get ( 'cipher' ) or v
. get ( 'signatureCipher' ))
133 formats
[ i
][ 'url' ] = unscramble ( cipher
, algo
)
135 # todo: check if we have urls or try again
136 url
= sorted ( formats
, key
= lambda k
: k
[ 'height' ], reverse
= True )[ 0 ][ 'url' ]
138 if 'gcr' in parse_qs ( url
):
139 return None , metadata
, 'internal' , "geolocked"
141 return url
, metadata
, None , None
143 return None , metadata
, 'internal' , "exhausted"
def unscramble(cipher, algo): # test video id: UxxajLWwzqY
    """Apply youtube's signature-descrambling `algo` to a stream cipher.

    `cipher` is a parse_qs()-style dict (every value is a list). `algo` is a
    space-separated op list: 'r' reverses the signature, 's<n>' drops its
    first n characters, 'w<n>' swaps character 0 with character n (mod
    length). A pre-descrambled 'sig' in the cipher wins over computing one.
    Returns the stream URL with the signature parameter appended.
    """
    sig_chars = list(cipher['s'][0])
    for step in algo.split():
        op, arg = re.match(r"([rsw])(\d+)?", step).groups()
        if op == 'r':
            sig_chars.reverse()
        elif op == 's':
            del sig_chars[:int(arg)]
        elif op == 'w':
            j = int(arg) % len(sig_chars)
            sig_chars[0], sig_chars[j] = sig_chars[j], sig_chars[0]
    sp = cipher.get('sp', ['signature'])[0]
    sig = cipher['sig'][0] if 'sig' in cipher else ''.join(sig_chars)
    return f"{cipher['url'][0]}&{sp}={sig}"
# prepare_metadata(metadata): flatten the player_response JSON into the flat
# dict the templates consume (title/author/length/aspectr/infocards/endcards/
# subtitles/...). Contains nested helpers parse_infocard() and parse_endcard()
# that normalize youtube's card renderers into small dicts.
# NOTE(review): mangled HTML extraction -- the `subtitles = sorted([` opener,
# the per-branch `content = {` / `}` lines, the else-branches and the final
# `return {` ... `}` wrapper are missing from this view (and the missing lines
# may also rewrite `ctype`), so no behavior-preserving reconstruction is
# attempted; the block below is incomplete and preserved verbatim.
157 def prepare_metadata ( metadata
):
158 meta1
= metadata
[ 'videoDetails' ]
159 meta2
= metadata
[ 'microformat' ][ 'playerMicroformatRenderer' ]
160 cards
= metadata
[ 'cards' ][ 'cardCollectionRenderer' ][ 'cards' ] if 'cards' in metadata
else []
161 endsc
= metadata
[ 'endscreen' ][ 'endscreenRenderer' ][ 'elements' ] if 'endscreen' in metadata
else []
163 #aspect_ratio = meta2['embed']['width'] / meta2['embed']['height'], # sometimes absent
164 aspect_ratio
= meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'width' ] / meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'height' ]
167 { 'url' : cc
[ 'baseUrl' ],
168 'code' : cc
[ 'languageCode' ],
169 'autogenerated' : cc
. get ( 'kind' )== "asr" ,
170 'name' : cc
[ 'name' ][ 'simpleText' ]}
171 for cc
in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ][ 'captionTracks' ]
172 ], key
= lambda cc
: cc
[ 'autogenerated' ]) if 'captions' in metadata
and 'captionTracks' in metadata
[ 'captions' ][ 'playerCaptionsTracklistRenderer' ] else []
174 def parse_infocard ( card
):
175 card
= card
[ 'cardRenderer' ]
176 teaser
= card
[ 'teaser' ][ 'simpleCardTeaserRenderer' ][ 'message' ][ 'simpleText' ] # not used
177 ctype
= list ( card
[ 'content' ]. keys ())[ 0 ]
178 content
= card
[ 'content' ][ ctype
]
179 if ctype
== "pollRenderer" :
182 'question' : content
[ 'question' ][ 'simpleText' ],
183 'answers' : [( a
[ 'text' ][ 'simpleText' ], a
[ 'numVotes' ]) for a
in content
[ 'choices' ]],
185 elif ctype
== "videoInfoCardContentRenderer" :
188 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
189 'title' : content
[ 'videoTitle' ][ 'simpleText' ],
190 'author' : content
[ 'channelName' ][ 'simpleText' ], # 'by xXxXx'
191 'length' : content
[ 'lengthString' ][ 'simpleText' ], # '23:03'
192 'views' : content
[ 'viewCountText' ][ 'simpleText' ], # '421,248 views'
194 elif ctype
== "playlistInfoCardContentRenderer" :
197 'playlist_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'playlistId' ],
198 'video_id' : content
[ 'action' ][ 'watchEndpoint' ][ 'videoId' ],
199 'title' : content
[ 'playlistTitle' ][ 'simpleText' ],
200 'author' : content
[ 'channelName' ][ 'simpleText' ],
201 'n_videos' : content
[ 'playlistVideoCount' ][ 'simpleText' ], # '21'
203 elif ctype
== "simpleCardContentRenderer" and 'urlEndpoint' in content
. get ( 'command' ,{}). keys ():
206 'url' : parse_qs ( content
[ 'command' ][ 'urlEndpoint' ][ 'url' ]. split ( '?' )[ 1 ])[ 'q' ][ 0 ],
207 'domain' : content
[ 'displayDomain' ][ 'simpleText' ],
208 'title' : content
[ 'title' ][ 'simpleText' ],
209 'text' : content
[ 'actionButton' ][ 'simpleCardButtonRenderer' ][ 'text' ][ 'simpleText' ],
213 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
215 return { 'teaser' : teaser
, 'type' : ctype
, 'content' : content
}
217 def parse_endcard ( card
):
218 card
= card
[ 'endscreenElementRenderer' ] if 'endscreenElementRenderer' in card
. keys () else card
219 ctype
= card
[ 'style' ]
220 if ctype
== "CHANNEL" :
222 'channel_id' : card
[ 'endpoint' ][ 'browseEndpoint' ][ 'browseId' ],
223 'title' : card
[ 'title' ][ 'simpleText' ],
224 'icons' : { e
[ 'height' ]: e
[ 'url' ] for e
in card
[ 'image' ][ 'thumbnails' ]},
226 elif ctype
== "VIDEO" :
228 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
229 'title' : card
[ 'title' ][ 'simpleText' ],
230 'length' : card
[ 'videoDuration' ][ 'simpleText' ], # '12:21'
231 'views' : card
[ 'metadata' ][ 'simpleText' ], # '51,649 views'
233 elif ctype
== "PLAYLIST" :
235 'playlist_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'playlistId' ],
236 'video_id' : card
[ 'endpoint' ][ 'watchEndpoint' ][ 'videoId' ],
237 'title' : card
[ 'title' ][ 'simpleText' ],
238 'author' : card
[ 'metadata' ][ 'simpleText' ],
239 'n_videos' : card
[ 'playlistLength' ][ 'simpleText' ]. replace ( " videos" , "" ),
241 elif ctype
== "WEBSITE" :
243 'url' : parse_qs ( card
[ 'endpoint' ][ 'urlEndpoint' ][ 'url' ]. split ( '?' )[ 1 ])[ 'q' ][ 0 ],
244 'domain' : card
[ 'metadata' ][ 'simpleText' ],
245 'title' : card
[ 'title' ][ 'simpleText' ],
246 'icons' : { e
[ 'height' ]: e
[ 'url' ] for e
in card
[ 'image' ][ 'thumbnails' ]},
250 content
= { 'error' : f
" {ctype} is not implemented; <pre>{pprint.pformat(card)}</pre>" }
252 return { 'type' : ctype
, 'content' : content
}
255 'title' : meta1
[ 'title' ],
256 'author' : meta1
[ 'author' ],
257 'channel_id' : meta1
[ 'channelId' ],
258 'description' : meta1
[ 'shortDescription' ],
259 'published' : meta2
[ 'publishDate' ],
260 'views' : meta1
[ 'viewCount' ],
261 'length' : int ( meta1
[ 'lengthSeconds' ]),
262 'rating' : meta1
[ 'averageRating' ],
263 'category' : meta2
[ 'category' ],
264 'aspectr' : aspect_ratio
,
265 'unlisted' : meta2
[ 'isUnlisted' ],
266 'countries' : meta2
[ 'availableCountries' ],
267 'poster' : meta2
[ 'thumbnail' ][ 'thumbnails' ][ 0 ][ 'url' ],
268 'infocards' : [ parse_infocard ( card
) for card
in cards
],
269 'endcards' : [ parse_endcard ( card
) for card
in endsc
],
270 'subtitles' : subtitles
,
# NOTE(review): debug-print fragment -- original lines 271-273 and 275 are
# missing from this mangled extraction; presumably this is the body of a small
# debug helper taking *args (it references `args`, `codecs` and `sys`, which
# must be bound elsewhere in the file) -- TODO confirm against upstream.
# It pretty-prints to stderr through an explicit utf-8 writer.
274 from pprint
import pprint
276 pprint ( args
, stream
= codecs
. getwriter ( "utf-8" )( sys
. stderr
. buffer ))