195
by mbp at sourcefrog
- import lovely urlgrabber library |
1 |
# This library is free software; you can redistribute it and/or
|
2 |
# modify it under the terms of the GNU Lesser General Public
|
|
3 |
# License as published by the Free Software Foundation; either
|
|
4 |
# version 2.1 of the License, or (at your option) any later version.
|
|
5 |
#
|
|
6 |
# This library is distributed in the hope that it will be useful,
|
|
7 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
8 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
9 |
# Lesser General Public License for more details.
|
|
10 |
#
|
|
11 |
# You should have received a copy of the GNU Lesser General Public
|
|
12 |
# License along with this library; if not, write to the
|
|
13 |
# Free Software Foundation, Inc.,
|
|
14 |
# 59 Temple Place, Suite 330,
|
|
15 |
# Boston, MA 02111-1307 USA
|
|
16 |
||
17 |
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
|
18 |
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
|
19 |
||
20 |
"""Module for downloading files from a pool of mirrors
|
|
21 |
||
22 |
DESCRIPTION
|
|
23 |
||
24 |
This module provides support for downloading files from a pool of
|
|
25 |
mirrors with configurable failover policies. To a large extent, the
|
|
26 |
failover policy is chosen by using different classes derived from
|
|
27 |
the main class, MirrorGroup.
|
|
28 |
||
29 |
Instances of MirrorGroup (and cousins) act very much like URLGrabber
|
|
30 |
instances in that they have urlread, urlgrab, and urlopen methods.
|
|
31 |
They can, therefore, be used in very similar ways.
|
|
32 |
||
33 |
from urlgrabber.grabber import URLGrabber
|
|
34 |
from urlgrabber.mirror import MirrorGroup
|
|
35 |
gr = URLGrabber()
|
|
36 |
mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
|
|
37 |
'http://bar.org/maybe/somewhere/else/',
|
|
38 |
'ftp://baz.net/some/other/place/entirely/'])
|
|
39 |
mg.urlgrab('relative/path.zip')
|
|
40 |
||
41 |
The assumption is that all mirrors are identical AFTER the base urls
|
|
42 |
specified, so that any mirror can be used to fetch any file.
|
|
43 |
||
44 |
FAILOVER
|
|
45 |
||
46 |
The failover mechanism is designed to be customized by subclassing
|
|
47 |
from MirrorGroup to change the details of the behavior. In general,
|
|
48 |
the classes maintain a master mirror list and a "current mirror"
|
|
49 |
index. When a download is initiated, a copy of this list and index
|
|
50 |
is created for that download only. The specific failover policy
|
|
51 |
depends on the class used, and so is documented in the class
|
|
52 |
documentation. Note that ANY behavior of the class can be
|
|
53 |
overridden, so any failover policy at all is possible (although
|
|
54 |
you may need to change the interface in extreme cases).
|
|
55 |
||
56 |
CUSTOMIZATION
|
|
57 |
||
58 |
Most customization of a MirrorGroup object is done at instantiation
|
|
59 |
time (or via subclassing). There are four major types of
|
|
60 |
customization:
|
|
61 |
||
62 |
1) Pass in a custom urlgrabber - The passed in urlgrabber will be
|
|
63 |
used (by default... see #2) for the grabs, so options to it
|
|
64 |
apply for the url-fetching
|
|
65 |
||
66 |
2) Custom mirror list - Mirror lists can simply be a list of
|
|
67 |
mirror strings (as shown in the example above) but each can
|
|
68 |
also be a dict, allowing for more options. For example, the
|
|
69 |
first mirror in the list above could also have been:
|
|
70 |
||
71 |
{'mirror': 'http://foo.com/some/directory/',
|
|
72 |
'grabber': <a custom grabber to be used for this mirror>,
|
|
73 |
'kwargs': { <a dict of arguments passed to the grabber> }}
|
|
74 |
||
75 |
All mirrors are converted to this format internally. If
|
|
76 |
'grabber' is omitted, the default grabber will be used. If
|
|
77 |
kwargs are omitted, then (duh) they will not be used.
|
|
78 |
||
79 |
3) Pass keyword arguments when instantiating the mirror group.
|
|
80 |
See, for example, the failure_callback argument.
|
|
81 |
||
82 |
4) Finally, any kwargs passed in for the specific file (to the
|
|
83 |
urlgrab method, for example) will be folded in. The options
|
|
84 |
passed into the grabber's urlXXX methods will override any
|
|
85 |
options specified in a custom mirror dict.
|
|
86 |
||
87 |
"""
|
|
88 |
||
89 |
# $Id: mirror.py,v 1.12 2004/09/07 21:19:54 mstenner Exp $
|
|
90 |
||
91 |
import random |
|
92 |
import thread # needed for locking to make this threadsafe |
|
93 |
||
94 |
from grabber import URLGrabError, CallbackObject |
|
95 |
||
96 |
# Module-wide debug switch: when true, the mirror code in this file
# reports its mirror choices and failures through DBPRINT.
DEBUG = 0


def DBPRINT(*args):
    """Print the given strings on one line, separated by single spaces.

    Every argument must already be a string; the values are joined
    as-is, without any conversion.
    """
    # Parenthesised single-argument form: produces the same output as the
    # Python 2 print statement and is also a valid Python 3 call.
    print(' '.join(args))


# Use the translation function from the i18n module when it can be
# imported; otherwise fall back to an identity function so that messages
# pass through untranslated.
try:
    from i18n import _
except ImportError:
    def _(st):
        """Return *st* unchanged (no-op stand-in for the translator)."""
        return st
|
103 |
||
104 |
class GrabRequest:
    """Plain attribute holder describing one specific fetch.

    The class defines nothing itself; MirrorGroup creates one instance
    per download and attaches the per-request state to it as ordinary
    attributes (func, url, kw, and a private copy of the mirror list
    and index).  Keeping that state apart from the MirrorGroup buys
    two things:

      1) thread-safety becomes a little easier, because the request
         data is not shared between downloads
      2) each request can carry its own parameters
    """
|
|
113 |
||
114 |
class MirrorGroup:
    """Base Mirror class

    Instances of this class are built with a grabber object and a list
    of mirrors.  Then all calls to urlXXX should be passed relative urls.
    The requested file will be searched for on the first mirror.  If the
    grabber raises an exception (possibly after some retries) then that
    mirror will be removed from the list, and the next will be attempted.
    If all mirrors are exhausted, then an exception will be raised.

    MirrorGroup has the following failover policy:

      * downloads begin with the first mirror

      * by default (see default_action below) a failure (after retries)
        causes it to increment the local AND master indices.  Also,
        the current mirror is removed from the local list (but NOT the
        master list - the mirror can potentially be used for other
        files)

      * if the local list is ever exhausted, a URLGrabError will be
        raised (errno=256, no more mirrors)

    OPTIONS

      In addition to the required arguments "grabber" and "mirrors",
      MirrorGroup also takes the following optional arguments:

      default_action

        A dict that describes the actions to be taken upon failure
        (after retries).  default_action can contain any of the
        following keys (shown here with their default values):

          default_action = {'increment': 1,
                            'increment_master': 1,
                            'remove': 1,
                            'remove_master': 0,
                            'fail': 0}

        In this context, 'increment' means "use the next mirror" and
        'remove' means "never use this mirror again".  The two
        'master' values refer to the instance-level mirror list (used
        for all files), whereas the non-master values refer to the
        current download only.

        The 'fail' option will cause immediate failure by re-raising
        the exception and no further attempts to get the current
        download.

        This dict can be set at instantiation time,
          mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
        at method-execution time (only applies to current fetch),
          filename = mg.urlgrab(url, default_action={'increment': 0})
        or by returning an action dict from the failure_callback
          return {'fail':0}
        in increasing precedence.

        If all three of these were done, the net result would be:
              {'increment': 0,         # set in method
               'increment_master': 1,  # class default
               'remove': 1,            # class default
               'remove_master': 0,     # class default
               'fail': 0}              # set at instantiation, reset
                                       # from callback

      failure_callback

        this is a callback that will be called when a mirror "fails",
        meaning the grabber raises some URLGrabError.  If this is a
        tuple, it is interpreted to be of the form (cb, args, kwargs)
        where cb is the actual callable object (function, method,
        etc).  Otherwise, it is assumed to be the callable object
        itself.  The callback will be passed a grabber.CallbackObject
        instance along with args and kwargs (if present).  The following
        attributes are defined within the instance:

           obj.exception    = < exception that was raised >
           obj.mirror       = < the mirror that was tried >
           obj.relative_url = < url relative to the mirror >
           obj.url          = < full url that failed >
                              # .url is just the combination of .mirror
                              # and .relative_url

        The failure callback can return an action dict, as described
        above.

        Like default_action, the failure_callback can be set at
        instantiation time or when the urlXXX method is called.  In
        the latter case, it applies only for that fetch.

        The callback can re-raise the exception quite easily.  For
        example, this is a perfectly adequate callback function:

          def callback(obj): raise obj.exception

        WARNING: do not save the exception object (or the
        CallbackObject instance).  As they contain stack frame
        references, they can lead to circular references.

    Notes:
      * The behavior can be customized by deriving and overriding the
        'CONFIGURATION METHODS'
      * The 'grabber' instance is kept as a reference, not copied.
        Therefore, the grabber instance can be modified externally
        and changes will take effect immediately.
    """

    # notes on thread-safety:

    #   A GrabRequest should never be shared by multiple threads because
    #   it's never saved inside the MG object and never returned outside it.
    #   therefore, it should be safe to access/modify grabrequest data
    #   without a lock.  However, accessing the mirrors and _next attributes
    #   of the MG itself must be done when locked to prevent (for example)
    #   removal of the wrong mirror.

    ##############################################################
    #  CONFIGURATION METHODS  -  intended to be overridden to
    #                            customize behavior
    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the MirrorGroup object.

        REQUIRED ARGUMENTS

          grabber  - URLGrabber instance
          mirrors  - a list of mirrors

        OPTIONAL ARGUMENTS

          failure_callback  - callback to be used when a mirror fails
          default_action    - dict of failure actions

        See the module-level and class level documentation for more
        details.
        """

        # OVERRIDE IDEAS:
        #   shuffle the list to randomize order
        self.grabber = grabber
        self.mirrors = self._parse_mirrors(mirrors)
        self._next = 0
        self._lock = thread.allocate_lock()
        self.default_action = None
        self._process_kwargs(kwargs)

    # if these values are found in **kwargs passed to one of the urlXXX
    # methods, they will be stripped before getting passed on to the
    # grabber
    options = ['default_action', 'failure_callback']

    def _process_kwargs(self, kwargs):
        """Store the instance-level options found in *kwargs*.

        Sets self.failure_callback and self.default_action; each is None
        when the corresponding key is absent.
        """
        self.failure_callback = kwargs.get('failure_callback')
        self.default_action = kwargs.get('default_action')

    def _parse_mirrors(self, mirrors):
        """Return a new list with every mirror in dict form.

        String entries are wrapped as {'mirror': <string>}; all other
        entries are assumed to already be mirror dicts and are kept
        as the same objects (not copied).
        """
        # basestring covers both str and unicode on Python 2; the name
        # does not exist on Python 3, where str is the only text type.
        try:
            text_types = basestring
        except NameError:
            text_types = str

        parsed_mirrors = []
        for m in mirrors:
            # isinstance (rather than an exact type comparison) so that
            # unicode strings and str subclasses are wrapped as well,
            # instead of being mistaken for ready-made mirror dicts.
            if isinstance(m, text_types):
                m = {'mirror': m}
            parsed_mirrors.append(m)
        return parsed_mirrors

    def _load_gr(self, gr):
        """Give the request *gr* its own snapshot of the master state.

        Copies the master mirror list and the master index onto gr
        (gr.mirrors, gr._next) while holding the instance lock.
        """
        # OVERRIDE IDEAS:
        #   shuffle gr list
        self._lock.acquire()
        try:
            gr.mirrors = list(self.mirrors)
            gr._next = self._next
        finally:
            # always release, even if the copy raises
            self._lock.release()

    def _get_mirror(self, gr):
        """Return the mirror dict the request *gr* should try next.

        Raises URLGrabError (errno 256) when gr has no mirrors left.
        """
        # OVERRIDE IDEAS:
        #   return a random mirror so that multiple mirrors get used
        #   even without failures.
        if not gr.mirrors:
            raise URLGrabError(256, _('No more mirrors to try.'))
        return gr.mirrors[gr._next]

    def _failure(self, gr, cb_obj):
        """React to a failed attempt on the current mirror of *gr*.

        Calls the failure callback (the per-request one takes priority
        over the instance-level one), works out the effective action
        dict, advances/removes mirrors through increment_mirror, and
        re-raises the exception currently being handled when the
        effective action has a true 'fail' value.

        This must be called from inside the 'except' block that caught
        the grabber's exception, because the re-raise is a bare 'raise'.
        """
        # OVERRIDE IDEAS:
        #   inspect the error - remove=1 for 404, remove=2 for connection
        #                       refused, etc. (this can also be done via
        #                       the callback)
        cb = gr.kw.get('failure_callback') or self.failure_callback
        if cb:
            if isinstance(cb, tuple):
                cb, args, kwargs = cb
            else:
                args, kwargs = (), {}
            action = cb(cb_obj, *args, **kwargs) or {}
        else:
            action = {}

        # The action is resolved key by key rather than as a whole:
        # instance defaults are overridden by the per-request
        # default_action, which is overridden by whatever the callback
        # returned.  'or {}' tolerates an explicit default_action=None.
        merged = dict(self.default_action or {})
        merged.update(gr.kw.get('default_action') or {})
        merged.update(action)

        self.increment_mirror(gr, merged)
        if merged.get('fail', 0):
            # re-raise the exception the caller is currently handling
            raise

    def increment_mirror(self, gr, action=None):
        """Tell the mirror object increment the mirror index

        This increments the mirror index, which amounts to telling the
        mirror object to use a different mirror (for this and future
        downloads).

        This is a SEMI-public method.  It will be called internally,
        and you may never need to call it.  However, it is provided
        (and is made public) so that the calling program can increment
        the mirror choice for methods like urlopen.  For example, with
        urlopen, there's no good way for the mirror group to know that
        an error occurs mid-download (it's already returned and given
        you the file object).

        action is a dict (None or omitted means an empty dict) whose
        keys are read as follows, with these defaults when absent:

          remove_master     0  delete the mirror from the master list
          increment_master  1  advance the master index, but only if it
                               currently points at the failed mirror and
                               remove_master is false
          remove            1  delete the mirror from this download's
                               list
          increment         1  advance this download's index; only
                               consulted when remove is false

        Either index wraps back to 0 when it runs past the end of its
        list.

        beware of remove=0 as it can lead to infinite loops
        """
        if action is None:
            action = {}
        badmirror = gr.mirrors[gr._next]

        self._lock.acquire()
        try:
            ind = self.mirrors.index(badmirror)
        except ValueError:
            # the mirror is no longer in the master list; nothing to
            # update there
            pass
        else:
            if action.get('remove_master', 0):
                del self.mirrors[ind]
            elif self._next == ind and action.get('increment_master', 1):
                self._next += 1
            if self._next >= len(self.mirrors):
                self._next = 0
        finally:
            # release on every path so an unexpected error cannot leave
            # the group locked
            self._lock.release()

        if action.get('remove', 1):
            del gr.mirrors[gr._next]
        elif action.get('increment', 1):
            gr._next += 1
        if gr._next >= len(gr.mirrors):
            gr._next = 0

        if DEBUG:
            grm = [m['mirror'] for m in gr.mirrors]
            DBPRINT('GR   mirrors: [%s] %i' % (' '.join(grm), gr._next))
            selfm = [m['mirror'] for m in self.mirrors]
            DBPRINT('MAIN mirrors: [%s] %i' % (' '.join(selfm), self._next))

    #####################################################################
    # NON-CONFIGURATION METHODS
    # these methods are designed to be largely workhorse methods that
    # are not intended to be overridden.  That doesn't mean you can't;
    # if you want to, feel free, but most things can be done
    # by overriding the configuration methods :)

    def _join_url(self, base_url, rel_url):
        """Join *base_url* and *rel_url* with exactly one '/' between them.

        A separator is inserted when neither side supplies one, and one
        is dropped when both sides supply one, so the junction never
        becomes '//'.
        """
        if base_url.endswith('/'):
            if rel_url.startswith('/'):
                # both sides carry the separator: keep only one
                return base_url + rel_url[1:]
            return base_url + rel_url
        if rel_url.startswith('/'):
            return base_url + rel_url
        return base_url + '/' + rel_url

    def _mirror_try(self, func, url, kw):
        """Run grabber method *func* for *url*, failing over across mirrors.

        func is the name of the grabber method ('urlgrab', 'urlopen' or
        'urlread'), url is relative to the mirror, and kw holds the
        keyword arguments given for this fetch.  Returns whatever the
        grabber method returns for the first mirror that succeeds.

        Every URLGrabError raised by a grabber is handed to _failure,
        which may re-raise it.  The loop otherwise continues with the
        next mirror and only ends with a successful return or with an
        exception, for example the URLGrabError from _get_mirror once
        the request's mirror list is empty.
        """
        gr = GrabRequest()
        gr.func = func
        gr.url = url
        gr.kw = dict(kw)
        self._load_gr(gr)

        # The mirror-group options stay on gr.kw (read by _failure) but
        # must not reach the grabber.  Work on a copy so the dict the
        # caller handed in is left untouched.
        passthrough = dict(kw)
        for name in self.options:
            passthrough.pop(name, None)

        while True:
            mirrorchoice = self._get_mirror(gr)
            fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
            # per-mirror kwargs first, then the per-fetch ones on top so
            # that the fetch-specific values win
            kwargs = dict(mirrorchoice.get('kwargs') or {})
            kwargs.update(passthrough)
            grabber = mirrorchoice.get('grabber') or self.grabber
            func_ref = getattr(grabber, func)
            if DEBUG:
                DBPRINT('MIRROR: trying %s -> %s' % (url, fullurl))
            try:
                return func_ref(fullurl, **kwargs)
            except URLGrabError as e:
                if DEBUG:
                    DBPRINT('MIRROR: failed')
                obj = CallbackObject()
                obj.exception = e
                obj.mirror = mirrorchoice['mirror']
                obj.relative_url = gr.url
                obj.url = fullurl
                self._failure(gr, obj)

    def urlgrab(self, url, filename=None, **kwargs):
        """Call the grabber's urlgrab for the mirror-relative *url*.

        filename and any extra keyword arguments are passed on to the
        grabber; default_action and failure_callback are consumed here.
        """
        kw = dict(kwargs)
        kw['filename'] = filename
        return self._mirror_try('urlgrab', url, kw)

    def urlopen(self, url, **kwargs):
        """Call the grabber's urlopen for the mirror-relative *url*.

        Extra keyword arguments are passed on to the grabber;
        default_action and failure_callback are consumed here.
        """
        kw = dict(kwargs)
        return self._mirror_try('urlopen', url, kw)

    def urlread(self, url, limit=None, **kwargs):
        """Call the grabber's urlread for the mirror-relative *url*.

        limit and any extra keyword arguments are passed on to the
        grabber; default_action and failure_callback are consumed here.
        """
        kw = dict(kwargs)
        kw['limit'] = limit
        return self._mirror_try('urlread', url, kw)
|
426 |
||
427 |
||
428 |
class MGRandomStart(MirrorGroup):
    """A mirror group that starts at a random mirror in the list.

    This behavior of this class is identical to MirrorGroup, except that
    it starts at a random location in the mirror list.
    """

    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the object

        The arguments for initialization are the same as for MirrorGroup.
        With an empty mirror list the starting index is left at the
        value set by MirrorGroup.__init__.
        """
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # Measure the parsed list rather than the raw argument, so any
        # iterable of mirrors works.  Skip the draw when there is nothing
        # to choose from: randrange(0) raises ValueError, which would
        # make construction fail where the base class succeeds.
        if self.mirrors:
            self._next = random.randrange(len(self.mirrors))
|
442 |
||
443 |
class MGRandomOrder(MirrorGroup):
    """A mirror group that walks its mirrors in a shuffled order.

    Apart from the ordering, this class behaves exactly like
    MirrorGroup.  The shuffle happens once, when the object is
    created, and the resulting order is kept from then on; a new
    random mirror is NOT drawn after each failure.
    """

    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the object

        Takes the same arguments as MirrorGroup.
        """
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # shuffle the master list in place
        master = self.mirrors
        random.shuffle(master)
|
459 |
||
460 |
# Running this module directly does nothing; it is meant to be imported.
if __name__ == '__main__':
    pass
|