1
# This library is free software; you can redistribute it and/or
2
# modify it under the terms of the GNU Lesser General Public
3
# License as published by the Free Software Foundation; either
4
# version 2.1 of the License, or (at your option) any later version.
6
# This library is distributed in the hope that it will be useful,
7
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9
# Lesser General Public License for more details.
11
# You should have received a copy of the GNU Lesser General Public
12
# License along with this library; if not, write to the
13
# Free Software Foundation, Inc.,
14
# 59 Temple Place, Suite 330,
15
# Boston, MA 02111-1307 USA
17
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
20
"""Module for downloading files from a pool of mirrors
24
This module provides support for downloading files from a pool of
25
mirrors with configurable failover policies. To a large extent, the
26
failover policy is chosen by using different classes derived from
27
the main class, MirrorGroup.
29
Instances of MirrorGroup (and cousins) act very much like URLGrabber
30
instances in that they have urlread, urlgrab, and urlopen methods.
31
They can, therefore, be used in very similar ways.
33
from urlgrabber.grabber import URLGrabber
34
from urlgrabber.mirror import MirrorGroup
36
mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
37
'http://bar.org/maybe/somewhere/else/',
38
'ftp://baz.net/some/other/place/entirely/'])
39
mg.urlgrab('relative/path.zip')
41
The assumption is that all mirrors are identical AFTER the base urls
42
specified, so that any mirror can be used to fetch any file.
46
The failover mechanism is designed to be customized by subclassing
47
from MirrorGroup to change the details of the behavior. In general,
48
the classes maintain a master mirror list and a "current mirror"
49
index. When a download is initiated, a copy of this list and index
50
is created for that download only. The specific failover policy
51
depends on the class used, and so is documented in the class
52
documentation. Note that ANY behavior of the class can be
53
overridden, so any failover policy at all is possible (although
54
you may need to change the interface in extreme cases).
58
Most customization of a MirrorGroup object is done at instantiation
59
time (or via subclassing). There are four major types of customization:
62
1) Pass in a custom urlgrabber - The passed in urlgrabber will be
63
used (by default... see #2) for the grabs, so options to it
64
apply for the url-fetching
66
2) Custom mirror list - Mirror lists can simply be a list of
67
string mirrors (as shown in the example above) but each can
68
also be a dict, allowing for more options. For example, the
69
first mirror in the list above could also have been:
71
{'mirror': 'http://foo.com/some/directory/',
72
'grabber': <a custom grabber to be used for this mirror>,
73
'kwargs': { <a dict of arguments passed to the grabber> }}
75
All mirrors are converted to this format internally. If
76
'grabber' is omitted, the default grabber will be used. If
77
kwargs are omitted, then (duh) they will not be used.
79
3) Pass keyword arguments when instantiating the mirror group.
80
See, for example, the failure_callback argument.
82
4) Finally, any kwargs passed in for the specific file (to the
83
urlgrab method, for example) will be folded in. The options
84
passed into the grabber's urlXXX methods will override any
85
options specified in a custom mirror dict.
89
# $Id: mirror.py,v 1.12 2004/09/07 21:19:54 mstenner Exp $
92
import thread # needed for locking to make this threadsafe
94
from grabber import URLGrabError, CallbackObject
97
def DBPRINT(*args):
    """Print the given strings on one line, separated by single spaces.

    Debug-output helper.  Every argument must already be a string,
    because the pieces are combined with ' '.join().
    """
    # The parenthesised form behaves exactly like the old print statement
    # on Python 2 and is also a valid function call on Python 3.
    print(' '.join(args))
101
except ImportError, msg:
105
"""This is a dummy class used to hold information about the specific
106
request. For example, a single file. By maintaining this information
107
separately, we can accomplish two things:
109
1) make it a little easier to be threadsafe
110
2) have request-specific parameters
117
Instances of this class are built with a grabber object and a list
118
of mirrors. Then all calls to urlXXX should be passed relative urls.
119
The requested file will be searched for on the first mirror. If the
120
grabber raises an exception (possibly after some retries) then that
121
mirror will be removed from the list, and the next will be attempted.
122
If all mirrors are exhausted, then an exception will be raised.
124
MirrorGroup has the following failover policy:
126
* downloads begin with the first mirror
128
* by default (see default_action below) a failure (after retries)
129
causes it to increment the local AND master indices. Also,
130
the current mirror is removed from the local list (but NOT the
131
master list - the mirror can potentially be used for other files)
134
* if the local list is ever exhausted, a URLGrabError will be
135
raised (errno=256, no more mirrors)
139
In addition to the required arguments "grabber" and "mirrors",
140
MirrorGroup also takes the following optional arguments:
144
A dict that describes the actions to be taken upon failure
145
(after retries). default_action can contain any of the
146
following keys (shown here with their default values):
148
default_action = {'increment': 1,
149
'increment_master': 1,
154
In this context, 'increment' means "use the next mirror" and
155
'remove' means "never use this mirror again". The two
156
'master' values refer to the instance-level mirror list (used
157
for all files), whereas the non-master values refer to the
158
current download only.
160
The 'fail' option will cause immediate failure by re-raising
161
the exception and no further attempts to get the current
164
This dict can be set at instantiation time,
165
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
166
at method-execution time (only applies to current fetch),
167
filename = mg.urlgrab(url, default_action={'increment': 0})
168
or by returning an action dict from the failure_callback
170
in increasing precedence.
172
If all three of these were done, the net result would be:
173
{'increment': 0, # set in method
174
'increment_master': 1, # class default
175
'remove': 1, # class default
176
'remove_master': 0, # class default
177
'fail': 0} # set at instantiation, reset
182
this is a callback that will be called when a mirror "fails",
183
meaning the grabber raises some URLGrabError. If this is a
184
tuple, it is interpreted to be of the form (cb, args, kwargs)
185
where cb is the actual callable object (function, method,
186
etc). Otherwise, it is assumed to be the callable object
187
itself. The callback will be passed a grabber.CallbackObject
188
instance along with args and kwargs (if present). The following
189
attributes are defined within the instance:
191
obj.exception = < exception that was raised >
192
obj.mirror = < the mirror that was tried >
193
obj.relative_url = < url relative to the mirror >
194
obj.url = < full url that failed >
195
# .url is just the combination of .mirror and .relative_url
198
The failure callback can return an action dict, as described
201
Like default_action, the failure_callback can be set at
202
instantiation time or when the urlXXX method is called. In
203
the latter case, it applies only for that fetch.
205
The callback can re-raise the exception quite easily. For
206
example, this is a perfectly adequate callback function:
208
def callback(obj): raise obj.exception
210
WARNING: do not save the exception object (or the
211
CallbackObject instance). As they contain stack frame
212
references, they can lead to circular references.
215
* The behavior can be customized by deriving and overriding the
216
'CONFIGURATION METHODS'
217
* The 'grabber' instance is kept as a reference, not copied.
218
Therefore, the grabber instance can be modified externally
219
and changes will take effect immediately.
222
# notes on thread-safety:
224
# A GrabRequest should never be shared by multiple threads because
225
# it's never saved inside the MG object and never returned outside it.
226
# therefore, it should be safe to access/modify grabrequest data
227
# without a lock. However, accessing the mirrors and _next attributes
228
# of the MG itself must be done when locked to prevent (for example)
229
# removal of the wrong mirror.
231
##############################################################
232
# CONFIGURATION METHODS - intended to be overridden to
234
def __init__(self, grabber, mirrors, **kwargs):
    """Initialize the MirrorGroup object.

    grabber - URLGrabber instance (kept by reference, not copied)
    mirrors - a list of mirrors; each entry is a base-url string or a
              dict with a 'mirror' key

    Optional keyword arguments:

    failure_callback - callback to be used when a mirror fails
    default_action   - dict of failure actions

    See the module-level and class level documentation for more
    details.
    """
    # The list is kept in the order given.  A subclass that wants a
    # randomized order can shuffle self.mirrors after calling this.
    self.grabber = grabber
    self.mirrors = self._parse_mirrors(mirrors)
    # NOTE(review): this assignment was lost from the source listing and
    # is restored from context (_load_gr reads self._next and downloads
    # are documented to begin with the first mirror) - confirm upstream.
    self._next = 0
    # guards self.mirrors and self._next, which all downloads share
    self._lock = thread.allocate_lock()
    self.default_action = None
    self._process_kwargs(kwargs)
260
# if these values are found in **kwargs passed to one of the urlXXX
261
# methods, they will be stripped before getting passed on to the grabber
263
options = ['default_action', 'failure_callback']
265
def _process_kwargs(self, kwargs):
    """Store the failure-handling options found in kwargs on the instance.

    kwargs - dict of keyword arguments.  Only 'failure_callback' and
             'default_action' are read; any other key is ignored.  A
             key that is absent sets the matching attribute to None.
    """
    self.failure_callback = kwargs.get('failure_callback')
    self.default_action = kwargs.get('default_action')
269
def _parse_mirrors(self, mirrors):
    """Return the mirrors as a new list in the internal dict format.

    mirrors - iterable of mirrors.  A plain string is taken to be the
              base url and is wrapped as {'mirror': <string>}; any other
              entry (normally a dict that already has a 'mirror' key)
              is passed through unchanged, as the same object.

    The input sequence itself is never modified.
    """
    parsed_mirrors = []
    for m in mirrors:
        # isinstance also accepts str subclasses, which the former
        # exact-type comparison silently left unwrapped
        if isinstance(m, str):
            m = {'mirror': m}
        parsed_mirrors.append(m)
    return parsed_mirrors
276
def _load_gr(self, gr):
    """Give the request object its own copy of the mirror list and index.

    gr - per-download request object.  On return gr.mirrors is a new
         list holding the same mirror dicts as self.mirrors, and
         gr._next equals self._next.

    Subclasses may override this, e.g. to shuffle the request's list.
    """
    # self.mirrors and self._next are shared by all downloads, so both
    # are read under the lock to get a consistent snapshot.
    # NOTE(review): the acquire/release lines were lost from the source
    # listing and are restored from the file's thread-safety notes.
    self._lock.acquire()
    try:
        gr.mirrors = list(self.mirrors)
        gr._next = self._next
    finally:
        self._lock.release()
284
def _get_mirror(self, gr):
    """Return the mirror dict that the request should try next.

    gr - per-download request object; gr.mirrors is its remaining
         mirror list and gr._next the index of the current mirror.

    Raises URLGrabError (errno 256) when gr.mirrors is empty.
    """
    # Idea for overriding subclasses: return a random mirror so that
    # multiple mirrors get used even without failures.
    if not gr.mirrors:
        raise URLGrabError(256, _('No more mirrors to try.'))
    return gr.mirrors[gr._next]
292
def _failure(self, gr, cb_obj):
    """Handle one mirror failure: run the callback, then advance the mirror.

    gr     - per-download request object; gr.kw holds the keyword
             arguments given for this fetch
    cb_obj - object describing the failure, handed to the callback

    The failure callback is taken from gr.kw, falling back to
    self.failure_callback.  It is either a callable or a tuple
    (cb, args, kwargs) and is called as cb(cb_obj, *args, **kwargs).

    The action dict is assembled key by key, in increasing precedence:
    self.default_action, then gr.kw['default_action'], then the dict
    returned by the callback.  It is passed to self.increment_mirror().

    If the resulting action has a true 'fail' entry, the exception that
    is currently being handled is re-raised.  This method must therefore
    be called from inside an except block.
    """
    # Idea for overriding subclasses: inspect the error and choose the
    # action accordingly (e.g. remove the mirror on a 404).  The same
    # can be done from the failure callback.
    cb = gr.kw.get('failure_callback') or self.failure_callback
    if cb:
        if isinstance(cb, tuple):
            cb, args, kwargs = cb
        else:
            args, kwargs = (), {}
        action = cb(cb_obj, *args, **kwargs) or {}
    else:
        action = {}

    # Two designs were considered.  Overriding the action as a whole
    # (use the callback's dict, else the per-fetch one, else the
    # instance default) was rejected in favour of falling through for
    # each individual key, which is what happens here.
    a = dict(self.default_action or {})
    # "or {}" so that an explicit default_action=None is tolerated
    a.update(gr.kw.get('default_action') or {})
    a.update(action)
    action = a

    self.increment_mirror(gr, action)
    # bare raise: re-raise the grabber's exception currently being handled
    if action.get('fail', 0): raise
318
def increment_mirror(self, gr, action=None):
    """Tell the mirror object to increment the mirror index

    This increments the mirror index, which amounts to telling the
    mirror object to use a different mirror (for this and future
    downloads).

    This is a SEMI-public method.  It will be called internally,
    and you may never need to call it.  However, it is provided
    (and is made public) so that the calling program can increment
    the mirror choice for methods like urlopen.  For example, with
    urlopen, there's no good way for the mirror group to know that
    an error occurs mid-download (it's already returned and given
    you the file object).

    gr     - per-download request object; gr.mirrors[gr._next] is the
             mirror that failed
    action - dict of failure actions, or None/{} to use the defaults.
             Keys read here (with their defaults):
               'remove' (1)            drop the mirror from gr.mirrors
               'increment' (1)         else move gr._next to the next one
               'remove_master' (0)     drop the mirror from self.mirrors
               'increment_master' (1)  else advance self._next, but only
                                       if it points at the failed mirror

    beware of remove=0 as it can lead to infinite loops
    """
    # NOTE(review): the locking, the two "+= 1" lines and the DEBUG
    # guard were lost from the source listing and are restored from
    # context - confirm against upstream urlgrabber.
    if action is None:
        action = {}
    badmirror = gr.mirrors[gr._next]

    # The master list and index are shared by all downloads, so they
    # are only touched while holding the lock.
    self._lock.acquire()
    try:
        try:
            ind = self.mirrors.index(badmirror)
        except ValueError:
            # already removed from the master list (e.g. by another
            # download); there is nothing left to update there
            pass
        else:
            if action.get('remove_master', 0):
                del self.mirrors[ind]
            elif self._next == ind and action.get('increment_master', 1):
                self._next += 1
            # wrap around when the index has run off the end
            if self._next >= len(self.mirrors): self._next = 0
    finally:
        self._lock.release()

    if action.get('remove', 1):
        # after the deletion gr._next already points at the next mirror
        del gr.mirrors[gr._next]
    elif action.get('increment', 1):
        gr._next += 1
    if gr._next >= len(gr.mirrors): gr._next = 0

    if DEBUG:
        grm = [m['mirror'] for m in gr.mirrors]
        DBPRINT('GR mirrors: [%s] %i' % (' '.join(grm), gr._next))
        selfm = [m['mirror'] for m in self.mirrors]
        DBPRINT('MAIN mirrors: [%s] %i' % (' '.join(selfm), self._next))
367
#####################################################################
368
# NON-CONFIGURATION METHODS
369
# these methods are designed to be largely workhorse methods that
370
# are not intended to be overridden. That doesn't mean you can't;
371
# if you want to, feel free, but most things can be done by
372
# overriding the configuration methods :)
374
def _join_url(self, base_url, rel_url):
    """Return rel_url appended to base_url, adding a '/' if neither has one.

    base_url - mirror base url, with or without a trailing '/'
    rel_url  - url relative to the mirror, with or without a leading '/'

    If either side already supplies the slash the two strings are simply
    concatenated.  When both do, the doubled slash is kept as is.
    """
    if base_url.endswith('/') or rel_url.startswith('/'):
        return base_url + rel_url
    return base_url + '/' + rel_url
380
def _mirror_try(self, func, url, kw):
    """Call grabber method func for url on one mirror after another.

    func - name of the grabber method to call, looked up with getattr
    url  - url relative to the mirror base urls
    kw   - dict of keyword arguments for this fetch.  The keys named in
           self.options are removed from it IN PLACE before the rest is
           passed to the grabber; a copy taken beforehand is kept on
           the request object for _failure() to read.

    Returns whatever the grabber method returns for the first mirror
    that succeeds.  Each URLGrabError from the grabber is handed to
    self._failure(); the loop ends when _failure() re-raises or when
    _get_mirror() raises because no mirrors are left.
    """
    # function-scope import: only needed for sys.exc_info() below
    import sys

    # NOTE(review): the request set-up, the "while" loop header, the
    # "try:" line, kwargs.update(kw) and the obj.exception / obj.url
    # assignments were lost from the source listing and are restored
    # from context - confirm against upstream urlgrabber.
    gr = GrabRequest()
    gr.func = func
    gr.url = url
    gr.kw = dict(kw)
    self._load_gr(gr)

    # strip the mirror-group-only options so the grabber never sees them
    for k in self.options:
        try: del kw[k]
        except KeyError: pass

    while 1:
        mirrorchoice = self._get_mirror(gr)
        fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
        # per-mirror kwargs first, so per-fetch options override them
        kwargs = dict(mirrorchoice.get('kwargs', {}))
        kwargs.update(kw)
        grabber = mirrorchoice.get('grabber') or self.grabber
        func_ref = getattr(grabber, func)
        if DEBUG: DBPRINT('MIRROR: trying %s -> %s' % (url, fullurl))
        try:
            return func_ref(fullurl, **kwargs)
        except URLGrabError:
            # sys.exc_info() instead of "except X, e" / "except X as e"
            # so the clause parses on every Python version
            e = sys.exc_info()[1]
            if DEBUG: DBPRINT('MIRROR: failed')
            obj = CallbackObject()
            obj.exception = e
            obj.mirror = mirrorchoice['mirror']
            obj.relative_url = gr.url
            obj.url = fullurl
            # may re-raise the exception being handled ('fail' action)
            self._failure(gr, obj)
410
def urlgrab(self, url, filename=None, **kwargs):
    """Fetch url from the mirrors using the grabber's urlgrab method.

    url      - url relative to the mirror base urls
    filename - passed on to the grabber as the 'filename' keyword
    kwargs   - further options for this fetch only

    Returns the result of self._mirror_try('urlgrab', url, options).
    """
    # work on a copy so the caller's keyword dict is left untouched
    kw = dict(kwargs)
    kw['filename'] = filename
    func = 'urlgrab'
    return self._mirror_try(func, url, kw)
416
def urlopen(self, url, **kwargs):
    """Open url on the mirrors using the grabber's urlopen method.

    url    - url relative to the mirror base urls
    kwargs - further options for this fetch only

    Returns the result of self._mirror_try('urlopen', url, options).
    """
    # NOTE(review): the two assignments below were lost from the source
    # listing (leaving kw and func undefined) and are restored by
    # analogy with urlgrab - confirm against upstream urlgrabber.
    kw = dict(kwargs)
    func = 'urlopen'
    return self._mirror_try(func, url, kw)
421
def urlread(self, url, limit=None, **kwargs):
    """Read url from the mirrors using the grabber's urlread method.

    url    - url relative to the mirror base urls
    limit  - passed on to the grabber as the 'limit' keyword
    kwargs - further options for this fetch only

    Returns the result of self._mirror_try('urlread', url, options).
    """
    # NOTE(review): the three assignments below were lost from the
    # source listing (leaving kw and func undefined) and are restored
    # by analogy with urlgrab - confirm against upstream urlgrabber.
    kw = dict(kwargs)
    kw['limit'] = limit
    func = 'urlread'
    return self._mirror_try(func, url, kw)
428
class MGRandomStart(MirrorGroup):
    """A mirror group that starts at a random mirror in the list.

    The behavior of this class is identical to MirrorGroup, except that
    it starts at a random location in the mirror list.
    """

    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the object

        The arguments for initialization are the same as for MirrorGroup.

        Raises ValueError (from random.randrange) if mirrors is empty.
        """
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # start at a random position instead of the one the base class chose
        self._next = random.randrange(len(mirrors))
443
class MGRandomOrder(MirrorGroup):
    """A mirror group that uses mirrors in a random order.

    The behavior of this class is identical to MirrorGroup, except that
    it uses the mirrors in a random order.  Note that the order is set at
    initialization time and fixed thereafter.  That is, it does not pick a
    random mirror after each failure.
    """

    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the object

        The arguments for initialization are the same as for MirrorGroup.
        """
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # shuffles the group's own parsed list in place, once
        random.shuffle(self.mirrors)
460
if __name__ == '__main__':