#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the
#      Free Software Foundation, Inc.,
#      59 Temple Place, Suite 330,
#      Boston, MA  02111-1307  USA
#
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

"""A high-level cross-protocol url-grabber.

GENERAL ARGUMENTS (kwargs)

  Where possible, the module-level default is indicated, and legal
  values are also given.

  copy_local = 0   [0|1]

    ignored except for file:// urls, in which case it specifies
    whether urlgrab should still make a copy of the file, or simply
    point to the existing copy. The module level default for this
    option is 0.

  close_connection = 0   [0|1]

    tells URLGrabber to close the connection after a file has been
    transferred. This is ignored unless the download happens with the
    http keepalive handler (keepalive=1). Otherwise, the connection
    is left open for further use. The module level default for this
    option is 0 (keepalive connections will not be closed).

  keepalive = 1   [0|1]

    specifies whether keepalive should be used for HTTP/1.1 servers
    that support it. The module level default for this option is 1
    (keepalive is enabled).

  progress_obj = None

    a class instance that supports the following methods:
      po.start(filename, url, basename, length, text)
      # length will be None if unknown
      po.update(read) # read == bytes read so far
      po.end(read)    # read == total bytes read
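
    For illustration only, this interface could be satisfied by
    something as small as the following sketch (the class name is
    hypothetical; urlgrabber's progress module provides a real
    text_progress_meter):

      class SimpleMeter:
          def start(self, filename, url, basename, length, text):
              self.total = length             # may be None if unknown
          def update(self, read):
              print '%i bytes read' % read
          def end(self, read):
              print 'done: %i bytes' % read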

  text = None

    specifies alternative text to be shown at the beginning of the
    progress bar line. If not given, the basename of the file is used.

  throttle = 1.0

    a number - if it's an int, it's the bytes/second throttle limit.
    If it's a float, it is first multiplied by bandwidth. If throttle
    == 0, throttling is disabled. If None, the module-level default
    (which can be set on default_grabber.throttle) is used. See
    BANDWIDTH THROTTLING for more information.

  timeout = None

    a positive float expressing the number of seconds to wait for socket
    operations. If the value is None or 0.0, socket operations will block
    forever. Setting this option causes urlgrabber to call the settimeout
    method on the Socket object used for the request. See the Python
    documentation on settimeout for more information.
    http://www.python.org/doc/current/lib/socket-objects.html

  bandwidth = 0

    the nominal max bandwidth in bytes/second. If throttle is a float
    and bandwidth == 0, throttling is disabled. If None, the
    module-level default (which can be set on
    default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
    more information.

  range = None

    a tuple of the form (first_byte, last_byte) describing a byte
    range to retrieve. Either or both of the values may be set to
    None. If first_byte is None, byte offset 0 is assumed. If
    last_byte is None, the last byte available is assumed. Note that
    the range specification is python-like in that (0,10) will yield
    the first 10 bytes of the file.

    If set to None, no range will be used.

  reget = None   [None|'simple'|'check_timestamp']

    whether to attempt to reget a partially-downloaded file. Reget
    only applies to .urlgrab and (obviously) only if there is a
    partially downloaded file. Reget has two modes:

      'simple' -- the local file will always be trusted. If there
      are 100 bytes in the local file, then the download will always
      begin 100 bytes into the requested file.

      'check_timestamp' -- the timestamp of the server file will be
      compared to the timestamp of the local file. ONLY if the
      local file is newer than or the same age as the server file
      will reget be used. If the server file is newer, or the
      timestamp is not returned, the entire file will be fetched.

    NOTE: urlgrabber can do very little to verify that the partial
    file on disk is identical to the beginning of the remote file.
    You may want to either employ a custom "checkfunc" or simply avoid
    using reget in situations where corruption is a concern.
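
    For example (a sketch; the urls are hypothetical):

      from urlgrabber.grabber import urlgrab
      # fetch only the first 10 bytes of the file
      urlgrab('http://foo.com/big.iso', 'big.iso', range=(0, 10))
      # resume a partial download, trusting whatever is already on disk
      urlgrab('http://foo.com/big.iso', 'big.iso', reget='simple')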

  user_agent = 'urlgrabber/VERSION'

    a string, usually of the form 'AGENT/VERSION' that is provided to
    HTTP servers in the User-agent header. The module level default
    for this option is "urlgrabber/VERSION".

  http_headers = None

    a tuple of 2-tuples, each containing a header and value. These
    will be used for http and https requests only. For example, you
    can do

      http_headers = (('Pragma', 'no-cache'),)

  ftp_headers = None

    this is just like http_headers, but will be used for ftp requests.

  proxies = None

    a dictionary that maps protocol schemes to proxy hosts. For
    example, to use a proxy server on host "foo" port 3128 for http
    and https URLs:

      proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }

    note that proxy authentication information may be provided using
    normal URL constructs:

      proxies={ 'http' : 'http://user:password@foo:3128' }

    Lastly, if proxies is None, the default environment settings will
    be used.

  prefix = None

    a url prefix that will be prepended to all requested urls. For
    example:

      g = URLGrabber(prefix='http://foo.com/mirror/')
      g.urlgrab('some/file.txt')
      ## this will fetch 'http://foo.com/mirror/some/file.txt'

    This option exists primarily to allow identical behavior to
    MirrorGroup (and derived) instances. Note: a '/' will be inserted
    if necessary, so you cannot specify a prefix that ends with a
    partial file or directory name.

  opener = None

    Overrides the default urllib2.OpenerDirector provided to urllib2
    when making requests. This option exists so that the urllib2
    handler chain may be customized. Note that the range, reget,
    proxy, and keepalive features require that custom handlers be
    provided to urllib2 in order to function properly. If an opener
    option is provided, no attempt is made by urlgrabber to ensure
    chain integrity. You are responsible for ensuring that any
    extension handlers are present if said features are required.

RETRY RELATED ARGUMENTS

  retry = None

    the number of times to retry the grab before bailing. If this is
    zero, it will retry forever. This was intentional... really, it
    was :). If this value is not supplied or is supplied but is None,
    retrying does not occur.

  retrycodes = [-1,2,4,5,6,7]

    a sequence of errorcodes (values of e.errno) for which it should
    retry. See the doc on URLGrabError for more details on
    this. retrycodes defaults to [-1,2,4,5,6,7] if not specified
    otherwise.

  checkfunc = None

    a function to do additional checks. This defaults to None, which
    means no additional checking. The function should simply return
    on a successful check. It should raise URLGrabError on an
    unsuccessful check. Raising of any other exception will be
    considered immediate failure and no retries will occur.

    If it raises URLGrabError, the error code will determine the retry
    behavior. Negative error numbers are reserved for use by these
    passed in functions, so you can use many negative numbers for
    different types of failure. By default, -1 results in a retry,
    but this can be customized with retrycodes.

    If you simply pass in a function, it will be given exactly one
    argument: a CallbackObject instance with the .url attribute
    defined and either .filename (for urlgrab) or .data (for urlread).
    For urlgrab, .filename is the name of the local file. For
    urlread, .data is the actual string data. If you need other
    arguments passed to the callback (program state of some sort), you
    can do so like this:

      checkfunc=(function, ('arg1', 2), {'kwarg': 3})

    if the downloaded file has filename /tmp/stuff, then this will
    result in this call (for urlgrab):

      function(obj, 'arg1', 2, kwarg=3)
      # obj.filename = '/tmp/stuff'
      # obj.url = 'http://foo.com/stuff'

    NOTE: both the "args" tuple and "kwargs" dict must be present if
    you use this syntax, but either (or both) can be empty.
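
    A complete checkfunc might look like this sketch (the minimum-size
    check is purely illustrative):

      def checker(obj, minsize):
          import os
          if os.path.getsize(obj.filename) < minsize:
              raise URLGrabError(-1, 'downloaded file is too small')

      urlgrab('http://foo.com/stuff', checkfunc=(checker, (100,), {}))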

  failure_callback = None

    The callback that gets called during retries when an attempt to
    fetch a file fails. The syntax for specifying the callback is
    identical to checkfunc, except for the attributes defined in the
    CallbackObject instance. In this case, it will have .exception
    and .url defined. As you might suspect, .exception is the
    exception that was raised.

    The callback is present primarily to inform the calling program of
    the failure, but if it raises an exception (including the one it's
    passed) that exception will NOT be caught and will therefore cause
    future retries to be aborted.
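
    For example, a callback that merely reports each failed attempt
    (a sketch; the function name is illustrative):

      def log_failure(obj):
          print 'attempt on %s failed: %s' % (obj.url, obj.exception)

      urlgrab('http://foo.com/stuff', retry=3, failure_callback=log_failure)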

BANDWIDTH THROTTLING

  urlgrabber supports throttling via two values: throttle and
  bandwidth. Between the two, you can either specify an absolute
  throttle threshold or specify a threshold as a fraction of maximum
  available bandwidth.

  throttle is a number - if it's an int, it's the bytes/second
  throttle limit. If it's a float, it is first multiplied by
  bandwidth. If throttle == 0, throttling is disabled. If None, the
  module-level default (which can be set with set_throttle) is used.

  bandwidth is the nominal max bandwidth in bytes/second. If throttle
  is a float and bandwidth == 0, throttling is disabled. If None, the
  module-level default (which can be set with set_bandwidth) is used.

  THROTTLING EXAMPLES:

  Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
  per second, or 12,500,000 Bytes per second. You have a number of
  throttling options:

  *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float

     This will limit urlgrab to use half of your available bandwidth.

  *) set_throttle(6250000) # throttle is an int

     This will also limit urlgrab to use half of your available
     bandwidth, regardless of what bandwidth is set to.

  *) set_bandwidth(6250000); set_throttle(1.0) # float

     Use half your bandwidth

  *) set_bandwidth(6250000); set_throttle(2.0) # float

     Use up to 12,500,000 Bytes per second (your nominal max bandwidth)

  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0

     Disable throttling - this is more efficient than a very large
     throttle setting.

  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0

     Disable throttling - this is the default when the module is loaded.

SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)

  While this is flexible, it's not extremely obvious to the user. I
  suggest you implement a float throttle as a percent to make the
  distinction between absolute and relative throttling very explicit.

  Also, you may want to convert the units to something more convenient
  than bytes/second, such as kbps or kB/s, etc.
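
  For example, wrappers along those lines might look like this (the
  names are illustrative, not part of urlgrabber):

    def set_throttle_kBps(kBps):
        set_throttle(int(kBps * 1024))    # absolute limit in kB/s

    def set_throttle_percent(percent):
        set_throttle(percent / 100.0)     # fraction of set_bandwidth()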

"""

# $Id: grabber.py,v 1.39 2005/03/03 00:54:23 mstenner Exp $

import os
import os.path
import sys
import urlparse
import rfc822
import time
import string
import urllib
import urllib2
from stat import *  # S_* and ST_*

DEBUG = 0

try:
    exec('from ' + (__name__.split('.'))[0] + ' import __version__')
except:
    __version__ = '???'

auth_handler = urllib2.HTTPBasicAuthHandler( \
        urllib2.HTTPPasswordMgrWithDefaultRealm())

try:
    from i18n import _
except ImportError, msg:
    def _(st): return st

try:
    from httplib import HTTPException
except ImportError, msg:
    HTTPException = None

try:
    # This is a convenient way to make keepalive optional.
    # Just rename the module so it can't be imported.
    from keepalive import HTTPHandler
except ImportError, msg:
    keepalive_handler = None
else:
    keepalive_handler = HTTPHandler()

try:
    # add in range support conditionally too
    from urlgrabber.byterange import HTTPRangeHandler, FileRangeHandler, \
         FTPRangeHandler, range_tuple_normalize, range_tuple_to_header, \
         RangeError
except ImportError, msg:
    range_handlers = None
    RangeError = None
    have_range = 0
else:
    range_handlers = (HTTPRangeHandler(), FileRangeHandler(), FTPRangeHandler())
    have_range = 1

# check whether socket timeout support is available (Python >= 2.3)
import socket
try:
    TimeoutError = socket.timeout
    have_socket_timeout = True
except AttributeError:
    TimeoutError = None
    have_socket_timeout = False

class URLGrabError(IOError):
    """
    URLGrabError error codes:

      URLGrabber error codes (0 -- 255)
        0    - everything looks good (you should never see this)
        1    - malformed url
        2    - local file doesn't exist
        3    - request for non-file local file (dir, etc)
        4    - IOError on fetch
        5    - OSError on fetch
        6    - no content length header when we expected one
        7    - HTTPException
        8    - Exceeded read limit (for urlread)
        9    - Requested byte range not satisfiable.
        10   - Byte range requested, but range support unavailable
        11   - Illegal reget mode
        12   - Socket timeout

      MirrorGroup error codes (256 -- 511)
        256  - No more mirrors left to try

      Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
        [ this range reserved for application-specific error codes ]

      Retry codes (< 0)
        -1   - retry the download, unknown reason

    Note: to test which group a code is in, you can simply do integer
    division by 256: e.errno / 256

    Negative codes are reserved for use by functions passed in to
    retrygrab with checkfunc. The value -1 is built in as a generic
    retry code and is already included in the retrycodes list.
    Therefore, you can create a custom check function that simply
    returns -1 and the fetch will be re-tried. For more customized
    retries, you can use other negative numbers and include them in
    retrycodes. This is nice for outputting useful messages about
    what failed.

    You can use these error codes like so:
      try: urlgrab(url)
      except URLGrabError, e:
          print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """

class CallbackObject:
    """Container for returned callback data.

    This is currently a dummy class into which urlgrabber can stuff
    information for passing to callbacks. This way, the prototype for
    all callbacks is the same, regardless of the data that will be
    passed back. Any function that accepts a callback function as an
    argument SHOULD document what it will define in this object.

    It is possible that this class will have some greater
    functionality in the future.
    """

def close_all():
    """close any open keepalive connections"""
    if keepalive_handler: keepalive_handler.close_all()

def urlgrab(url, filename=None, **kwargs):
    """grab the file at <url> and make a local copy at <filename>
    If filename is None, the basename of the url is used.
    urlgrab returns the filename of the local file, which may be different
    from the passed-in filename if the copy_local kwarg == 0.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlgrab(url, filename, **kwargs)

def urlopen(url, **kwargs):
    """open the url and return a file object
    If a progress object or throttle specifications exist, then
    a special file object will be returned that supports them.
    The file object can be treated like any other file object.

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlopen(url, **kwargs)

def urlread(url, limit=None, **kwargs):
    """read the url into a string, up to 'limit' bytes
    If the limit is exceeded, an exception will be thrown. Note that urlread
    is NOT intended to be used as a way of saying "I want the first N bytes"
    but rather 'read the whole file into memory, but don't use too much'

    See module documentation for a description of possible kwargs.
    """
    return default_grabber.urlread(url, limit, **kwargs)
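
# Example usage of the three module-level convenience functions above
# (a sketch; the url is hypothetical):
#
#   from urlgrabber.grabber import urlgrab, urlopen, urlread
#   local = urlgrab('http://example.com/file.txt')
#   data = urlread('http://example.com/file.txt', limit=65536)
#   fo = urlopen('http://example.com/file.txt')
#   try: print fo.read(100)
#   finally: fo.close()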

class URLGrabberOptions:
    """Class to ease kwargs handling."""

    def __init__(self, delegate=None, **kwargs):
        """Initialize URLGrabberOptions object.
        Set default values for all options and then update options specified
        in kwargs.
        """
        self.delegate = delegate
        if delegate is None:
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        if self.delegate and hasattr(self.delegate, name):
            return getattr(self.delegate, name)
        raise AttributeError, name

    def raw_throttle(self):
        """Calculate raw throttle value from throttle and bandwidth
        values.
        """
        if self.throttle <= 0:
            return 0
        elif type(self.throttle) == type(0):
            return float(self.throttle)
        else: # throttle is a float
            return self.bandwidth * self.throttle
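
    # For example (illustrative numbers): with bandwidth=12500000 and
    # throttle=0.5 (a float), raw_throttle() is 6250000.0 bytes/second;
    # with throttle=6250000 (an int), the limit is 6250000.0 regardless
    # of what bandwidth is set to.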

    def derive(self, **kwargs):
        """Create a derived URLGrabberOptions instance.
        This method creates a new instance and overrides the
        options specified in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)

    def _set_attributes(self, **kwargs):
        """Update object attributes with those provided in kwargs."""
        self.__dict__.update(kwargs)
        if have_range and kwargs.has_key('range'):
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if not self.reget in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11, _('Illegal reget mode: %s') \
                               % (self.reget, ))

    def _set_defaults(self):
        """Set all options to their default values.
        When adding new options, make sure a default is
        provided here.
        """
        self.progress_obj = None
        self.throttle = 1.0
        self.bandwidth = 0
        self.retry = None
        self.retrycodes = [-1,2,4,5,6,7]
        self.checkfunc = None
        self.copy_local = 0
        self.close_connection = 0
        self.range = None
        self.user_agent = 'urlgrabber/%s' % __version__
        self.keepalive = 1
        self.proxies = None
        self.reget = None
        self.failure_callback = None
        self.prefix = None
        self.opener = None
        self.cache_openers = True
        self.timeout = None
        self.text = None
        self.http_headers = None
        self.ftp_headers = None

class URLGrabber:
    """Provides easy opening of URLs with a variety of options.

    All options are specified as kwargs. Options may be specified when
    the class is created and may be overridden on a per request basis.

    New objects inherit default values from default_grabber.
    """

    def __init__(self, **kwargs):
        self.opts = URLGrabberOptions(**kwargs)

    def _retry(self, opts, func, *args):
        tries = 0
        while 1:
            tries = tries + 1
            try:
                return apply(func, (opts,) + args, {})
            except URLGrabError, e:
                if DEBUG: print 'EXCEPTION: %s' % e
                if (opts.retry is None) \
                       or (tries == opts.retry) \
                       or (e.errno not in opts.retrycodes): raise
                if opts.failure_callback:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.failure_callback)
                    # this is a little icky - for now, the first element
                    # of args is the url. we might consider a way to tidy
                    # that up someday
                    obj = CallbackObject()
                    obj.exception = e
                    obj.url = args[0]
                    cb_func(obj, *cb_args, **cb_kwargs)

    def urlopen(self, url, **kwargs):
        """open the url and return a file object
        If a progress object or throttle value was specified when this
        object was created, then a special file object will be
        returned that supports them. The file object can be treated
        like any other file object.
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url)
        def retryfunc(opts, url):
            return URLGrabberFileObject(url, filename=None, opts=opts)
        return self._retry(opts, retryfunc, url)

    def urlgrab(self, url, filename=None, **kwargs):
        """grab the file at <url> and make a local copy at <filename>
        If filename is None, the basename of the url is used.
        urlgrab returns the filename of the local file, which may be
        different from the passed-in filename if copy_local == 0.
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url)
        (scheme, host, path, parm, query, frag) = parts
        if filename is None:
            if scheme in [ 'http', 'https' ]:
                filename = os.path.basename( urllib.unquote(path) )
            else:
                filename = os.path.basename( path )
        if scheme == 'file' and not opts.copy_local:
            # just return the name of the local file - don't make a
            # copy of it
            if not os.path.exists(path):
                raise URLGrabError(2,
                      _('Local file does not exist: %s') % (path, ))
            elif not os.path.isfile(path):
                raise URLGrabError(3,
                      _('Not a normal file: %s') % (path, ))
            return path

        def retryfunc(opts, url, filename):
            fo = URLGrabberFileObject(url, filename, opts)
            try:
                fo._do_grab()
                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.filename = filename
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return filename

        return self._retry(opts, retryfunc, url, filename)

    def urlread(self, url, limit=None, **kwargs):
        """read the url into a string, up to 'limit' bytes
        If the limit is exceeded, an exception will be thrown. Note
        that urlread is NOT intended to be used as a way of saying
        "I want the first N bytes" but rather 'read the whole file
        into memory, but don't use too much'
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url)
        if limit is not None:
            # read one extra byte so that exceeding the limit is detectable
            limit = limit + 1

        def retryfunc(opts, url, limit):
            fo = URLGrabberFileObject(url, filename=None, opts=opts)
            s = ''
            try:
                # this is an unfortunate thing. Some file-like objects
                # have a default "limit" of None, while the built-in (real)
                # file objects have -1. They each break the other, so for
                # now, we just force the default if necessary.
                if limit is None: s = fo.read()
                else: s = fo.read(limit)

                if not opts.checkfunc is None:
                    cb_func, cb_args, cb_kwargs = \
                             self._make_callback(opts.checkfunc)
                    obj = CallbackObject()
                    obj.data = s
                    obj.url = url
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
            finally:
                fo.close()
            return s

        s = self._retry(opts, retryfunc, url, limit)
        if limit and len(s) > limit - 1:
            raise URLGrabError(8,
                  _('Exceeded limit (%i): %s') % (limit - 1, url))
        return s

    def _parse_url(self, url):
        """break up the url into its component parts

        This function disassembles a url and
        1) "normalizes" it, tidying it up a bit
        2) does any authentication stuff it needs to do

        it returns the (cleaned) url and a tuple of component parts
        """
        if self.opts.prefix:
            p = self.opts.prefix
            if p[-1] == '/' or url[0] == '/': url = p + url
            else: url = p + '/' + url

        (scheme, host, path, parm, query, frag) = \
                                             urlparse.urlparse(url)
        if not scheme:
            if not url[0] == '/': url = os.path.abspath(url)
            url = 'file:' + url
            (scheme, host, path, parm, query, frag) = \
                                                 urlparse.urlparse(url)
        path = os.path.normpath(path)
        if scheme in ['http', 'https']: path = urllib.quote(path)
        if '@' in host and auth_handler and scheme in ['http', 'https']:
            try:
                user_pass, host = host.split('@', 1)
                if ':' in user_pass: user, password = user_pass.split(':', 1)
            except ValueError, e:
                raise URLGrabError(1, _('Bad URL: %s') % url)
            if DEBUG: print 'adding HTTP auth: %s, %s' % (user, password)
            auth_handler.add_password(None, host, user, password)
        parts = (scheme, host, path, parm, query, frag)
        url = urlparse.urlunparse(parts)
        return url, parts

    def _make_callback(self, callback_obj):
        if callable(callback_obj):
            return callback_obj, (), {}
        else:
            return callback_obj[0], callback_obj[1], callback_obj[2]

# create the default URLGrabber used by urlXXX functions.
# NOTE: actual defaults are set in URLGrabberOptions
default_grabber = URLGrabber()
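
# A URLGrabber instance carries its own option defaults; kwargs on an
# individual call override them for that request only. Sketch (the url
# and filenames are hypothetical):
#
#   g = URLGrabber(retry=3, throttle=0.5, bandwidth=12500000)
#   g.urlgrab('http://example.com/big.iso', '/tmp/big.iso', retry=5)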

class URLGrabberFileObject:
    """This is a file-object wrapper that supports progress objects
    and throttling.

    This exists to solve the following problem: lets say you want to
    drop-in replace a normal open with urlopen. You want to use a
    progress meter and/or throttling, but how do you do that without
    rewriting your code? Answer: urlopen will return a wrapped file
    object that does the progress meter and-or throttling internally.
    """

    def __init__(self, url, filename, opts):
        self.url = url
        self.filename = filename
        self.opts = opts
        self.fo = None
        self._rbuf = ''
        self._rbufsize = 1024*8
        self._ttime = time.time()
        self._tsize = 0
        self._amount_read = 0
        self._opener = None
        self._do_open()

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo. This includes methods."""
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError, name

    def _get_opener(self):
        """Build a urllib2 OpenerDirector based on request options."""
        if self.opts.opener:
            return self.opts.opener
        elif self._opener is None:
            handlers = []
            need_keepalive_handler = (keepalive_handler and self.opts.keepalive)
            need_range_handler = (range_handlers and \
                                  (self.opts.range or self.opts.reget))
            # if you specify a ProxyHandler when creating the opener
            # it _must_ come before all other handlers in the list or urllib2
            # will not use it properly
            if self.opts.proxies:
                handlers.append( CachedProxyHandler(self.opts.proxies) )

                # -------------------------------------------------------
                # OK, these next few lines are a serious kludge to get
                # around what I think is a bug in python 2.2's
                # urllib2. The basic idea is that default handlers
                # get applied first. If you override one (like a
                # proxy handler), then the default gets pulled, but
                # the replacement goes on the end. In the case of
                # proxies, this means the normal handler picks it up
                # first and the proxy isn't used. Now, this probably
                # only happened with ftp or non-keepalive http, so not
                # many folks saw it. The simple approach to fixing it
                # is just to make sure you override the other
                # conflicting defaults as well. I would LOVE to see
                # these go away or be dealt with more elegantly. The
                # problem isn't there after 2.2. -MDS 2005/02/24
                if not need_keepalive_handler:
                    handlers.append( urllib2.HTTPHandler() )
                if not need_range_handler:
                    handlers.append( urllib2.FTPHandler() )
                # -------------------------------------------------------

            if need_keepalive_handler:
                handlers.append( keepalive_handler )
            if need_range_handler:
                handlers.extend( range_handlers )
            handlers.append( auth_handler )
            if self.opts.cache_openers:
                self._opener = CachedOpenerDirector(*handlers)
            else:
                self._opener = urllib2.build_opener(*handlers)
            # OK, I don't like to do this, but otherwise, we end up with
            # TWO user-agent headers.
            self._opener.addheaders = []
        return self._opener

    def _do_open(self):
        opener = self._get_opener()

        req = urllib2.Request(self.url) # build request object
        self._add_headers(req) # add misc headers that we need
        self._build_range(req) # take care of reget and byterange stuff

        fo, hdr = self._make_request(req, opener)
        if self.reget_time and self.opts.reget == 'check_timestamp':
            # do this if we have a local file with known timestamp AND
            # we're in check_timestamp reget mode.
            fetch_again = 0
            try:
                modified_tuple = hdr.getdate_tz('last-modified')
                modified_stamp = rfc822.mktime_tz(modified_tuple)
                if modified_stamp > self.reget_time: fetch_again = 1
            except (TypeError,):
                fetch_again = 1

            if fetch_again:
                # the server version is newer than the (incomplete) local
                # version, so we should abandon the version we're getting
                # and fetch the whole thing again.
                fo.close()
                self.opts.reget = None
                del req.headers['Range']
                self._build_range(req)
                fo, hdr = self._make_request(req, opener)

        (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
        if not (self.opts.progress_obj or self.opts.raw_throttle() \
                or self.opts.timeout):
            # if we're not using the progress_obj, throttling, or timeout
            # we can get a performance boost by going directly to
            # the underlying fileobject for reads.
            self.read = fo.read
            if hasattr(fo, 'readline'):
                self.readline = fo.readline
        elif self.opts.progress_obj:
            try:    length = int(hdr['Content-Length'])
            except: length = None
            self.opts.progress_obj.start(str(self.filename), self.url,
                                         os.path.basename(path),
                                         length, text=self.opts.text)
            self.opts.progress_obj.update(0)
        (self.fo, self.hdr) = (fo, hdr)

    def _add_headers(self, req):
        if self.opts.user_agent:
            req.add_header('User-agent', self.opts.user_agent)
        try: req_type = req.get_type()
        except ValueError: req_type = None
        if self.opts.http_headers and req_type in ('http', 'https'):
            for h, v in self.opts.http_headers:
                req.add_header(h, v)
        if self.opts.ftp_headers and req_type == 'ftp':
            for h, v in self.opts.ftp_headers:
                req.add_header(h, v)

    def _build_range(self, req):
        self.reget_time = None
        self.append = 0
        reget_length = 0
        rt = None
        if have_range and self.opts.reget and type(self.filename) == type(''):
            # we have reget turned on and we're dumping to a file
            try:
                s = os.stat(self.filename)
            except OSError:
                pass
            else:
                self.reget_time = s[ST_MTIME]
                reget_length = s[ST_SIZE]
                rt = reget_length, ''
                self.append = 1

        if self.opts.range:
            if not have_range:
                raise URLGrabError(10, _('Byte range requested but range '\
                                         'support unavailable'))
            rt = self.opts.range
            if rt[0]: rt = (rt[0] + reget_length, rt[1])

        if rt:
            header = range_tuple_to_header(rt)
            if header: req.add_header('Range', header)

    def _make_request(self, req, opener):
        try:
            if have_socket_timeout and self.opts.timeout:
                old_to = socket.getdefaulttimeout()
                socket.setdefaulttimeout(self.opts.timeout)
                try:
                    fo = opener.open(req)
                finally:
                    socket.setdefaulttimeout(old_to)
            else:
                fo = opener.open(req)
            hdr = fo.info()
        except ValueError, e:
            raise URLGrabError(1, _('Bad URL: %s') % (e, ))
        except RangeError, e:
            raise URLGrabError(9, _('%s') % (e, ))
        except IOError, e:
            if hasattr(e, 'reason') and have_socket_timeout and \
               isinstance(e.reason, TimeoutError):
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            else:
                raise URLGrabError(4, _('IOError: %s') % (e, ))
        except OSError, e:
            raise URLGrabError(5, _('OSError: %s') % (e, ))
        except HTTPException, e:
            raise URLGrabError(7, _('HTTP Error (%s): %s') % \
                               (e.__class__.__name__, e))
        else:
            return (fo, hdr)

    def _do_grab(self):
        """dump the file to self.filename."""
        if self.append: new_fo = open(self.filename, 'ab')
        else: new_fo = open(self.filename, 'wb')
        bs = 1024*8
        size = 0

        block = self.read(bs)
        size = size + len(block)
        while block:
            new_fo.write(block)
            block = self.read(bs)
            size = size + len(block)

        new_fo.close()
        try:
            modified_tuple = self.hdr.getdate_tz('last-modified')
            modified_stamp = rfc822.mktime_tz(modified_tuple)
            os.utime(self.filename, (modified_stamp, modified_stamp))
        except (TypeError,), e: pass

        return size

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object. If amt is None, then it will
        read until it gets nothing more. It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed. It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        buf = [self._rbuf]
        bufsize = len(self._rbuf)
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.opts.raw_throttle():
                diff = self._tsize/self.opts.raw_throttle() - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else:           readamount = min(amt, self._rbufsize)
            try:
                new = self.fo.read(readamount)
            except socket.error, e:
                raise URLGrabError(4, _('Socket Error: %s') % (e, ))
            except TimeoutError, e:
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            bufsize = bufsize + newsize
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            if self.opts.progress_obj:
                self.opts.progress_obj.update(self._amount_read)

        self._rbuf = string.join(buf, '')

    def read(self, amt=None):
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        i = string.find(self._rbuf, '\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            L = len(self._rbuf)
            self._fill_buffer(L + self._rbufsize)
            if not len(self._rbuf) > L: break
            i = string.find(self._rbuf, '\n', L)

        if i < 0: i = len(self._rbuf)
        else: i = i + 1
        if 0 <= limit < len(self._rbuf): i = limit

        s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return s

    def close(self):
        if self.opts.progress_obj:
            self.opts.progress_obj.end(self._amount_read)
        self.fo.close()
        if self.opts.close_connection:
            try: self.fo.close_connection()
            except: pass

_handler_cache = []
def CachedOpenerDirector(*handlers):
    for (cached_handlers, opener) in _handler_cache:
        if cached_handlers == handlers:
            for handler in opener.handlers:
                handler.add_parent(opener)
            return opener
    opener = urllib2.build_opener(*handlers)
    _handler_cache.append( (handlers, opener) )
    return opener

_proxy_cache = []
def CachedProxyHandler(proxies):
    for (pdict, handler) in _proxy_cache:
        if pdict == proxies:
            break
    else:
        handler = urllib2.ProxyHandler(proxies)
        _proxy_cache.append( (proxies, handler) )
    return handler
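
# Note the caching behavior: calling CachedProxyHandler twice with equal
# proxy dicts returns the same handler instance rather than building a
# new one, e.g.:
#   h1 = CachedProxyHandler({'http': 'http://foo:3128'})
#   h2 = CachedProxyHandler({'http': 'http://foo:3128'})
#   # h1 is h2 -> True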

#####################################################################
#  DEPRECATED FUNCTIONS
def set_throttle(new_throttle):
    """Deprecated. Use: default_grabber.throttle = new_throttle"""
    default_grabber.throttle = new_throttle

def set_bandwidth(new_bandwidth):
    """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
    default_grabber.bandwidth = new_bandwidth

def set_progress_obj(new_progress_obj):
    """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
    default_grabber.progress_obj = new_progress_obj

def set_user_agent(new_user_agent):
    """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
    default_grabber.user_agent = new_user_agent

def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead"""
    kwargs = {'copy_local' : copy_local,
              'close_connection' : close_connection,
              'progress_obj' : progress_obj,
              'throttle' : throttle,
              'bandwidth' : bandwidth,
              'retry' : numtries,
              'retrycodes' : retrycodes,
              'checkfunc' : checkfunc
              }
    return urlgrab(url, filename, **kwargs)
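
# For example, the deprecated call
#   retrygrab(url, filename, numtries=3)
# is equivalent to the current form
#   urlgrab(url, filename, retry=3)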

#####################################################################
#  TESTING
def _main_test():
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    set_throttle(1.0)
    set_bandwidth(32 * 1024)
    print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
                                                        default_grabber.bandwidth)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    try: name = apply(urlgrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name

def _retry_test():
    try: url, filename = sys.argv[1:3]
    except ValueError:
        print 'usage:', sys.argv[0], \
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
        sys.exit()

    kwargs = {}
    for a in sys.argv[3:]:
        k, v = string.split(a, '=', 1)
        kwargs[k] = int(v)

    try: from progress import text_progress_meter
    except ImportError, e: pass
    else: kwargs['progress_obj'] = text_progress_meter()

    def cfunc(filename, hello, there='foo'):
        print hello, there
        import random
        rnum = random.random()
        if rnum < .5:
            print 'forcing retry'
            raise URLGrabError(-1, 'forcing retry')
        if rnum < .75:
            print 'forcing failure'
            raise URLGrabError(-2, 'forcing immediate failure')
        print 'success'

    kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
    try: name = apply(retrygrab, (url, filename), kwargs)
    except URLGrabError, e: print e
    else: print 'LOCAL FILE:', name

def _file_object_test(filename=None):
    import random, cStringIO, sys
    if filename is None:
        filename = __file__
    print 'using file "%s" for comparisons' % filename
    fo = open(filename)
    s_input = fo.read()
    fo.close()

    for testfunc in [_test_file_object_smallread,
                     _test_file_object_readall,
                     _test_file_object_readline,
                     _test_file_object_readlines]:
        fo_input = cStringIO.StringIO(s_input)
        fo_output = cStringIO.StringIO()
        wrapper = URLGrabberFileObject(fo_input, None, 0)
        print 'testing %-30s ' % testfunc.__name__,
        testfunc(wrapper, fo_output)
        s_output = fo_output.getvalue()
        if s_output == s_input: print 'passed'
        else: print 'FAILED'

def _test_file_object_smallread(wrapper, fo_output):
    while 1:
        s = wrapper.read(23)
        fo_output.write(s)
        if not s: return

def _test_file_object_readall(wrapper, fo_output):
    s = wrapper.read()
    fo_output.write(s)

def _test_file_object_readline(wrapper, fo_output):
    while 1:
        s = wrapper.readline()
        fo_output.write(s)
        if not s: return

def _test_file_object_readlines(wrapper, fo_output):
    li = wrapper.readlines()
    fo_output.write(string.join(li, ''))

if __name__ == '__main__':
    _main_test()
    _retry_test()
    _file_object_test('test')