~bzr-pqm/bzr/bzr.dev

195 by mbp at sourcefrog
- import lovely urlgrabber library
1
#   This library is free software; you can redistribute it and/or
2
#   modify it under the terms of the GNU Lesser General Public
3
#   License as published by the Free Software Foundation; either
4
#   version 2.1 of the License, or (at your option) any later version.
5
#
6
#   This library is distributed in the hope that it will be useful,
7
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
8
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9
#   Lesser General Public License for more details.
10
#
11
#   You should have received a copy of the GNU Lesser General Public
12
#   License along with this library; if not, write to the 
13
#      Free Software Foundation, Inc., 
14
#      59 Temple Place, Suite 330, 
15
#      Boston, MA  02111-1307  USA
16
17
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
19
20
"""A high-level cross-protocol url-grabber.
21
22
GENERAL ARGUMENTS (kwargs)
23
24
  Where possible, the module-level default is indicated, and legal
25
  values are provided.
26
27
  copy_local = 0   [0|1]
28
29
    ignored except for file:// urls, in which case it specifies
30
    whether urlgrab should still make a copy of the file, or simply
31
    point to the existing copy. The module level default for this
32
    option is 0.
33
34
  close_connection = 0   [0|1]
35
36
    tells URLGrabber to close the connection after a file has been
37
    transferred. This is ignored unless the download happens with the
38
    http keepalive handler (keepalive=1).  Otherwise, the connection
39
    is left open for further use. The module level default for this
40
    option is 0 (keepalive connections will not be closed).
41
42
  keepalive = 1   [0|1]
43
44
    specifies whether keepalive should be used for HTTP/1.1 servers
45
    that support it. The module level default for this option is 1
46
    (keepalive is enabled).
47
48
  progress_obj = None
49
50
    a class instance that supports the following methods:
51
      po.start(filename, url, basename, length, text)
52
      # length will be None if unknown
53
      po.update(read) # read == bytes read so far
54
      po.end()
55
56
  text = None
57
  
58
    specifies an alternative text item at the beginning of the progress
59
    bar line. If not given, the basename of the file is used.
60
61
  throttle = 1.0
62
63
    a number - if it's an int, it's the bytes/second throttle limit.
64
    If it's a float, it is first multiplied by bandwidth.  If throttle
65
    == 0, throttling is disabled.  If None, the module-level default
66
    (which can be set on default_grabber.throttle) is used. See
67
    BANDWIDTH THROTTLING for more information.
68
69
  timeout = None
70
71
    a positive float expressing the number of seconds to wait for socket
72
    operations. If the value is None or 0.0, socket operations will block
73
    forever. Setting this option causes urlgrabber to call the settimeout
74
    method on the Socket object used for the request. See the Python
75
    documentation on settimeout for more information.
76
    http://www.python.org/doc/current/lib/socket-objects.html
77
78
  bandwidth = 0
79
80
    the nominal max bandwidth in bytes/second.  If throttle is a float
81
    and bandwidth == 0, throttling is disabled.  If None, the
82
    module-level default (which can be set on
83
    default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
84
    more information.
85
86
  range = None
87
88
    a tuple of the form (first_byte, last_byte) describing a byte
89
    range to retrieve. Either or both of the values may set to
90
    None. If first_byte is None, byte offset 0 is assumed. If
91
    last_byte is None, the last byte available is assumed. Note that
92
    the range specification is python-like in that (0,10) will yield
93
    the first 10 bytes of the file.
94
95
    If set to None, no range will be used.
96
    
97
  reget = None   [None|'simple'|'check_timestamp']
98
99
    whether to attempt to reget a partially-downloaded file.  Reget
100
    only applies to .urlgrab and (obviously) only if there is a
101
    partially downloaded file.  Reget has two modes:
102
103
      'simple' -- the local file will always be trusted.  If there
104
        are 100 bytes in the local file, then the download will always
105
        begin 100 bytes into the requested file.
106
107
      'check_timestamp' -- the timestamp of the server file will be
108
        compared to the timestamp of the local file.  ONLY if the
109
        local file is newer than or the same age as the server file
110
        will reget be used.  If the server file is newer, or the
111
        timestamp is not returned, the entire file will be fetched.
112
113
    NOTE: urlgrabber can do very little to verify that the partial
114
    file on disk is identical to the beginning of the remote file.
115
    You may want to either employ a custom "checkfunc" or simply avoid
116
    using reget in situations where corruption is a concern.
117
118
  user_agent = 'urlgrabber/VERSION'
119
120
    a string, usually of the form 'AGENT/VERSION' that is provided to
121
    HTTP servers in the User-agent header. The module level default
122
    for this option is "urlgrabber/VERSION".
123
124
  http_headers = None
125
126
    a tuple of 2-tuples, each containing a header and value.  These
127
    will be used for http and https requests only.  For example, you
128
    can do
129
      http_headers = (('Pragma', 'no-cache'),)
130
131
  ftp_headers = None
132
133
    this is just like http_headers, but will be used for ftp requests.
134
135
  proxies = None
136
137
    a dictionary that maps protocol schemes to proxy hosts. For
138
    example, to use a proxy server on host "foo" port 3128 for http
139
    and https URLs:
140
      proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
141
    note that proxy authentication information may be provided using
142
    normal URL constructs:
143
      proxies={ 'http' : 'http://user:host@foo:3128' }
144
    Lastly, if proxies is None, the default environment settings will
145
    be used.
146
147
  prefix = None
148
149
    a url prefix that will be prepended to all requested urls.  For
150
    example:
151
      g = URLGrabber(prefix='http://foo.com/mirror/')
152
      g.urlgrab('some/file.txt')
153
      ## this will fetch 'http://foo.com/mirror/some/file.txt'
154
    This option exists primarily to allow identical behavior to
155
    MirrorGroup (and derived) instances.  Note: a '/' will be inserted
156
    if necessary, so you cannot specify a prefix that ends with a
157
    partial file or directory name.
158
159
  opener = None
160
  
161
    Overrides the default urllib2.OpenerDirector provided to urllib2
162
    when making requests.  This option exists so that the urllib2
163
    handler chain may be customized.  Note that the range, reget,
164
    proxy, and keepalive features require that custom handlers be
165
    provided to urllib2 in order to function properly.  If an opener
166
    option is provided, no attempt is made by urlgrabber to ensure
167
    chain integrity.  You are responsible for ensuring that any
168
    extension handlers are present if said features are required.
169
    
170
RETRY RELATED ARGUMENTS
171
172
  retry = None
173
174
    the number of times to retry the grab before bailing.  If this is
175
    zero, it will retry forever. This was intentional... really, it
176
    was :). If this value is not supplied or is supplied but is None
177
    retrying does not occur.
178
179
  retrycodes = [-1,2,4,5,6,7]
180
181
    a sequence of errorcodes (values of e.errno) for which it should
182
    retry. See the doc on URLGrabError for more details on
183
    this. retrycodes defaults to [-1,2,4,5,6,7] if not specified
184
    explicitly.
185
186
  checkfunc = None
187
188
    a function to do additional checks. This defaults to None, which
189
    means no additional checking.  The function should simply return
190
    on a successful check.  It should raise URLGrabError on an
191
    unsuccessful check.  Raising of any other exception will be
192
    considered immediate failure and no retries will occur.
193
194
    If it raises URLGrabError, the error code will determine the retry
195
    behavior.  Negative error numbers are reserved for use by these
196
    passed in functions, so you can use many negative numbers for
197
    different types of failure.  By default, -1 results in a retry,
198
    but this can be customized with retrycodes.
199
200
    If you simply pass in a function, it will be given exactly one
201
    argument: a CallbackObject instance with the .url attribute
202
    defined and either .filename (for urlgrab) or .data (for urlread).
203
    For urlgrab, .filename is the name of the local file.  For
204
    urlread, .data is the actual string data.  If you need other
205
    arguments passed to the callback (program state of some sort), you
206
    can do so like this:
207
208
      checkfunc=(function, ('arg1', 2), {'kwarg': 3})
209
210
    if the downloaded file has filename /tmp/stuff, then this will
211
    result in this call (for urlgrab):
212
213
      function(obj, 'arg1', 2, kwarg=3)
214
      # obj.filename = '/tmp/stuff'
215
      # obj.url = 'http://foo.com/stuff'
216
      
217
    NOTE: both the "args" tuple and "kwargs" dict must be present if
218
    you use this syntax, but either (or both) can be empty.
219
220
  failure_callback = None
221
222
    The callback that gets called during retries when an attempt to
223
    fetch a file fails.  The syntax for specifying the callback is
224
    identical to checkfunc, except for the attributes defined in the
225
    CallbackObject instance.  In this case, it will have .exception
226
    and .url defined.  As you might suspect, .exception is the
227
    exception that was raised.
228
229
    The callback is present primarily to inform the calling program of
230
    the failure, but if it raises an exception (including the one it's
231
    passed) that exception will NOT be caught and will therefore cause
232
    future retries to be aborted.
233
234
BANDWIDTH THROTTLING
235
236
  urlgrabber supports throttling via two values: throttle and
237
  bandwidth. Between the two, you can either specify an absolute
238
  throttle threshold or specify a threshold as a fraction of maximum
239
  available bandwidth.
240
241
  throttle is a number - if it's an int, it's the bytes/second
242
  throttle limit.  If it's a float, it is first multiplied by
243
  bandwidth.  If throttle == 0, throttling is disabled.  If None, the
244
  module-level default (which can be set with set_throttle) is used.
245
246
  bandwidth is the nominal max bandwidth in bytes/second.  If throttle
247
  is a float and bandwidth == 0, throttling is disabled.  If None, the
248
  module-level default (which can be set with set_bandwidth) is used.
249
250
  THROTTLING EXAMPLES:
251
252
  Lets say you have a 100 Mbps connection.  This is (about) 10^8 bits
253
  per second, or 12,500,000 Bytes per second.  You have a number of
254
  throttling options:
255
256
  *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
257
258
     This will limit urlgrab to use half of your available bandwidth.
259
260
  *) set_throttle(6250000) # throttle is an int
261
262
     This will also limit urlgrab to use half of your available
263
     bandwidth, regardless of what bandwidth is set to.
264
265
  *) set_bandwidth(6250000); set_throttle(1.0) # float
266
267
     Use half your bandwidth
268
269
  *) set_bandwidth(6250000); set_throttle(2.0) # float
270
271
    Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
272
273
  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
274
275
     Disable throttling - this is more efficient than a very large
276
     throttle setting.
277
278
  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
279
280
     Disable throttling - this is the default when the module is loaded.
281
282
  SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
283
284
  While this is flexible, it's not extremely obvious to the user.  I
285
  suggest you implement a float throttle as a percent to make the
286
  distinction between absolute and relative throttling very explicit.
287
288
  Also, you may want to convert the units to something more convenient
289
  than bytes/second, such as kbps or kB/s, etc.
290
291
"""
292
293
# $Id: grabber.py,v 1.39 2005/03/03 00:54:23 mstenner Exp $
294
295
import os
296
import os.path
297
import urlparse
298
import rfc822
299
import time
300
import string
301
import urllib
302
import urllib2
303
from stat import *  # S_* and ST_*
304
305
try:
306
    exec('from ' + (__name__.split('.'))[0] + ' import __version__')
307
except:
308
    __version__ = '???'
309
310
# Shared HTTP basic-auth handler.  URLGrabber._parse_url() feeds it the
# user:password credentials it finds embedded in urls.
auth_handler = urllib2.HTTPBasicAuthHandler( \
     urllib2.HTTPPasswordMgrWithDefaultRealm())

# Set to a true value to get debugging prints from the retry and
# url-auth code paths.
DEBUG=0
314
315
try:
316
    from i18n import _
317
except ImportError, msg:
318
    def _(st): return st
319
320
# HTTPException exists from Python 2.0 on; on interpreters without it,
# error code 7 (HTTPException during fetch) simply can never be raised.
try:
    from httplib import HTTPException
except ImportError:
    HTTPException = None
324
325
try:
    # This is a convenient way to make keepalive optional.
    # Just rename the module so it can't be imported.
    from keepalive import HTTPHandler
except ImportError:
    # no keepalive module: the keepalive option is silently ignored
    keepalive_handler = None
else:
    keepalive_handler = HTTPHandler()
333
334
try:
    # add in range support conditionally too
    from urlgrabber.byterange import HTTPRangeHandler, FileRangeHandler, \
         FTPRangeHandler, range_tuple_normalize, range_tuple_to_header, \
         RangeError
except ImportError:
    # byterange module unavailable: range and reget features degrade
    range_handlers = ()
    RangeError = None
    have_range = 0
else:
    range_handlers = (HTTPRangeHandler(), FileRangeHandler(), FTPRangeHandler())
    have_range = 1
346
347
348
# check whether socket timeout support is available (Python >= 2.3)
import socket

# On pre-2.3 interpreters socket has no "timeout" attribute, in which
# case the timeout option is simply unavailable.
TimeoutError = getattr(socket, 'timeout', None)
have_socket_timeout = TimeoutError is not None
356
357
class URLGrabError(IOError):
    """Exception raised for every failure urlgrabber knows how to report.

    Error codes are partitioned by where they come from:

      URLGrabber error codes (0 -- 255)
        0    - everything looks good (you should never see this)
        1    - malformed url
        2    - local file doesn't exist
        3    - request for non-file local file (dir, etc)
        4    - IOError on fetch
        5    - OSError on fetch
        6    - no content length header when we expected one
        7    - HTTPException
        8    - Exceeded read limit (for urlread)
        9    - Requested byte range not satisfiable.
        10   - Byte range requested, but range support unavailable
        11   - Illegal reget mode
        12   - Socket timeout.

      MirrorGroup error codes (256 -- 511)
        256  - No more mirrors left to try

      Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
        [ this range reserved for application-specific error codes ]

      Retry codes (< 0)
        -1   - retry the download, unknown reason

    To find out which group a code belongs to, integer-divide it by
    256: e.errno / 256.

    Negative codes are reserved for check functions passed in to
    retrygrab via checkfunc.  -1 is the built-in generic retry code and
    already appears in the default retrycodes list, so a custom check
    function that raises with errno -1 causes the fetch to be retried.
    For finer-grained retry handling, raise with other negative numbers
    and list them in retrycodes; this makes it easy to emit useful
    messages about what exactly failed.

    Typical usage:
      try: urlgrab(url)
      except URLGrabError, e:
         if e.errno == 3: ...
           # or
         print e.strerror
           # or simply
         print e  #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """
    pass
407
408
class CallbackObject:
    """Generic attribute container handed to user callbacks.

    urlgrabber stuffs whatever data a particular callback needs into an
    instance of this class, so every callback shares a single prototype
    regardless of what information is being passed back.  Any function
    that accepts a callback SHOULD document which attributes it will
    set on the object.

    This class may grow real functionality in the future.
    """
    pass
421
422
def close_all():
    """Close any keepalive connections that are still open."""
    if keepalive_handler is not None:
        keepalive_handler.close_all()
425
426
def urlgrab(url, filename=None, **kwargs):
    """Fetch <url> into a local file and return the local filename.

    When filename is None, the basename of the url is used.  The
    returned filename may differ from the one passed in - for example,
    with copy_local == 0 a file:// url is not copied at all and its
    existing path is returned.

    See the module documentation for a description of possible kwargs.
    """
    return default_grabber.urlgrab(url, filename, **kwargs)
435
436
def urlopen(url, **kwargs):
    """Open <url> and return a file-like object for it.

    When a progress object or throttle setting is in effect, the object
    returned is a special wrapper that honors them; it can still be
    treated like any ordinary file object.

    See the module documentation for a description of possible kwargs.
    """
    return default_grabber.urlopen(url, **kwargs)
445
446
def urlread(url, limit=None, **kwargs):
    """Return the contents of <url> as a string, up to 'limit' bytes.

    An exception is raised if the limit is exceeded.  The limit is a
    safety valve ("read the whole file, but don't use too much
    memory"), NOT a way of asking for only the first N bytes.

    See the module documentation for a description of possible kwargs.
    """
    return default_grabber.urlread(url, limit, **kwargs)
455
456
457
class URLGrabberOptions:
    """Class to ease kwargs handling.

    Options are resolved through a delegation chain: an instance
    created with a delegate stores only the options explicitly set on
    it and falls back (via __getattr__) to its delegate for everything
    else.  The root of the chain carries a default for every option.
    """

    def __init__(self, delegate=None, **kwargs):
        """Initialize URLGrabberOptions object.
        Set default values for all options and then update options specified
        in kwargs.
        """
        self.delegate = delegate
        if delegate is None:
            # root of the delegation chain: every option needs a default
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        # only called for names not set locally; defer to the delegate
        if self.delegate and hasattr(self.delegate, name):
            return getattr(self.delegate, name)
        raise AttributeError(name)

    def raw_throttle(self):
        """Calculate raw throttle value from throttle and bandwidth
        values.  An int throttle is an absolute bytes/second limit; a
        float throttle is a multiplier on bandwidth; 0 (or less)
        disables throttling.
        """
        if self.throttle <= 0:
            return 0
        elif isinstance(self.throttle, int):
            return float(self.throttle)
        else: # throttle is a float
            return self.bandwidth * self.throttle

    def derive(self, **kwargs):
        """Create a derived URLGrabberOptions instance.
        This method creates a new instance and overrides the
        options specified in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)

    def _set_attributes(self, **kwargs):
        """Update object attributes with those provided in kwargs.
        Raises URLGrabError(11) for an illegal reget mode.
        """
        self.__dict__.update(kwargs)
        if have_range and 'range' in kwargs:
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if self.reget not in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11, _('Illegal reget mode: %s') \
                               % (self.reget, ))

    def _set_defaults(self):
        """Set all options to their default values.
        When adding new options, make sure a default is
        provided here.
        """
        self.progress_obj = None
        self.throttle = 1.0
        self.bandwidth = 0
        self.retry = None
        self.retrycodes = [-1,2,4,5,6,7]
        self.checkfunc = None
        self.copy_local = 0
        self.close_connection = 0
        self.range = None
        self.user_agent = 'urlgrabber/%s' % __version__
        self.keepalive = 1
        self.proxies = None
        self.reget = None
        self.failure_callback = None
        self.prefix = None
        self.opener = None
        self.cache_openers = True
        self.timeout = None
        self.text = None
        self.http_headers = None
        self.ftp_headers = None
529
530
class URLGrabber:
531
    """Provides easy opening of URLs with a variety of options.
532
    
533
    All options are specified as kwargs. Options may be specified when
534
    the class is created and may be overridden on a per request basis.
535
    
536
    New objects inherit default values from default_grabber.
537
    """
538
    
539
    def __init__(self, **kwargs):
540
        self.opts = URLGrabberOptions(**kwargs)
541
    
542
    def _retry(self, opts, func, *args):
543
        tries = 0
544
        while 1:
545
            tries = tries + 1
546
            try:
547
                return apply(func, (opts,) + args, {})
548
            except URLGrabError, e:
549
                if DEBUG: print 'EXCEPTION: %s' % e
550
                if (opts.retry is None) \
551
                    or (tries == opts.retry) \
552
                    or (e.errno not in opts.retrycodes): raise
553
                if opts.failure_callback:
554
                    cb_func, cb_args, cb_kwargs = \
555
                          self._make_callback(opts.failure_callback)
556
                    # this is a little icky - for now, the first element
557
                    # of args is the url.  we might consider a way to tidy
558
                    # that up, though
559
                    obj = CallbackObject()
560
                    obj.exception = e
561
                    obj.url = args[0]
562
                    cb_func(obj, *cb_args, **cb_kwargs)
563
    
564
    def urlopen(self, url, **kwargs):
565
        """open the url and return a file object
566
        If a progress object or throttle value specified when this 
567
        object was created, then  a special file object will be 
568
        returned that supports them. The file object can be treated 
569
        like any other file object.
570
        """
571
        opts = self.opts.derive(**kwargs)
572
        (url,parts) = self._parse_url(url) 
573
        def retryfunc(opts, url):
574
            return URLGrabberFileObject(url, filename=None, opts=opts)
575
        return self._retry(opts, retryfunc, url)
576
    
577
    def urlgrab(self, url, filename=None, **kwargs):
578
        """grab the file at <url> and make a local copy at <filename>
579
        If filename is none, the basename of the url is used.
580
        urlgrab returns the filename of the local file, which may be 
581
        different from the passed-in filename if copy_local == 0.
582
        """
583
        opts = self.opts.derive(**kwargs)
584
        (url, parts) = self._parse_url(url)
585
        (scheme, host, path, parm, query, frag) = parts
586
        if filename is None:
587
            if scheme in [ 'http', 'https' ]:
588
                filename = os.path.basename( urllib.unquote(path) )
589
            else:
590
                filename = os.path.basename( path )
591
        if scheme == 'file' and not opts.copy_local:
592
            # just return the name of the local file - don't make a 
593
            # copy currently
594
            if not os.path.exists(path):
595
                raise URLGrabError(2, 
596
                      _('Local file does not exist: %s') % (path, ))
597
            elif not os.path.isfile(path):
598
                raise URLGrabError(3, 
599
                              _('Not a normal file: %s') % (path, ))
600
            elif not opts.range:
601
                return path
602
        
603
        def retryfunc(opts, url, filename):
604
            fo = URLGrabberFileObject(url, filename, opts)
605
            try:
606
                fo._do_grab()
607
                if not opts.checkfunc is None:
608
                    cb_func, cb_args, cb_kwargs = \
609
                             self._make_callback(opts.checkfunc)
610
                    obj = CallbackObject()
611
                    obj.filename = filename
612
                    obj.url = url
613
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
614
            finally:
615
                fo.close()
616
            return filename
617
        
618
        return self._retry(opts, retryfunc, url, filename)
619
    
620
    def urlread(self, url, limit=None, **kwargs):
621
        """read the url into a string, up to 'limit' bytes
622
        If the limit is exceeded, an exception will be thrown.  Note
623
        that urlread is NOT intended to be used as a way of saying 
624
        "I want the first N bytes" but rather 'read the whole file 
625
        into memory, but don't use too much'
626
        """
627
        opts = self.opts.derive(**kwargs)
628
        (url, parts) = self._parse_url(url)
629
        if limit is not None:
630
            limit = limit + 1
631
            
632
        def retryfunc(opts, url, limit):
633
            fo = URLGrabberFileObject(url, filename=None, opts=opts)
634
            s = ''
635
            try:
636
                # this is an unfortunate thing.  Some file-like objects
637
                # have a default "limit" of None, while the built-in (real)
638
                # file objects have -1.  They each break the other, so for
639
                # now, we just force the default if necessary.
640
                if limit is None: s = fo.read()
641
                else: s = fo.read(limit)
642
643
                if not opts.checkfunc is None:
644
                    cb_func, cb_args, cb_kwargs = \
645
                             self._make_callback(opts.checkfunc)
646
                    obj = CallbackObject()
647
                    obj.data = s
648
                    obj.url = url
649
                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
650
            finally:
651
                fo.close()
652
            return s
653
            
654
        s = self._retry(opts, retryfunc, url, limit)
655
        if limit and len(s) > limit:
656
            raise URLGrabError(8, 
657
                        _('Exceeded limit (%i): %s') % (limit, url))
658
        return s
659
        
660
    def _parse_url(self,url):
661
        """break up the url into its component parts
662
663
        This function disassembles a url and
664
        1) "normalizes" it, tidying it up a bit
665
        2) does any authentication stuff it needs to do
666
667
        it returns the (cleaned) url and a tuple of component parts
668
        """
669
        if self.opts.prefix:
670
            p = self.opts.prefix
671
            if p[-1] == '/' or url[0] == '/': url = p + url
672
            else: url = p + '/' + url
673
            
674
        (scheme, host, path, parm, query, frag) = \
675
                                             urlparse.urlparse(url)
676
        if not scheme:
677
            if not url[0] == '/': url = os.path.abspath(url)
678
            url = 'file:' + url
679
            (scheme, host, path, parm, query, frag) = \
680
                                             urlparse.urlparse(url)
681
        path = os.path.normpath(path)
682
        if scheme in ['http', 'https']: path = urllib.quote(path)
683
        if '@' in host and auth_handler and scheme in ['http', 'https']:
684
            try:
685
                user_pass, host = host.split('@', 1)
686
                if ':' in user_pass: user, password = user_pass.split(':', 1)
687
            except ValueError, e:
688
                raise URLGrabError(1, _('Bad URL: %s') % url)
689
            if DEBUG: print 'adding HTTP auth: %s, %s' % (user, password)
690
            auth_handler.add_password(None, host, user, password)
691
        parts = (scheme, host, path, parm, query, frag)
692
        url = urlparse.urlunparse(parts)
693
        return url, parts
694
        
695
    def _make_callback(self, callback_obj):
696
        if callable(callback_obj):
697
            return callback_obj, (), {}
698
        else:
699
            return callback_obj
700
701
# create the default URLGrabber used by urlXXX functions.
702
# NOTE: actual defaults are set in URLGrabberOptions
703
default_grabber = URLGrabber()
704
705
class URLGrabberFileObject:
    """This is a file-object wrapper that supports progress objects 
    and throttling.

    This exists to solve the following problem: lets say you want to
    drop-in replace a normal open with urlopen.  You want to use a
    progress meter and/or throttling, but how do you do that without
    rewriting your code?  Answer: urlopen will return a wrapped file
    object that does the progress meter and-or throttling internally.
    """

    def __init__(self, url, filename, opts):
        """Open url immediately.  filename is the local target (path or
        None) and opts carries the grabber options controlling
        throttling, progress, reget, byte ranges, etc."""
        self.url = url
        self.filename = filename
        self.opts = opts
        self.fo = None              # real file object, set by _do_open
        self._rbuf = ''             # local read-ahead buffer
        self._rbufsize = 1024*8     # chunk size for buffered reads
        self._ttime = time.time()   # timestamp of last throttle check
        self._tsize = 0             # bytes read in the last chunk (throttle)
        self._amount_read = 0       # total bytes delivered (progress meter)
        self._opener = None         # lazily-built urllib2 opener
        self._do_open()

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods."""
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError, name

    def _get_opener(self):
        """Build a urllib2 OpenerDirector based on request options."""
        if self.opts.opener:
            return self.opts.opener
        elif self._opener is None:
            handlers = []
            need_keepalive_handler = (keepalive_handler and self.opts.keepalive)
            need_range_handler = (range_handlers and \
                                  (self.opts.range or self.opts.reget))
            # if you specify a ProxyHandler when creating the opener
            # it _must_ come before all other handlers in the list or urllib2
            # chokes.
            if self.opts.proxies:
                handlers.append( CachedProxyHandler(self.opts.proxies) )

                # -------------------------------------------------------
                # OK, these next few lines are a serious kludge to get
                # around what I think is a bug in python 2.2's
                # urllib2.  The basic idea is that default handlers
                # get applied first.  If you override one (like a
                # proxy handler), then the default gets pulled, but
                # the replacement goes on the end.  In the case of
                # proxies, this means the normal handler picks it up
                # first and the proxy isn't used.  Now, this probably
                # only happened with ftp or non-keepalive http, so not
                # many folks saw it.  The simple approach to fixing it
                # is just to make sure you override the other
                # conflicting defaults as well.  I would LOVE to see
                # these go way or be dealt with more elegantly.  The
                # problem isn't there after 2.2.  -MDS 2005/02/24
                if not need_keepalive_handler:
                    handlers.append( urllib2.HTTPHandler() )
                if not need_range_handler:
                    handlers.append( urllib2.FTPHandler() )
                # -------------------------------------------------------

            if need_keepalive_handler:
                handlers.append( keepalive_handler )
            if need_range_handler:
                handlers.extend( range_handlers )
            handlers.append( auth_handler )
            if self.opts.cache_openers:
              self._opener = CachedOpenerDirector(*handlers)
            else:
              self._opener = urllib2.build_opener(*handlers)
            # OK, I don't like to do this, but otherwise, we end up with
            # TWO user-agent headers.
            self._opener.addheaders = []
        return self._opener

    def _do_open(self):
        """Perform the request and set self.fo/self.hdr.

        Handles 'check_timestamp' reget (refetching from scratch when
        the server copy is newer than the partial local one) and, when
        no progress/throttle/timeout is in play, short-circuits
        read/readline straight to the raw file object for speed."""
        opener = self._get_opener()

        req = urllib2.Request(self.url) # build request object
        self._add_headers(req) # add misc headers that we need
        self._build_range(req) # take care of reget and byterange stuff

        fo, hdr = self._make_request(req, opener)
        if self.reget_time and self.opts.reget == 'check_timestamp':
            # do this if we have a local file with known timestamp AND
            # we're in check_timestamp reget mode.
            fetch_again = 0
            try:
                modified_tuple  = hdr.getdate_tz('last-modified')
                modified_stamp  = rfc822.mktime_tz(modified_tuple)
                if modified_stamp > self.reget_time: fetch_again = 1
            except (TypeError,):
                # missing/unparseable last-modified header: play it safe
                # and refetch the whole file.
                fetch_again = 1

            if fetch_again:
                # the server version is newer than the (incomplete) local
                # version, so we should abandon the version we're getting
                # and fetch the whole thing again.
                fo.close()
                self.opts.reget = None
                del req.headers['Range']
                self._build_range(req)
                fo, hdr = self._make_request(req, opener)

        (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
        if not (self.opts.progress_obj or self.opts.raw_throttle() \
                or self.opts.timeout):
            # if we're not using the progress_obj, throttling, or timeout
            # we can get a performance boost by going directly to
            # the underlying fileobject for reads.
            self.read = fo.read
            if hasattr(fo, 'readline'):
                self.readline = fo.readline
        elif self.opts.progress_obj:
            # NOTE(review): bare except maps any missing/malformed
            # Content-Length to an unknown length for the meter.
            try:    length = int(hdr['Content-Length'])
            except: length = None
            self.opts.progress_obj.start(str(self.filename), self.url, 
                                         os.path.basename(path), 
                                         length,
                                         text=self.opts.text)
            self.opts.progress_obj.update(0)
        (self.fo, self.hdr) = (fo, hdr)

    def _add_headers(self, req):
        """Add the configured User-agent and any per-protocol extra
        headers (http_headers for http/https, ftp_headers for ftp)."""
        if self.opts.user_agent:
            req.add_header('User-agent', self.opts.user_agent)
        try: req_type = req.get_type()
        except ValueError: req_type = None
        if self.opts.http_headers and req_type in ('http', 'https'):
            for h, v in self.opts.http_headers:
                req.add_header(h, v)
        if self.opts.ftp_headers and req_type == 'ftp':
            for h, v in self.opts.ftp_headers:
                req.add_header(h, v)

    def _build_range(self, req):
        """Attach a Range header covering reget (resume from the current
        size of the partial local file) and/or an explicit byte range.

        Side effects: sets self.reget_time (mtime of the partial local
        file, or None) and self.append (whether _do_grab appends)."""
        self.reget_time = None
        self.append = 0
        reget_length = 0
        rt = None
        if have_range and self.opts.reget and type(self.filename) == type(''):
            # we have reget turned on and we're dumping to a file
            try:
                s = os.stat(self.filename)
            except OSError:
                # no local file yet -- nothing to resume from
                pass
            else:
                self.reget_time = s[ST_MTIME]
                reget_length = s[ST_SIZE]
                rt = reget_length, ''
                self.append = 1

        if self.opts.range:
            if not have_range:
                raise URLGrabError(10, _('Byte range requested but range '\
                                         'support unavailable'))
            rt = self.opts.range
            # shift the caller's range past what we already have locally
            if rt[0]: rt = (rt[0] + reget_length, rt[1])

        if rt:
            header = range_tuple_to_header(rt)
            if header: req.add_header('Range', header)

    def _make_request(self, req, opener):
        """Open req via opener (with an optional socket-level timeout)
        and return (fileobject, headers).

        Raises URLGrabError with a code classifying the failure:
        1 bad URL, 4 IOError, 5 OSError, 7 HTTP protocol error,
        9 range error, 12 timeout."""
        try:
            if have_socket_timeout and self.opts.timeout:
                # socket timeouts are process-global: set, use, restore
                old_to = socket.getdefaulttimeout()
                socket.setdefaulttimeout(self.opts.timeout)
                try:
                    fo = opener.open(req)
                finally:
                    socket.setdefaulttimeout(old_to)
            else:
                fo = opener.open(req)
            hdr = fo.info()
        except ValueError, e:
            raise URLGrabError(1, _('Bad URL: %s') % (e, ))
        except RangeError, e:
            raise URLGrabError(9, _('%s') % (e, ))
        except IOError, e:
            if hasattr(e, 'reason') and have_socket_timeout and \
                   isinstance(e.reason, TimeoutError):
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            else:
                raise URLGrabError(4, _('IOError: %s') % (e, ))
        except OSError, e:
            raise URLGrabError(5, _('OSError: %s') % (e, ))
        except HTTPException, e:
            raise URLGrabError(7, _('HTTP Error (%s): %s') % \
                            (e.__class__.__name__, e))
        else:
            return (fo, hdr)

    def _do_grab(self):
        """dump the file to self.filename.

        Appends when _build_range decided we are resuming a partial
        download; returns the number of bytes written this call."""
        if self.append: new_fo = open(self.filename, 'ab')
        else: new_fo = open(self.filename, 'wb')
        bs = 1024*8
        size = 0

        block = self.read(bs)
        size = size + len(block)
        while block:
            new_fo.write(block)
            block = self.read(bs)
            size = size + len(block)

        new_fo.close()
        try:
            # best effort: stamp the local file with the server's mtime
            # so a later check_timestamp reget can compare against it.
            modified_tuple  = self.hdr.getdate_tz('last-modified')
            modified_stamp  = rfc822.mktime_tz(modified_tuple)
            os.utime(self.filename, (modified_stamp, modified_stamp))
        except (TypeError,), e: pass

        return size

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object.  If amt is None, then it will
        read until it gets nothing more.  It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        buf = [self._rbuf]
        bufsize = len(self._rbuf)
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.opts.raw_throttle():
                diff = self._tsize/self.opts.raw_throttle() - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else:           readamount = min(amt, self._rbufsize)
            try:
                new = self.fo.read(readamount)
            except socket.error, e:
                raise URLGrabError(4, _('Socket Error: %s') % (e, ))
            except TimeoutError, e:
                raise URLGrabError(12, _('Timeout: %s') % (e, ))
            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            bufsize = bufsize + newsize
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            if self.opts.progress_obj:
                self.opts.progress_obj.update(self._amount_read)

        self._rbuf = string.join(buf, '')
        return

    def read(self, amt=None):
        """Read and return up to amt bytes (everything if amt is None)."""
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        """Read and return one line, including the trailing newline if
        present; at most 'limit' bytes when limit > 0."""
        i = string.find(self._rbuf, '\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            L = len(self._rbuf)
            self._fill_buffer(L + self._rbufsize)
            if not len(self._rbuf) > L: break
            i = string.find(self._rbuf, '\n', L)

        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit

        s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return s

    def close(self):
        """Finish the progress meter (if any) and close the underlying
        file object, optionally closing the keepalive connection too."""
        if self.opts.progress_obj:
            self.opts.progress_obj.end(self._amount_read)
        self.fo.close()
        if self.opts.close_connection:
            # best effort: only keepalive file objects have this method
            try: self.fo.close_connection()
            except: pass
# cache of (handler-tuple, opener) pairs, keyed on handler identity
_handler_cache = []
def CachedOpenerDirector(*handlers):
    """Return an OpenerDirector for this exact handler tuple, reusing a
    cached one (re-attaching its handlers) when available."""
    for cached_key, cached_opener in _handler_cache:
        if cached_key == handlers:
            # re-parent the handlers: they may have been detached since
            # the opener was last used
            for h in cached_opener.handlers:
                h.add_parent(cached_opener)
            return cached_opener
    fresh = urllib2.build_opener(*handlers)
    _handler_cache.append((handlers, fresh))
    return fresh
# cache of (proxy-dict, ProxyHandler) pairs
_proxy_cache = []
def CachedProxyHandler(proxies):
    """Return a ProxyHandler for this proxy dict, reusing a cached one
    when an equal dict has been seen before."""
    for cached_proxies, cached_handler in _proxy_cache:
        if cached_proxies == proxies:
            return cached_handler
    handler = urllib2.ProxyHandler(proxies)
    _proxy_cache.append((proxies, handler))
    return handler
#####################################################################
# DEPRECATED FUNCTIONS
# These mutate the shared module-level default_grabber; prefer setting
# the attribute directly, as each docstring shows.
def set_throttle(new_throttle):
    """Deprecated. Use: default_grabber.throttle = new_throttle"""
    default_grabber.throttle = new_throttle

def set_bandwidth(new_bandwidth):
    """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
    default_grabber.bandwidth = new_bandwidth

def set_progress_obj(new_progress_obj):
    """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
    default_grabber.progress_obj = new_progress_obj

def set_user_agent(new_user_agent):
    """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
    default_grabber.user_agent = new_user_agent
def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead"""
    # thin shim: forward everything to urlgrab, mapping numtries->retry
    return urlgrab(url, filename,
                   copy_local=copy_local,
                   close_connection=close_connection,
                   progress_obj=progress_obj,
                   throttle=throttle,
                   bandwidth=bandwidth,
                   retry=numtries,
                   retrycodes=retrycodes,
                   checkfunc=checkfunc)
#####################################################################
1065
#  TESTING
1066
def _main_test():
1067
    import sys
1068
    try: url, filename = sys.argv[1:3]
1069
    except ValueError:
1070
        print 'usage:', sys.argv[0], \
1071
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1072
        sys.exit()
1073
1074
    kwargs = {}
1075
    for a in sys.argv[3:]:
1076
        k, v = string.split(a, '=', 1)
1077
        kwargs[k] = int(v)
1078
1079
    set_throttle(1.0)
1080
    set_bandwidth(32 * 1024)
1081
    print "throttle: %s,  throttle bandwidth: %s B/s" % (default_grabber.throttle, 
1082
                                                        default_grabber.bandwidth)
1083
1084
    try: from progress import text_progress_meter
1085
    except ImportError, e: pass
1086
    else: kwargs['progress_obj'] = text_progress_meter()
1087
1088
    try: name = apply(urlgrab, (url, filename), kwargs)
1089
    except URLGrabError, e: print e
1090
    else: print 'LOCAL FILE:', name
1091
1092
1093
def _retry_test():
1094
    import sys
1095
    try: url, filename = sys.argv[1:3]
1096
    except ValueError:
1097
        print 'usage:', sys.argv[0], \
1098
              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
1099
        sys.exit()
1100
1101
    kwargs = {}
1102
    for a in sys.argv[3:]:
1103
        k, v = string.split(a, '=', 1)
1104
        kwargs[k] = int(v)
1105
1106
    try: from progress import text_progress_meter
1107
    except ImportError, e: pass
1108
    else: kwargs['progress_obj'] = text_progress_meter()
1109
1110
    global DEBUG
1111
    #DEBUG = 1
1112
    def cfunc(filename, hello, there='foo'):
1113
        print hello, there
1114
        import random
1115
        rnum = random.random()
1116
        if rnum < .5:
1117
            print 'forcing retry'
1118
            raise URLGrabError(-1, 'forcing retry')
1119
        if rnum < .75:
1120
            print 'forcing failure'
1121
            raise URLGrabError(-2, 'forcing immediate failure')
1122
        print 'success'
1123
        return
1124
        
1125
    close_all()
1126
    kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
1127
    try: name = apply(retrygrab, (url, filename), kwargs)
1128
    except URLGrabError, e: print e
1129
    else: print 'LOCAL FILE:', name
1130
1131
def _file_object_test(filename=None):
1132
    import random, cStringIO, sys
1133
    if filename is None:
1134
        filename = __file__
1135
    print 'using file "%s" for comparisons' % filename
1136
    fo = open(filename)
1137
    s_input = fo.read()
1138
    fo.close()
1139
1140
    for testfunc in [_test_file_object_smallread,
1141
                     _test_file_object_readall,
1142
                     _test_file_object_readline,
1143
                     _test_file_object_readlines]:
1144
        fo_input = cStringIO.StringIO(s_input)
1145
        fo_output = cStringIO.StringIO()
1146
        wrapper = URLGrabberFileObject(fo_input, None, 0)
1147
        print 'testing %-30s ' % testfunc.__name__,
1148
        testfunc(wrapper, fo_output)
1149
        s_output = fo_output.getvalue()
1150
        if s_output == s_input: print 'passed'
1151
        else: print 'FAILED'
1152
            
1153
def _test_file_object_smallread(wrapper, fo_output):
1154
    while 1:
1155
        s = wrapper.read(23)
1156
        fo_output.write(s)
1157
        if not s: return
1158
1159
def _test_file_object_readall(wrapper, fo_output):
1160
    s = wrapper.read()
1161
    fo_output.write(s)
1162
1163
def _test_file_object_readline(wrapper, fo_output):
1164
    while 1:
1165
        s = wrapper.readline()
1166
        fo_output.write(s)
1167
        if not s: return
1168
1169
def _test_file_object_readlines(wrapper, fo_output):
1170
    li = wrapper.readlines()
1171
    fo_output.write(string.join(li, ''))
1172
1173
# manual test entry point: the first two tests hit the network with
# sys.argv-supplied url/filename; the last compares read styles against
# the local file 'test'.
if __name__ == '__main__':
    _main_test()
    _retry_test()
    _file_object_test('test')