~bzr-pqm/bzr/bzr.dev : revision 647

1

# This library is free software; you can redistribute it and/or

2

# modify it under the terms of the GNU Lesser General Public

3

# License as published by the Free Software Foundation; either

4

# version 2.1 of the License, or (at your option) any later version.

5

#

6

# This library is distributed in the hope that it will be useful,

7

# but WITHOUT ANY WARRANTY; without even the implied warranty of

8

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

9

# Lesser General Public License for more details.

10

#

11

# You should have received a copy of the GNU Lesser General Public

12

# License along with this library; if not, write to the

13

# Free Software Foundation, Inc.,

14

# 59 Temple Place, Suite 330,

15

# Boston, MA 02111-1307 USA

16

17

# This file is part of urlgrabber, a high-level cross-protocol url-grabber

18

19

20

"""A high-level cross-protocol url-grabber.

21

22

GENERAL ARGUMENTS (kwargs)

23

24

Where possible, the module-level default is indicated, and legal

25

values are provided.

26

27

copy_local = 0 [0|1]

28

29

ignored except for file:// urls, in which case it specifies

30

whether urlgrab should still make a copy of the file, or simply

31

point to the existing copy. The module level default for this

32

option is 0.

33

34

close_connection = 0 [0|1]

35

36

tells URLGrabber to close the connection after a file has been

37

transferred. This is ignored unless the download happens with the

38

http keepalive handler (keepalive=1). Otherwise, the connection

39

is left open for further use. The module level default for this

40

option is 0 (keepalive connections will not be closed).

41

42

keepalive = 1 [0|1]

43

44

specifies whether keepalive should be used for HTTP/1.1 servers

45

that support it. The module level default for this option is 1

46

(keepalive is enabled).

47

48

progress_obj = None

49

50

a class instance that supports the following methods:

51

po.start(filename, url, basename, length, text)

52

# length will be None if unknown

53

po.update(read) # read == bytes read so far

54

po.end()

55

56

text = None

57

58

specifies an alternative text item in the beginning of the progress

59

bar line. If not given, the basename of the file is used.

60

61

throttle = 1.0

62

63

a number - if it's an int, it's the bytes/second throttle limit.

64

If it's a float, it is first multiplied by bandwidth. If throttle

65

== 0, throttling is disabled. If None, the module-level default

66

(which can be set on default_grabber.throttle) is used. See

67

BANDWIDTH THROTTLING for more information.

68

69

timeout = None

70

71

a positive float expressing the number of seconds to wait for socket

72

operations. If the value is None or 0.0, socket operations will block

73

forever. Setting this option causes urlgrabber to call the settimeout

74

method on the Socket object used for the request. See the Python

75

documentation on settimeout for more information.

76

http://www.python.org/doc/current/lib/socket-objects.html

77

78

bandwidth = 0

79

80

the nominal max bandwidth in bytes/second. If throttle is a float

81

and bandwidth == 0, throttling is disabled. If None, the

82

module-level default (which can be set on

83

default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for

84

more information.

85

86

range = None

87

88

a tuple of the form (first_byte, last_byte) describing a byte

89

range to retrieve. Either or both of the values may be set to

90

None. If first_byte is None, byte offset 0 is assumed. If

91

last_byte is None, the last byte available is assumed. Note that

92

the range specification is python-like in that (0,10) will yield

93

the first 10 bytes of the file.

94

95

If set to None, no range will be used.

96

97

reget = None [None|'simple'|'check_timestamp']

98

99

whether to attempt to reget a partially-downloaded file. Reget

100

only applies to .urlgrab and (obviously) only if there is a

101

partially downloaded file. Reget has two modes:

102

103

'simple' -- the local file will always be trusted. If there

104

are 100 bytes in the local file, then the download will always

105

begin 100 bytes into the requested file.

106

107

'check_timestamp' -- the timestamp of the server file will be

108

compared to the timestamp of the local file. ONLY if the

109

local file is newer than or the same age as the server file

110

will reget be used. If the server file is newer, or the

111

timestamp is not returned, the entire file will be fetched.

112

113

NOTE: urlgrabber can do very little to verify that the partial

114

file on disk is identical to the beginning of the remote file.

115

You may want to either employ a custom "checkfunc" or simply avoid

116

using reget in situations where corruption is a concern.

117

118

user_agent = 'urlgrabber/VERSION'

119

120

a string, usually of the form 'AGENT/VERSION' that is provided to

121

HTTP servers in the User-agent header. The module level default

122

for this option is "urlgrabber/VERSION".

123

124

http_headers = None

125

126

a tuple of 2-tuples, each containing a header and value. These

127

will be used for http and https requests only. For example, you

128

can do

129

http_headers = (('Pragma', 'no-cache'),)

130

131

ftp_headers = None

132

133

this is just like http_headers, but will be used for ftp requests.

134

135

proxies = None

136

137

a dictionary that maps protocol schemes to proxy hosts. For

138

example, to use a proxy server on host "foo" port 3128 for http

139

and https URLs:

140

proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }

141

note that proxy authentication information may be provided using

142

normal URL constructs:

143

proxies={ 'http' : 'http://user:password@foo:3128' }

144

Lastly, if proxies is None, the default environment settings will

145

be used.

146

147

prefix = None

148

149

a url prefix that will be prepended to all requested urls. For

150

example:

151

g = URLGrabber(prefix='http://foo.com/mirror/')

152

g.urlgrab('some/file.txt')

153

## this will fetch 'http://foo.com/mirror/some/file.txt'

154

This option exists primarily to allow identical behavior to

155

MirrorGroup (and derived) instances. Note: a '/' will be inserted

156

if necessary, so you cannot specify a prefix that ends with a

157

partial file or directory name.

158

159

opener = None

160

161

Overrides the default urllib2.OpenerDirector provided to urllib2

162

when making requests. This option exists so that the urllib2

163

handler chain may be customized. Note that the range, reget,

164

proxy, and keepalive features require that custom handlers be

165

provided to urllib2 in order to function properly. If an opener

166

option is provided, no attempt is made by urlgrabber to ensure

167

chain integrity. You are responsible for ensuring that any

168

extension handlers are present if said features are required.

169

170

RETRY RELATED ARGUMENTS

171

172

retry = None

173

174

the number of times to retry the grab before bailing. If this is

175

zero, it will retry forever. This was intentional... really, it

176

was :). If this value is not supplied or is supplied but is None

177

retrying does not occur.

178

179

retrycodes = [-1,2,4,5,6,7]

180

181

a sequence of errorcodes (values of e.errno) for which it should

182

retry. See the doc on URLGrabError for more details on

183

this. retrycodes defaults to [-1,2,4,5,6,7] if not specified

184

explicitly.

185

186

checkfunc = None

187

188

a function to do additional checks. This defaults to None, which

189

means no additional checking. The function should simply return

190

on a successful check. It should raise URLGrabError on an

191

unsuccessful check. Raising of any other exception will be

192

considered immediate failure and no retries will occur.

193

194

If it raises URLGrabError, the error code will determine the retry

195

behavior. Negative error numbers are reserved for use by these

196

passed in functions, so you can use many negative numbers for

197

different types of failure. By default, -1 results in a retry,

198

but this can be customized with retrycodes.

199

200

If you simply pass in a function, it will be given exactly one

201

argument: a CallbackObject instance with the .url attribute

202

defined and either .filename (for urlgrab) or .data (for urlread).

203

For urlgrab, .filename is the name of the local file. For

204

urlread, .data is the actual string data. If you need other

205

arguments passed to the callback (program state of some sort), you

206

can do so like this:

207

208

checkfunc=(function, ('arg1', 2), {'kwarg': 3})

209

210

if the downloaded file has filename /tmp/stuff, then this will

211

result in this call (for urlgrab):

212

213

function(obj, 'arg1', 2, kwarg=3)

214

# obj.filename = '/tmp/stuff'

215

# obj.url = 'http://foo.com/stuff'

216

217

NOTE: both the "args" tuple and "kwargs" dict must be present if

218

you use this syntax, but either (or both) can be empty.

219

220

failure_callback = None

221

222

The callback that gets called during retries when an attempt to

223

fetch a file fails. The syntax for specifying the callback is

224

identical to checkfunc, except for the attributes defined in the

225

CallbackObject instance. In this case, it will have .exception

226

and .url defined. As you might suspect, .exception is the

227

exception that was raised.

228

229

The callback is present primarily to inform the calling program of

230

the failure, but if it raises an exception (including the one it's

231

passed) that exception will NOT be caught and will therefore cause

232

future retries to be aborted.

233

234

BANDWIDTH THROTTLING

235

236

urlgrabber supports throttling via two values: throttle and

237

bandwidth. Between the two, you can either specify an absolute

238

throttle threshold or specify a threshold as a fraction of maximum

239

available bandwidth.

240

241

throttle is a number - if it's an int, it's the bytes/second

242

throttle limit. If it's a float, it is first multiplied by

243

bandwidth. If throttle == 0, throttling is disabled. If None, the

244

module-level default (which can be set with set_throttle) is used.

245

246

bandwidth is the nominal max bandwidth in bytes/second. If throttle

247

is a float and bandwidth == 0, throttling is disabled. If None, the

248

module-level default (which can be set with set_bandwidth) is used.

249

250

THROTTLING EXAMPLES:

251

252

Let's say you have a 100 Mbps connection. This is (about) 10^8 bits

253

per second, or 12,500,000 Bytes per second. You have a number of

254

throttling options:

255

256

*) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float

257

258

This will limit urlgrab to use half of your available bandwidth.

259

260

*) set_throttle(6250000) # throttle is an int

261

262

This will also limit urlgrab to use half of your available

263

bandwidth, regardless of what bandwidth is set to.

264

265

*) set_bandwidth(6250000); set_throttle(1.0) # float

266

267

Use half your bandwidth

268

269

*) set_bandwidth(6250000); set_throttle(2.0) # float

270

271

Use up to 12,500,000 Bytes per second (your nominal max bandwidth)

272

273

*) set_bandwidth(6250000); set_throttle(0) # throttle = 0

274

275

Disable throttling - this is more efficient than a very large

276

throttle setting.

277

278

*) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0

279

280

Disable throttling - this is the default when the module is loaded.

281

282

SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)

283

284

While this is flexible, it's not extremely obvious to the user. I

285

suggest you implement a float throttle as a percent to make the

286

distinction between absolute and relative throttling very explicit.

287

288

Also, you may want to convert the units to something more convenient

289

than bytes/second, such as kbps or kB/s, etc.

290

291

"""

292

293

# $Id: grabber.py,v 1.39 2005/03/03 00:54:23 mstenner Exp $

294

295

import os

296

import os.path

297

import urlparse

298

import rfc822

299

import time

300

import string

301

import urllib

302

import urllib2

303

from stat import * # S_* and ST_*

304

305

# Borrow __version__ from the top-level package this module was imported
# from.  The package name is only known at runtime (it is the first dotted
# component of __name__), so the import statement is assembled as a string.
try:
    exec('from ' + (__name__.split('.'))[0] + ' import __version__')
except Exception:
    # Best effort only: fall back to a placeholder instead of failing the
    # import of this module.
    __version__ = '???'

# One shared basic-auth handler.  URLGrabber._parse_url() registers the
# credentials it finds in URLs on it, and _get_opener() adds it to every
# opener chain it builds.
auth_handler = urllib2.HTTPBasicAuthHandler(
    urllib2.HTTPPasswordMgrWithDefaultRealm())

# Set to a true value to have diagnostics printed to stdout.
DEBUG = 0

# Optional translation hook: fall back to an identity function when no
# i18n module is importable.
try:
    from i18n import _
except ImportError:
    def _(st):
        return st

# HTTPException is None when httplib cannot be imported.
try:
    from httplib import HTTPException
except ImportError:
    HTTPException = None

try:
    # This is a convenient way to make keepalive optional.
    # Just rename the module so it can't be imported.
    from keepalive import HTTPHandler
except ImportError:
    keepalive_handler = None
else:
    keepalive_handler = HTTPHandler()

try:
    # add in range support conditionally too
    from urlgrabber.byterange import HTTPRangeHandler, FileRangeHandler, \
         FTPRangeHandler, range_tuple_normalize, range_tuple_to_header, \
         RangeError
except ImportError:
    # Without byterange the 'range' and 'reget' options cannot be honoured;
    # have_range is the flag the rest of the module tests.
    range_handlers = ()
    RangeError = None
    have_range = 0
else:
    range_handlers = (HTTPRangeHandler(), FileRangeHandler(),
                      FTPRangeHandler())
    have_range = 1


# check whether socket timeout support is available (Python >= 2.3)
import socket
try:
    TimeoutError = socket.timeout
    have_socket_timeout = True
except AttributeError:
    TimeoutError = None
    have_socket_timeout = False

356

357

class URLGrabError(IOError):
    """Error raised by urlgrabber; carries a numeric code and a message.

    Instances are built as URLGrabError(code, message), so the usual
    IOError attributes apply: e.errno is the code and e.strerror is the
    message.

    URLGrabber error codes (0 -- 255)
      0    - everything looks good (you should never see this)
      1    - malformed url
      2    - local file doesn't exist
      3    - request for non-file local file (dir, etc)
      4    - IOError on fetch
      5    - OSError on fetch
      6    - no content length header when we expected one
      7    - HTTPException
      8    - Exceeded read limit (for urlread)
      9    - Requested byte range not satisfiable.
      10   - Byte range requested, but range support unavailable
      11   - Illegal reget mode
      12   - Socket timeout.

    MirrorGroup error codes (256 -- 511)
      256  - No more mirrors left to try

    Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
      [ this range reserved for application-specific error codes ]

    Retry codes (< 0)
      -1   - retry the download, unknown reason

    The group a non-negative code belongs to is its integer quotient by
    256: e.errno // 256

    Negative codes are reserved for the functions passed in as checkfunc.
    The value -1 is built in as a generic retry code and is already part
    of the default retrycodes list, so a check function only has to raise
    URLGrabError(-1, message) for the fetch to be re-tried.  For more
    customized retries, raise other negative numbers and include them in
    retrycodes.  This is nice for outputting useful messages about what
    failed.

    You can use these error codes like so:
      try: urlgrab(url)
      except URLGrabError, e:
         if e.errno == 3: ...
           # or
         print e.strerror
           # or simply
         print e  #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """

407

408

class CallbackObject:
    """Plain attribute holder handed to user callbacks.

    urlgrabber creates an instance, assigns whatever the callback needs
    as attributes, and passes it as the first argument.  Every callback
    therefore has the same prototype no matter what data is passed back.
    Within this module:

      checkfunc        gets .url plus .filename (urlgrab) or .data (urlread)
      failure_callback gets .url plus .exception

    Any function that accepts a callback SHOULD document which attributes
    it defines on this object.  The class has no behavior of its own at
    present, although it may grow some in the future.
    """

421

422

def close_all():
    """Close every connection held open by the keepalive handler.

    Does nothing when keepalive support is not available
    (keepalive_handler is None).
    """
    if not keepalive_handler:
        return
    keepalive_handler.close_all()

425

426

def urlgrab(url, filename=None, **kwargs):
    """Fetch url into a local file using the module-level default grabber.

    If filename is None, the basename of the url is used.  The return
    value is the name of the local file, which may differ from the
    passed-in filename when the copy_local kwarg == 0.

    See module documentation for a description of possible kwargs.
    """
    grabber = default_grabber
    return grabber.urlgrab(url, filename, **kwargs)

435

436

def urlopen(url, **kwargs):
    """Open url with the module-level default grabber; return a file object.

    When a progress object or throttle specification is in effect, the
    returned object is a special wrapper that supports them.  Either way
    it can be treated like any other file object.

    See module documentation for a description of possible kwargs.
    """
    grabber = default_grabber
    return grabber.urlopen(url, **kwargs)

445

446

def urlread(url, limit=None, **kwargs):
    """Read url into a string with the module-level default grabber.

    limit is a safety cap, not a way of asking for "the first N bytes":
    the intent is 'read the whole file into memory, but don't use too
    much', and an exception is thrown when the limit is exceeded.

    See module documentation for a description of possible kwargs.
    """
    grabber = default_grabber
    return grabber.urlread(url, limit, **kwargs)

455

456

457

class URLGrabberOptions:
    """Option container with delegation, to ease kwargs handling.

    An instance created without a delegate holds a full set of defaults
    (see _set_defaults) overlaid with the kwargs it was given.  An
    instance created with a delegate stores only its own kwargs; every
    option it does not hold itself is looked up on the delegate through
    __getattr__.  derive() builds such a child, which is how per-request
    overrides are layered over a grabber's options.
    """

    def __init__(self, delegate=None, **kwargs):
        """Initialize URLGrabberOptions object.

        delegate -- options object consulted for anything not set here;
                    when None, all defaults are set on this instance
        kwargs   -- option values to set on this instance

        Raises URLGrabError(11) when the effective reget mode is illegal.
        """
        self.delegate = delegate
        if delegate is None:
            self._set_defaults()
        self._set_attributes(**kwargs)

    def __getattr__(self, name):
        """Resolve an option missing on this instance via the delegate.

        Raises AttributeError when there is no delegate or the delegate
        chain does not provide the attribute either.
        """
        # Fetch 'delegate' from the instance dict rather than through
        # attribute access: on an instance where it was never assigned
        # (for example one created without running __init__) reading
        # self.delegate would re-enter __getattr__ without bound.
        delegate = self.__dict__.get('delegate')
        if delegate and hasattr(delegate, name):
            return getattr(delegate, name)
        raise AttributeError(name)

    def raw_throttle(self):
        """Calculate raw throttle value from throttle and bandwidth values.

        Returns 0 when throttling is disabled (throttle <= 0), the
        throttle itself as a float when it is an int, and otherwise
        bandwidth * throttle.
        """
        throttle = self.throttle
        if throttle <= 0:
            return 0
        if type(throttle) == type(0):
            return float(throttle)
        # throttle is a float: a multiplier applied to bandwidth
        return self.bandwidth * throttle

    def derive(self, **kwargs):
        """Create a derived URLGrabberOptions instance.

        The new instance delegates to this one and overrides only the
        options specified in kwargs.
        """
        return URLGrabberOptions(delegate=self, **kwargs)

    def _set_attributes(self, **kwargs):
        """Update object attributes with those provided in kwargs.

        A supplied 'range' is normalized when range support is available.
        Raises URLGrabError(11) for an illegal reget mode.
        """
        self.__dict__.update(kwargs)
        if have_range and 'range' in kwargs:
            # normalize the supplied range value
            self.range = range_tuple_normalize(self.range)
        if self.reget not in [None, 'simple', 'check_timestamp']:
            raise URLGrabError(11, _('Illegal reget mode: %s')
                               % (self.reget, ))

    def _set_defaults(self):
        """Set all options to their default values.

        When adding new options, make sure a default is provided here.
        """
        self.progress_obj = None
        self.throttle = 1.0
        self.bandwidth = 0
        self.retry = None
        self.retrycodes = [-1, 2, 4, 5, 6, 7]
        self.checkfunc = None
        self.copy_local = 0
        self.close_connection = 0
        self.range = None
        # One-element tuple on the right-hand side so that a tuple-valued
        # __version__ is rendered as a single value instead of being taken
        # as the argument list of the format.
        self.user_agent = 'urlgrabber/%s' % (__version__, )
        self.keepalive = 1
        self.proxies = None
        self.reget = None
        self.failure_callback = None
        self.prefix = None
        self.opener = None
        self.cache_openers = True
        self.timeout = None
        self.text = None
        self.http_headers = None
        self.ftp_headers = None

529

530

class URLGrabber:
    """Provides easy opening of URLs with a variety of options.

    All options are specified as kwargs.  Options may be specified when
    the class is created and may be overridden on a per request basis.

    Defaults for options that are not given come from
    URLGrabberOptions._set_defaults().
    """

    def __init__(self, **kwargs):
        """Store kwargs as this grabber's base options (self.opts)."""
        self.opts = URLGrabberOptions(**kwargs)

    def _retry(self, opts, func, *args):
        """Call func(opts, *args), retrying on URLGrabError as opts allow.

        The error is re-raised when opts.retry is None, when the number of
        attempts made equals opts.retry, or when its errno is not listed
        in opts.retrycodes.  A retry value of 0 never equals the attempt
        count, so it retries forever.  Before each retry,
        opts.failure_callback (if set) is invoked with a CallbackObject
        carrying .exception and .url; an exception raised by the callback
        is not caught here and so ends the retries.

        Exceptions other than URLGrabError propagate immediately.
        """
        # sys.exc_info() is used instead of an "except ..., e" clause so
        # the exception can be captured with syntax every interpreter
        # version accepts.
        import sys
        tries = 0
        while 1:
            tries = tries + 1
            try:
                return func(opts, *args)
            except URLGrabError:
                e = sys.exc_info()[1]
                if DEBUG: print('EXCEPTION: %s' % e)
                if (opts.retry is None) \
                   or (tries == opts.retry) \
                   or (e.errno not in opts.retrycodes): raise
                if opts.failure_callback:
                    # this is a little icky - for now, the first element
                    # of args is the url.  we might consider a way to tidy
                    # that up, though
                    obj = CallbackObject()
                    obj.exception = e
                    obj.url = args[0]
                    self._invoke_callback(opts.failure_callback, obj)

    def urlopen(self, url, **kwargs):
        """Open the url and return a file object.

        If a progress object or throttle value is in effect, the returned
        object is a special file object that supports them.  The file
        object can be treated like any other file object.
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url, opts)

        def retryfunc(opts, url):
            return URLGrabberFileObject(url, filename=None, opts=opts)

        return self._retry(opts, retryfunc, url)

    def urlgrab(self, url, filename=None, **kwargs):
        """Grab the file at url and make a local copy at filename.

        If filename is None, the basename of the url path is used.
        Returns the name of the local file.  For a file: URL with
        copy_local false and no byte range requested, nothing is copied
        and the path of the existing file is returned instead, so the
        result may differ from the passed-in filename.

        Raises URLGrabError(2) when such a local file does not exist and
        URLGrabError(3) when it is not a regular file.  After a fetch,
        opts.checkfunc (if set) is called with a CallbackObject carrying
        .filename and .url.
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url, opts)
        (scheme, host, path, parm, query, frag) = parts
        if filename is None:
            if scheme in ['http', 'https']:
                # _parse_url quoted the path; undo that for the local name
                filename = os.path.basename(urllib.unquote(path))
            else:
                filename = os.path.basename(path)
        if scheme == 'file' and not opts.copy_local:
            # just return the name of the local file - don't make a
            # copy currently
            if not os.path.exists(path):
                raise URLGrabError(2,
                      _('Local file does not exist: %s') % (path, ))
            elif not os.path.isfile(path):
                raise URLGrabError(3,
                      _('Not a normal file: %s') % (path, ))
            elif not opts.range:
                return path

        def retryfunc(opts, url, filename):
            fo = URLGrabberFileObject(url, filename, opts)
            try:
                fo._do_grab()
                if opts.checkfunc is not None:
                    obj = CallbackObject()
                    obj.filename = filename
                    obj.url = url
                    self._invoke_callback(opts.checkfunc, obj)
            finally:
                fo.close()
            return filename

        return self._retry(opts, retryfunc, url, filename)

    def urlread(self, url, limit=None, **kwargs):
        """Read the url into a string, up to 'limit' bytes.

        If the limit is exceeded, URLGrabError(8) is raised.  Note that
        urlread is NOT intended to be used as a way of saying "I want the
        first N bytes" but rather 'read the whole file into memory, but
        don't use too much'.

        opts.checkfunc (if set) is called with a CallbackObject carrying
        .data and .url.
        """
        opts = self.opts.derive(**kwargs)
        (url, parts) = self._parse_url(url, opts)
        # Ask for one byte more than is allowed: getting it back is the
        # only way to tell "more than limit" from "exactly limit".  The
        # caller's limit is kept separately so the check below compares
        # against the real limit, not the padded read size.
        if limit is None:
            read_size = None
        else:
            read_size = limit + 1

        def retryfunc(opts, url, read_size):
            fo = URLGrabberFileObject(url, filename=None, opts=opts)
            s = ''
            try:
                # this is an unfortunate thing.  Some file-like objects
                # have a default "limit" of None, while the built-in (real)
                # file objects have -1.  They each break the other, so for
                # now, we just force the default if necessary.
                if read_size is None: s = fo.read()
                else: s = fo.read(read_size)

                if opts.checkfunc is not None:
                    obj = CallbackObject()
                    obj.data = s
                    obj.url = url
                    self._invoke_callback(opts.checkfunc, obj)
            finally:
                fo.close()
            return s

        s = self._retry(opts, retryfunc, url, read_size)
        if limit is not None and len(s) > limit:
            raise URLGrabError(8,
                        _('Exceeded limit (%i): %s') % (limit, url))
        return s

    def _parse_url(self, url, opts=None):
        """Break up the url into its component parts.

        This function disassembles a url and
          1) "normalizes" it, tidying it up a bit
          2) does any authentication stuff it needs to do

        opts supplies the prefix option; when omitted, the grabber's own
        options are used.  Passing the per-request options lets a prefix
        given for a single call take effect.

        It returns the (cleaned) url and a tuple of component parts.
        """
        if opts is None:
            opts = self.opts
        if opts.prefix:
            p = opts.prefix
            # startswith() rather than url[0] so an empty url does not
            # raise IndexError
            if p[-1] == '/' or url.startswith('/'): url = p + url
            else: url = p + '/' + url

        (scheme, host, path, parm, query, frag) = \
                                             urlparse.urlparse(url)
        if not scheme:
            # no scheme: treat the url as a local path
            if not url.startswith('/'): url = os.path.abspath(url)
            url = 'file:' + url
            (scheme, host, path, parm, query, frag) = \
                                                 urlparse.urlparse(url)
        path = os.path.normpath(path)
        if scheme in ['http', 'https']: path = urllib.quote(path)
        if '@' in host and auth_handler and scheme in ['http', 'https']:
            # Both splits are guarded by an "in" test, so each yields
            # exactly two pieces.
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
            else:
                # "user@host" without a password: use an empty password
                # instead of leaving user/password unbound
                user, password = user_pass, ''
            if DEBUG: print('adding HTTP auth: %s, %s' % (user, password))
            auth_handler.add_password(None, host, user, password)
        parts = (scheme, host, path, parm, query, frag)
        url = urlparse.urlunparse(parts)
        return url, parts

    def _make_callback(self, callback_obj):
        """Normalize a callback spec to a (function, args, kwargs) triple.

        A bare callable gets empty args and kwargs; anything else is
        returned unchanged and is expected to already be such a triple.
        """
        if callable(callback_obj):
            return callback_obj, (), {}
        else:
            return callback_obj

    def _invoke_callback(self, callback_obj, obj):
        """Call the callback described by callback_obj with obj first."""
        cb_func, cb_args, cb_kwargs = self._make_callback(callback_obj)
        cb_func(obj, *cb_args, **cb_kwargs)

700

701

# create the default URLGrabber used by urlXXX functions.

702

# NOTE: actual defaults are set in URLGrabberOptions

703

# Built with no kwargs, so its options are exactly the values assigned in
# URLGrabberOptions._set_defaults().
default_grabber = URLGrabber()

704

705

class URLGrabberFileObject:

706

"""This is a file-object wrapper that supports progress objects

707

and throttling.

708

709

This exists to solve the following problem: lets say you want to

710

drop-in replace a normal open with urlopen. You want to use a

711

progress meter and/or throttling, but how do you do that without

712

rewriting your code? Answer: urlopen will return a wrapped file

713

object that does the progress meter and-or throttling internally.

714

"""

715

716

def __init__(self, url, filename, opts):
    """Record the request, reset the read state and open the url.

    url      -- the url to fetch
    filename -- local file name, or None when no file is written
    opts     -- options object for this request

    The connection is opened right away through _do_open(), so whatever
    that raises propagates out of the constructor.
    """
    # what to fetch and how
    self.url, self.filename, self.opts = url, filename, opts
    # underlying file object and opener, filled in by _do_open()
    self.fo = None
    self._opener = None
    # read buffer
    self._rbuf = ''
    self._rbufsize = 8 * 1024
    # transfer bookkeeping
    self._ttime = time.time()
    self._tsize = 0
    self._amount_read = 0
    self._do_open()

728

729

def __getattr__(self, name):
    """Fall back to the wrapped file object for unknown attributes.

    This effectively wraps self.fo at the instance level: any attribute
    (methods included) not found on this object is looked up on self.fo.
    Raises AttributeError when self.fo does not have it either.
    """
    wrapped = self.fo
    if not hasattr(wrapped, name):
        raise AttributeError(name)
    return getattr(wrapped, name)

736

737

def _get_opener(self):
    """Build a urllib2 OpenerDirector based on request options.

    A caller-supplied opts.opener is returned untouched.  Otherwise the
    opener is built on first use, stored in self._opener and returned
    from there afterwards.
    """
    if self.opts.opener:
        return self.opts.opener
    if self._opener is not None:
        return self._opener

    use_keepalive = (keepalive_handler and self.opts.keepalive)
    use_range = (range_handlers and
                 (self.opts.range or self.opts.reget))
    chain = []
    # if you specify a ProxyHandler when creating the opener
    # it _must_ come before all other handlers in the list or urllib2
    # chokes.
    if self.opts.proxies:
        chain.append(CachedProxyHandler(self.opts.proxies))

        # -------------------------------------------------------
        # Kludge for what looks like a bug in python 2.2's urllib2.
        # Default handlers get applied first.  If you override one
        # (like a proxy handler), the default gets pulled but the
        # replacement goes on the end, so the normal handler picks
        # the request up first and the proxy isn't used.  This
        # probably only happened with ftp or non-keepalive http.
        # The simple fix is to override the other conflicting
        # defaults as well.  The problem isn't there after 2.2.
        # -MDS 2005/02/24
        if not use_keepalive:
            chain.append(urllib2.HTTPHandler())
        if not use_range:
            chain.append(urllib2.FTPHandler())
        # -------------------------------------------------------

    if use_keepalive:
        chain.append(keepalive_handler)
    if use_range:
        chain.extend(range_handlers)
    chain.append(auth_handler)

    if self.opts.cache_openers:
        built = CachedOpenerDirector(*chain)
    else:
        built = urllib2.build_opener(*chain)
    self._opener = built
    # OK, I don't like to do this, but otherwise, we end up with
    # TWO user-agent headers.
    built.addheaders = []
    return built

786

787

def _do_open(self):
    """Issue the request and store the response in self.fo / self.hdr.

    Steps:
      1) build the request, with the user-agent/extra headers and any
         Range header for byte ranges or reget
      2) in 'check_timestamp' reget mode, compare the server's
         last-modified stamp with the local file's mtime; when the server
         copy is newer, or the stamp cannot be determined, drop the
         partial request and fetch the whole file again
      3) when no progress object, throttle or timeout is in use, bind
         read/readline straight to the underlying file object; otherwise,
         if there is a progress object, start it

    Errors from _make_request() propagate to the caller.
    """
    opener = self._get_opener()

    req = urllib2.Request(self.url)  # build request object
    self._add_headers(req)           # add misc headers that we need
    self._build_range(req)           # take care of reget and byterange stuff

    fo, hdr = self._make_request(req, opener)
    if self.reget_time and self.opts.reget == 'check_timestamp':
        # do this if we have a local file with known timestamp AND
        # we're in check_timestamp reget mode.
        fetch_again = 0
        try:
            modified_tuple = hdr.getdate_tz('last-modified')
            modified_stamp = rfc822.mktime_tz(modified_tuple)
            if modified_stamp > self.reget_time: fetch_again = 1
        except (TypeError,):
            # presumably what a missing or unparsable last-modified
            # header ends up as -- treat it as "cannot trust the
            # partial file"
            fetch_again = 1

        if fetch_again:
            # the server version is newer than the (incomplete) local
            # version, so we should abandon the version we're getting
            # and fetch the whole thing again.
            fo.close()
            self.opts.reget = None
            # _build_range() only adds a Range header when
            # range_tuple_to_header() returned something, so the header
            # may be absent here; an unconditional del would raise
            # KeyError.
            if 'Range' in req.headers:
                del req.headers['Range']
            self._build_range(req)
            fo, hdr = self._make_request(req, opener)

    (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
    if not (self.opts.progress_obj or self.opts.raw_throttle()
            or self.opts.timeout):
        # if we're not using the progress_obj, throttling, or timeout
        # we can get a performance boost by going directly to
        # the underlying fileobject for reads.
        self.read = fo.read
        if hasattr(fo, 'readline'):
            self.readline = fo.readline
    elif self.opts.progress_obj:
        # length stays None when the header is missing or not a number
        try:
            length = int(hdr['Content-Length'])
        except (KeyError, ValueError, TypeError, AttributeError):
            length = None
        self.opts.progress_obj.start(str(self.filename), self.url,
                                     os.path.basename(path),
                                     length,
                                     text=self.opts.text)
        self.opts.progress_obj.update(0)
    (self.fo, self.hdr) = (fo, hdr)

834

835

def _add_headers(self, req):

836

if self.opts.user_agent:

837

req.add_header('User-agent', self.opts.user_agent)

838

try: req_type = req.get_type()

839

except ValueError: req_type = None

840

if self.opts.http_headers and req_type in ('http', 'https'):

841

for h, v in self.opts.http_headers:

842

req.add_header(h, v)

843

if self.opts.ftp_headers and req_type == 'ftp':

844

for h, v in self.opts.ftp_headers:

845

req.add_header(h, v)

846

847

def _build_range(self, req):
    """Add a Range header to 'req' for reget and/or byterange requests.

    self.reget_time and self.append are reset first.  Then:

      * if range support is available, reget is enabled and
        self.filename is a string naming an existing file, the file's
        mtime is stored in self.reget_time, self.append is set, and
        the request starts after the bytes already on disk;
      * if opts.range is set, that range is used, with its start
        moved forward by the number of bytes already on disk.

    Raises URLGrabError(10) if a byte range was requested but range
    support is unavailable.
    """
    self.reget_time = None
    self.append = 0
    reget_length = 0
    rt = None
    if have_range and self.opts.reget and type(self.filename) == type(''):
        # we have reget turned on and we're dumping to a file
        try:
            s = os.stat(self.filename)
        except OSError:
            # no local file yet: nothing to resume
            pass
        else:
            self.reget_time = s[ST_MTIME]
            reget_length = s[ST_SIZE]
            rt = reget_length, ''
            self.append = 1

    if self.opts.range:
        if not have_range:
            raise URLGrabError(10, _('Byte range requested but range '\
                                     'support unavailable'))
        rt = self.opts.range
        # Skip what is already on disk.  A range that starts at 0 (or
        # has an empty start) must be shifted too; otherwise the bytes
        # we already have are requested again and then appended to
        # the local file.
        if rt[0] or reget_length:
            rt = ((rt[0] or 0) + reget_length, rt[1])

    if rt:
        header = range_tuple_to_header(rt)
        if header: req.add_header('Range', header)

874

875

def _make_request(self, req, opener):

876

try:

877

if have_socket_timeout and self.opts.timeout:

878

old_to = socket.getdefaulttimeout()

879

socket.setdefaulttimeout(self.opts.timeout)

880

try:

881

fo = opener.open(req)

882

finally:

883

socket.setdefaulttimeout(old_to)

884

else:

885

fo = opener.open(req)

886

hdr = fo.info()

887

except ValueError, e:

888

raise URLGrabError(1, _('Bad URL: %s') % (e, ))

889

except RangeError, e:

890

raise URLGrabError(9, _('%s') % (e, ))

891

except IOError, e:

892

if hasattr(e, 'reason') and have_socket_timeout and \

893

isinstance(e.reason, TimeoutError):

894

raise URLGrabError(12, _('Timeout: %s') % (e, ))

895

else:

896

raise URLGrabError(4, _('IOError: %s') % (e, ))

897

except OSError, e:

898

raise URLGrabError(5, _('OSError: %s') % (e, ))

899

except HTTPException, e:

900

raise URLGrabError(7, _('HTTP Error (%s): %s') % \

901

(e.__class__.__name__, e))

902

else:

903

return (fo, hdr)

904

905

def _do_grab(self):
    """dump the file to self.filename.

    The data is read through self.read in 8kB blocks and written to
    self.filename, which is opened for appending when self.append is
    set and truncated otherwise.  Afterwards the local file's times
    are set from the Last-Modified header when that header yields a
    usable date.

    Returns the number of bytes read.
    """
    if self.append: mode = 'ab'
    else: mode = 'wb'
    new_fo = open(self.filename, mode)
    bs = 1024*8
    size = 0

    # close the output file even if the transfer or the write fails
    try:
        block = self.read(bs)
        size = size + len(block)
        while block:
            new_fo.write(block)
            block = self.read(bs)
            size = size + len(block)
    finally:
        new_fo.close()

    try:
        modified_tuple = self.hdr.getdate_tz('last-modified')
        modified_stamp = rfc822.mktime_tz(modified_tuple)
        os.utime(self.filename, (modified_stamp, modified_stamp))
    except (TypeError,):
        # no usable Last-Modified date; keep the current file times
        pass

    return size

927

928

def _fill_buffer(self, amt=None):

929

"""fill the buffer to contain at least 'amt' bytes by reading

930

from the underlying file object. If amt is None, then it will

931

read until it gets nothing more. It updates the progress meter

932

and throttles after every self._rbufsize bytes."""

933

# the _rbuf test is only in this first 'if' for speed. It's not

934

# logically necessary

935

if self._rbuf and not amt is None:

936

L = len(self._rbuf)

937

if amt > L:

938

amt = amt - L

939

else:

940

return

941

942

# if we've made it here, then we don't have enough in the buffer

943

# and we need to read more.

944

945

buf = [self._rbuf]

946

bufsize = len(self._rbuf)

947

while amt is None or amt:

948

# first, delay if necessary for throttling reasons

949

if self.opts.raw_throttle():

950

diff = self._tsize/self.opts.raw_throttle() - \

951

(time.time() - self._ttime)

952

if diff > 0: time.sleep(diff)

953

self._ttime = time.time()

954

955

# now read some data, up to self._rbufsize

956

if amt is None: readamount = self._rbufsize

957

else: readamount = min(amt, self._rbufsize)

958

try:

959

new = self.fo.read(readamount)

960

except socket.error, e:

961

raise URLGrabError(4, _('Socket Error: %s') % (e, ))

962

except TimeoutError, e:

963

raise URLGrabError(12, _('Timeout: %s') % (e, ))

964

newsize = len(new)

965

if not newsize: break # no more to read

966

967

if amt: amt = amt - newsize

968

buf.append(new)

969

bufsize = bufsize + newsize

970

self._tsize = newsize

971

self._amount_read = self._amount_read + newsize

972

if self.opts.progress_obj:

973

self.opts.progress_obj.update(self._amount_read)

974

975

self._rbuf = string.join(buf, '')

976

return

977

978

def read(self, amt=None):
    """Return up to 'amt' bytes and remove them from the buffer.

    With amt=None (the default) or a negative amt, everything that
    can still be read is returned.  Fewer than 'amt' bytes are
    returned only when the underlying file object runs out of data.
    """
    if amt is not None and amt < 0:
        # file-object convention: a negative size means "read it all".
        # Used as a slice bound, a negative amt would instead cut the
        # tail off the returned data and leave it in the buffer.
        amt = None
    self._fill_buffer(amt)
    if amt is None:
        s, self._rbuf = self._rbuf, ''
    else:
        s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
    return s

985

986

def readline(self, limit=-1):
    """Return the next line, including its trailing newline.

    The buffer is refilled, self._rbufsize bytes at a time, until it
    contains a newline, until it holds at least 'limit' bytes (when
    limit is positive), or until no more data arrives.  A
    non-negative 'limit' caps the length of the returned string; a
    negative limit (the default) means no cap.  An empty string is
    returned when no data is left.
    """
    i = self._rbuf.find('\n')
    while i < 0 and not (0 < limit <= len(self._rbuf)):
        L = len(self._rbuf)
        self._fill_buffer(L + self._rbufsize)
        if not len(self._rbuf) > L: break # nothing more to read
        i = self._rbuf.find('\n', L)

    if i < 0: i = len(self._rbuf)
    else: i = i+1
    # limit is only an upper bound: it must not push the cut past a
    # newline that was found earlier in the buffer
    if 0 <= limit < i: i = limit

    s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
    return s

1000

1001

def close(self):
    """Finish the progress meter (if any) and close the file object.

    When opts.close_connection is set, the file object's
    close_connection method is called as well; any failure of that
    call is ignored.
    """
    opts = self.opts
    if opts.progress_obj:
        opts.progress_obj.end(self._amount_read)

    stream = self.fo
    stream.close()
    if not opts.close_connection:
        return
    # best effort only: errors from close_connection are swallowed
    try: stream.close_connection()
    except: pass

1008

1009

# list of (handlers tuple, opener) pairs built so far
_handler_cache = []
def CachedOpenerDirector(*handlers):
    """Return an opener for 'handlers', reusing a cached one if possible.

    If an opener was already built for an equal tuple of handlers, it
    is re-registered as the parent of each of its handlers and
    returned.  Otherwise a new opener is built with
    urllib2.build_opener, remembered, and returned.
    """
    for (known_handlers, cached_opener) in _handler_cache:
        if known_handlers == handlers:
            # NOTE(review): presumably handlers can be shared between
            # openers, so the parent link is refreshed - confirm
            for h in cached_opener.handlers:
                h.add_parent(cached_opener)
            return cached_opener

    fresh_opener = urllib2.build_opener(*handlers)
    _handler_cache.append( (handlers, fresh_opener) )
    return fresh_opener

1019

1020

# list of (proxies, handler) pairs built so far
_proxy_cache = []
def CachedProxyHandler(proxies):
    """Return a urllib2.ProxyHandler for 'proxies'.

    A handler created earlier for an equal 'proxies' value is reused;
    otherwise a new one is created and remembered.
    """
    for (known_proxies, cached_handler) in _proxy_cache:
        if known_proxies == proxies:
            return cached_handler

    fresh_handler = urllib2.ProxyHandler(proxies)
    _proxy_cache.append( (proxies, fresh_handler) )
    return fresh_handler

1029

1030

#####################################################################

1031

# DEPRECATED FUNCTIONS

1032

def set_throttle(new_throttle):
    """Deprecated. Use: default_grabber.throttle = new_throttle

    Sets the 'throttle' attribute of the module-level default_grabber.
    """
    setattr(default_grabber, 'throttle', new_throttle)

1035

1036

def set_bandwidth(new_bandwidth):
    """Deprecated. Use: default_grabber.bandwidth = new_bandwidth

    Sets the 'bandwidth' attribute of the module-level default_grabber.
    """
    setattr(default_grabber, 'bandwidth', new_bandwidth)

1039

1040

def set_progress_obj(new_progress_obj):
    """Deprecated. Use: default_grabber.progress_obj = new_progress_obj

    Sets the 'progress_obj' attribute of the module-level
    default_grabber.
    """
    setattr(default_grabber, 'progress_obj', new_progress_obj)

1043

1044

def set_user_agent(new_user_agent):
    """Deprecated. Use: default_grabber.user_agent = new_user_agent

    Sets the 'user_agent' attribute of the module-level
    default_grabber.
    """
    setattr(default_grabber, 'user_agent', new_user_agent)

1047

1048

def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """Deprecated. Use: urlgrab() with the retry arg instead

    Thin wrapper that forwards every argument to urlgrab as a keyword
    argument; 'numtries' is passed on under the name 'retry'.
    Returns whatever urlgrab returns.
    """
    return urlgrab(url, filename,
                   copy_local=copy_local,
                   close_connection=close_connection,
                   progress_obj=progress_obj,
                   throttle=throttle,
                   bandwidth=bandwidth,
                   retry=numtries,
                   retrycodes=retrycodes,
                   checkfunc=checkfunc)

1062

1063

1064

#####################################################################

1065

# TESTING

1066

def _main_test():

1067

import sys

1068

try: url, filename = sys.argv[1:3]

1069

except ValueError:

1070

print 'usage:', sys.argv[0], \

1071

'<url> <filename> [copy_local=0|1] [close_connection=0|1]'

1072

sys.exit()

1073

1074

kwargs = {}

1075

for a in sys.argv[3:]:

1076

k, v = string.split(a, '=', 1)

1077

kwargs[k] = int(v)

1078

1079

set_throttle(1.0)

1080

set_bandwidth(32 * 1024)

1081

print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,

1082

default_grabber.bandwidth)

1083

1084

try: from progress import text_progress_meter

1085

except ImportError, e: pass

1086

else: kwargs['progress_obj'] = text_progress_meter()

1087

1088

try: name = apply(urlgrab, (url, filename), kwargs)

1089

except URLGrabError, e: print e

1090

else: print 'LOCAL FILE:', name

1091

1092

1093

def _retry_test():

1094

import sys

1095

try: url, filename = sys.argv[1:3]

1096

except ValueError:

1097

print 'usage:', sys.argv[0], \

1098

'<url> <filename> [copy_local=0|1] [close_connection=0|1]'

1099

sys.exit()

1100

1101

kwargs = {}

1102

for a in sys.argv[3:]:

1103

k, v = string.split(a, '=', 1)

1104

kwargs[k] = int(v)

1105

1106

try: from progress import text_progress_meter

1107

except ImportError, e: pass

1108

else: kwargs['progress_obj'] = text_progress_meter()

1109

1110

global DEBUG

1111

#DEBUG = 1

1112

def cfunc(filename, hello, there='foo'):

1113

print hello, there

1114

import random

1115

rnum = random.random()

1116

if rnum < .5:

1117

print 'forcing retry'

1118

raise URLGrabError(-1, 'forcing retry')

1119

if rnum < .75:

1120

print 'forcing failure'

1121

raise URLGrabError(-2, 'forcing immediate failure')

1122

print 'success'

1123

return

1124

1125

close_all()

1126

kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})

1127

try: name = apply(retrygrab, (url, filename), kwargs)

1128

except URLGrabError, e: print e

1129

else: print 'LOCAL FILE:', name

1130

1131

def _file_object_test(filename=None):

1132

import random, cStringIO, sys

1133

if filename is None:

1134

filename = __file__

1135

print 'using file "%s" for comparisons' % filename

1136

fo = open(filename)

1137

s_input = fo.read()

1138

fo.close()

1139

1140

for testfunc in [_test_file_object_smallread,

1141

_test_file_object_readall,

1142

_test_file_object_readline,

1143

_test_file_object_readlines]:

1144

fo_input = cStringIO.StringIO(s_input)

1145

fo_output = cStringIO.StringIO()

1146

wrapper = URLGrabberFileObject(fo_input, None, 0)

1147

print 'testing %-30s ' % testfunc.__name__,

1148

testfunc(wrapper, fo_output)

1149

s_output = fo_output.getvalue()

1150

if s_output == s_input: print 'passed'

1151

else: print 'FAILED'

1152

1153

def _test_file_object_smallread(wrapper, fo_output):

1154

while 1:

1155

s = wrapper.read(23)

1156

fo_output.write(s)

1157

if not s: return

1158

1159

def _test_file_object_readall(wrapper, fo_output):

1160

s = wrapper.read()

1161

fo_output.write(s)

1162

1163

def _test_file_object_readline(wrapper, fo_output):

1164

while 1:

1165

s = wrapper.readline()

1166

fo_output.write(s)

1167

if not s: return

1168

1169

def _test_file_object_readlines(wrapper, fo_output):

1170

li = wrapper.readlines()

1171

fo_output.write(string.join(li, ''))

1172

1173

if __name__ == '__main__':
    # run the self-tests in order: plain grab, retry handling, then
    # the file-object wrapper checked against the file named 'test'
    for _selftest, _selftest_args in ((_main_test, ()),
                                      (_retry_test, ()),
                                      (_file_object_test, ('test',))):
        _selftest(*_selftest_args)

1177