~bzr-pqm/bzr/bzr.dev : revision 2145.1.1

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

16

17

"""Implementation of urllib2 tailored to bzr needs

18

19

This file re-implements the urllib2 class hierarchy with custom classes.

20

21

For instance, we create a new HTTPConnection and HTTPSConnection that inherit

22

from the original urllib2.HTTP(s)Connection objects, but also have a new base

23

which implements a custom getresponse and fake_close handlers.

24

25

And then we implement custom HTTPHandler and HTTPSHandler classes, that use

26

the custom HTTPConnection classes.

27

28

We have a custom Response class, which lets us maintain a keep-alive

29

connection even for requests that urllib2 doesn't expect to contain body data.

30

31

And a custom Request class that lets us track redirections, and send

32

authentication data without requiring an extra round trip to get rejected by

33

the server. We also create a Request hierarchy, to make it clear what type

34

of request is being made.

35

"""

36

37

# Module-wide debug verbosity. It is propagated to every connection
# (ConnectionHandler.capture_connection), to AbstractHTTPHandler and to
# HTTPRedirectHandler; values > 0 enable their debug prints and values
# >= 2 additionally make Opener pretty-print its internal state.
DEBUG = 0

38

39

# TODO: It may be possible to share the password_manager across

40

# all transports by prefixing the realm by the protocol used

41

# (especially if other protocols do not use realms). See

42

# PasswordManager below.

43

44

# FIXME: Oversimplifying, two kinds of exceptions should be

45

# raised, once a request is issued: URLError before we have been

46

# able to process the response, HTTPError after that. Process the

47

# response means we are able to leave the socket clean, so if we

48

# are not able to do that, we should close the connection. The

49

# actual code more or less do that, tests should be written to

50

# ensure that.

51

52

import httplib

53

import socket

54

import urllib

55

import urllib2

56

import urlparse

57

import sys

58

59

from bzrlib import __version__ as bzrlib_version

60

from bzrlib import errors

61

62

63

# We define our own Response class to keep our httplib pipe clean

64

class Response(httplib.HTTPResponse):

65

"""Custom HTTPResponse, to avoid the need to decorate.

66

67

httplib prefers to decorate the returned objects, rather

68

than using a custom object.

69

"""

70

71

# Some responses have bodies in which we have no interest

72

_body_ignored_responses = [301,302, 303, 307, 401, 403, 404]

73

74

def __init__(self, *args, **kwargs):

75

httplib.HTTPResponse.__init__(self, *args, **kwargs)

76

77

def begin(self):

78

"""Begin to read the response from the server.

79

80

httplib assumes that some responses get no content and do

81

not even attempt to read the body in that case, leaving

82

the body in the socket, blocking the next request. Let's

83

try to workaround that.

84

"""

85

httplib.HTTPResponse.begin(self)

86

if self.status in self._body_ignored_responses:

87

if self.debuglevel > 0:

88

print "For status: [%s]," % self.status,

89

print "will ready body, length: ",

90

if self.length is not None:

91

print "[%d]" % self.length

92

else:

93

print "None"

94

if not (self.length is None or self.will_close):

95

# In some cases, we just can't read the body not

96

# even try or we may encounter a 104, 'Connection

97

# reset by peer' error if there is indeed no body

98

# and the server closed the connection just after

99

# having issued the response headers (even if the

100

# headers indicate a Content-Type...)

101

body = self.fp.read(self.length)

102

if self.debuglevel > 0:

103

print "Consumed body: [%s]" % body

104

self.close()

105

106

107

# Not inheriting from 'object' because httplib.HTTPConnection doesn't.

108

class AbstractHTTPConnection:
    """Base for our HTTP(S) connections, able to reset itself on a bad response"""

    response_class = Response
    strict = 1 # We don't support HTTP/0.9

    def fake_close(self):
        """Run close() on the connection without giving it the socket.

        The socket is detached while close() runs and re-attached
        afterwards, so the connection believes the response has been
        fully handled (which keeps httplib.HTTPConnection happy) while
        self.sock ends up holding the same object as before.
        """
        # Detach the socket for the duration of close()
        kept_socket, self.sock = self.sock, None
        self.close()
        self.sock = kept_socket

124

125

126

class HTTPConnection(AbstractHTTPConnection, httplib.HTTPConnection):
    """httplib.HTTPConnection combined with AbstractHTTPConnection."""

128

129

130

class HTTPSConnection(AbstractHTTPConnection, httplib.HTTPSConnection):
    """httplib.HTTPSConnection combined with AbstractHTTPConnection."""

132

133

134

class Request(urllib2.Request):

135

"""A custom Request object.

136

137

urllib2 determines the request method heuristically (based on

138

the presence or absence of data). We set the method

139

statically.

140

141

Also, the Request object tracks the connection the request will

142

be made on.

143

"""

144

145

def __init__(self, method, url, data=None, headers={},

146

origin_req_host=None, unverifiable=False,

147

connection=None, parent=None,):

148

# urllib2.Request will be confused if we don't extract

149

# authentification info before building the request

150

url, self.user, self.password = self.extract_auth(url)

151

urllib2.Request.__init__(self, url, data, headers,

152

origin_req_host, unverifiable)

153

self.method = method

154

self.connection = connection

155

# To handle redirections

156

self.parent = parent

157

self.redirected_to = None

158

159

def extract_auth(self, url):

160

"""Extracts authentification information from url.

161

162

Get user and password from url of the form: http://user:pass@host/path

163

"""

164

scheme, netloc, path, query, fragment = urlparse.urlsplit(url)

165

166

if '@' in netloc:

167

auth, netloc = netloc.split('@', 1)

168

if ':' in auth:

169

user, password = auth.split(':', 1)

170

else:

171

user, password = auth, None

172

user = urllib.unquote(user)

173

if password is not None:

174

password = urllib.unquote(password)

175

else:

176

user = None

177

password = None

178

179

url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

180

181

return url, user, password

182

183

def get_method(self):

184

return self.method

185

186

187

# The urllib2.xxxAuthHandler handle the authentification of the

188

# requests, to do that, they need an urllib2 PasswordManager *at

189

# build time*. We also need one to reuse the passwords already

190

# typed by the user.

191

class PasswordManager(urllib2.HTTPPasswordMgrWithDefaultRealm):

192

193

def __init__(self):

194

urllib2.HTTPPasswordMgrWithDefaultRealm.__init__(self)

195

196

197

class ConnectionHandler(urllib2.BaseHandler):

198

"""Provides connection-sharing by pre-processing requests.

199

200

urllib2 provides no way to access the HTTPConnection object

201

internally used. But we need it in order to achieve

202

connection sharing. So, we add it to the request just before

203

it is processed, and then we override the do_open method for

204

http[s] requests.

205

"""

206

207

handler_order = 1000 # after all pre-processings

208

209

def get_key(self, connection):

210

"""Returns the key for the connection in the cache"""

211

return '%s:%d' % (connection.host, connection.port)

212

213

def create_connection(self, request, http_connection_class):

214

host = request.get_host()

215

if not host:

216

# Just a bit of paranoia here, this should have been

217

# handled in the higher levels

218

raise errors.InvalidURL(request.get_full_url(), 'no host given.')

219

220

# We create a connection (but it will not connect yet)

221

try:

222

connection = http_connection_class(host)

223

except httplib.InvalidURL, exception:

224

# There is only one occurrence of InvalidURL in httplib

225

raise errors.InvalidURL(request.get_full_url(),

226

extra='nonnumeric port')

227

228

return connection

229

230

def capture_connection(self, request, http_connection_class):

231

"""Capture or inject the request connection.

232

233

Two cases:

234

- the request have no connection: create a new one,

235

236

- the request have a connection: this one have been used

237

already, let's capture it, so that we can give it to

238

another transport to be reused. We don't do that

239

ourselves: the Transport object get the connection from

240

a first request and then propagate it, from request to

241

request or to cloned transports.

242

"""

243

connection = request.connection

244

if connection is None:

245

# Create a new one

246

connection = self.create_connection(request, http_connection_class)

247

request.connection = connection

248

249

# All connections will pass here, propagate debug level

250

connection.set_debuglevel(DEBUG)

251

return request

252

253

def http_request(self, request):

254

return self.capture_connection(request, HTTPConnection)

255

256

def https_request(self, request):

257

return self.capture_connection(request, HTTPSConnection)

258

259

260

class AbstractHTTPHandler(urllib2.AbstractHTTPHandler):
    """A custom handler for HTTP(S) requests.

    We override urllib2.AbstractHTTPHandler to get a better
    control of the connection, the ability to implement new
    request types and return a response able to cope with
    persistent connections.
    """

    # We change our order to be before urllib2 HTTP[S]Handlers
    # and be chosen instead of them (the first http_open called
    # wins).
    handler_order = 400

    # Headers merged into every request by http_request
    _default_headers = {'Pragma': 'no-cache',
                        'Cache-control': 'max-age=0',
                        'Connection': 'Keep-Alive',
                        # FIXME: Spell it User-*A*gent once we
                        # know how to properly avoid bogus
                        # urllib2 using capitalize() for headers
                        # instead of title(sp?).
                        'User-agent': 'bzr/%s (urllib)' % bzrlib_version,
                        # FIXME: pycurl also set the following, understand why
                        'Accept': '*/*',
                        }

    def __init__(self):
        """Initialize the handler with the module-wide DEBUG level."""
        urllib2.AbstractHTTPHandler.__init__(self, debuglevel=DEBUG)

    def http_request(self, request):
        """Common headers setting

        The default headers are written over the request headers, so
        a header already present under the same key is replaced.

        :return: the request itself.
        """

        request.headers.update(self._default_headers.copy())
        # FIXME: We may have to add the Content-Length header if
        # we have data to send.
        return request

    def retry_or_raise(self, http_class, request, first_try):
        """Retry the request (once) or raise the exception.

        urllib2 raises exception of application level kind, we
        just have to translate them.

        httplib can raise exceptions of transport level (badly
        formatted dialog, loss of connection or socket level
        problems). In that case we should issue the request again
        (httplib will close and reopen a new connection if
        needed).

        This reads the exception being handled from sys.exc_info(),
        so it must be called from inside an except block.

        :param http_class: the connection class, handed back to
            do_open on retry.
        :param request: the request that failed.
        :param first_try: True if the request has not been retried
            yet.
        :return: a (response, convert_to_addinfourl) tuple; the flag
            is always False since the response comes from do_open.
        :raises errors.ConnectionError: if the host cannot be
            resolved, or on a second failure of any kind other than
            the following.
        :raises errors.InvalidHttpResponse: on a second failure due
            to httplib.BadStatusLine or httplib.UnknownProtocol.
        """
        # When an exception occurs, we give back the original
        # Traceback or the bugs are hard to diagnose.
        exc_type, exc_val, exc_tb = sys.exc_info()
        if exc_type == socket.gaierror:
            # No need to retry, that will not help
            raise errors.ConnectionError("Couldn't resolve host '%s'"
                                         % request.get_origin_req_host(),
                                         orig_error=exc_val)
        else:
            if first_try:
                if self._debuglevel > 0:
                    print 'Received exception: [%r]' % exc_val
                    print ' On connection: [%r]' % request.connection
                    method = request.get_method()
                    url = request.get_full_url()
                    print ' Will retry, %s %r' % (method, url)
                # Drop the connection before issuing the request again
                request.connection.close()
                response = self.do_open(http_class, request, False)
                # do_open has already built its final return value
                convert_to_addinfourl = False
            else:
                if self._debuglevel > 0:
                    print 'Received second exception: [%r]' % exc_val
                    print ' On connection: [%r]' % request.connection
                if exc_type in (httplib.BadStatusLine, httplib.UnknownProtocol):
                    # httplib.BadStatusLine and
                    # httplib.UnknownProtocol indicates that a
                    # bogus server was encountered or a bad
                    # connection (i.e. transient errors) is
                    # experimented, we have already retried once
                    # for that request so we raise the exception.
                    my_exception = errors.InvalidHttpResponse(
                        request.get_full_url(),
                        'Bad status line received',
                        orig_error=exc_val)
                else:
                    # All other exception are considered connection related.

                    # httplib.HTTPException should indicate a bug
                    # in the urllib implementation, somehow the
                    # httplib pipeline is in an incorrect state,
                    # we retry in hope that this will correct the
                    # problem but that may need investigation
                    # (note that no such bug is known as of
                    # 20061005 --vila).

                    # socket errors generally occurs for reasons
                    # far outside our scope, so closing the
                    # connection and retrying is the best we can
                    # do.

                    # FIXME: and then there is HTTPError raised by:
                    # - HTTPDefaultErrorHandler (we define our own)
                    # - HTTPRedirectHandler.redirect_request
                    # - AbstractDigestAuthHandler.http_error_auth_reqed

                    my_exception = errors.ConnectionError(
                        msg= 'while sending %s %s:' % (request.get_method(),
                                                       request.get_selector()),
                        orig_error=exc_val)

                if self._debuglevel > 0:
                    print 'On connection: [%r]' % request.connection
                    method = request.get_method()
                    url = request.get_full_url()
                    print ' Failed again, %s %r' % (method, url)
                    print ' Will raise: [%r]' % my_exception
                # Raise our exception with the traceback of the original one
                raise my_exception, None, exc_tb
        return response, convert_to_addinfourl

    def do_open(self, http_class, request, first_try=True):
        """See urllib2.AbstractHTTPHandler.do_open for the general idea.

        The request will be retried once if it fails.

        :param http_class: the connection class; only used here to be
            handed to retry_or_raise.
        :param request: the request to send, it must already carry
            its connection.
        :param first_try: False when called again by retry_or_raise.
        :return: an urllib2.addinfourl wrapping the httplib response,
            with its code and msg attributes set from the response
            status and reason.
        """
        connection = request.connection
        assert connection is not None, \
            'Cannot process a request without a connection'

        # Get all the headers
        headers = {}
        headers.update(request.header_items())
        headers.update(request.unredirected_hdrs)

        try:
            connection._send_request(request.get_method(),
                                     request.get_selector(),
                                     # FIXME: implements 100-continue
                                     #None, # We don't send the body yet
                                     request.get_data(),
                                     headers)
            if self._debuglevel > 0:
                print 'Request sent: [%r]' % request
            response = connection.getresponse()
            convert_to_addinfourl = True
        except (socket.gaierror, httplib.BadStatusLine, httplib.UnknownProtocol,
                socket.error, httplib.HTTPException):
            response, convert_to_addinfourl = self.retry_or_raise(http_class,
                                                                  request,
                                                                  first_try)

# FIXME: HTTPConnection does not fully support 100-continue (the
# server responses are just ignored)

#        if code == 100:
#            mutter('Will send the body')
#            # We can send the body now
#            body = request.get_data()
#            if body is None:
#                raise URLError("No data given")
#            connection.send(body)
#            response = connection.getresponse()

        if self._debuglevel > 0:
            print 'Receives response: %r' % response
            print ' For: %r(%r)' % (request.get_method(),
                                    request.get_full_url())

        if convert_to_addinfourl:
            # Shamelessly copied from urllib2
            req = request
            r = response
            # socket._fileobject reads through recv(), map it to read()
            r.recv = r.read
            fp = socket._fileobject(r)
            resp = urllib2.addinfourl(fp, r.msg, req.get_full_url())
            resp.code = r.status
            resp.msg = r.reason
            if self._debuglevel > 0:
                print 'Create addinfourl: %r' % resp
                print ' For: %r(%r)' % (request.get_method(),
                                        request.get_full_url())
        else:
            # The response comes from a retry and is already wrapped
            resp = response
        return resp

442

443

# # we need titled headers in a dict but

444

# # response.getheaders returns a list of (lower(header).

445

# # Let's title that because most of bzr handle titled

446

# # headers, but maybe we should switch to lowercased

447

# # headers...

448

# # jam 20060908: I think we actually expect the headers to

449

# # be similar to mimetools.Message object, which uses

450

# # case insensitive keys. It lowers() all requests.

451

# # My concern is that the code may not do perfect title case.

452

# # For example, it may use Content-type rather than Content-Type

453

#

454

# # When we get rid of addinfourl, we must ensure that bzr

455

# # always use titled headers and that any header received

456

# # from server is also titled.

457

#

458

# headers = {}

459

# for header, value in (response.getheaders()):

460

# headers[header.title()] = value

461

# # FIXME: Implements a secured .read method

462

# response.code = response.status

463

# response.headers = headers

464

# return response

465

466

467

class HTTPHandler(AbstractHTTPHandler):
    """Opens http requests through our HTTPConnection class"""

    def http_open(self, request):
        """Issue request, see AbstractHTTPHandler.do_open."""
        connection_class = HTTPConnection
        return self.do_open(connection_class, request)

472

473

474

class HTTPSHandler(AbstractHTTPHandler):
    """Opens https requests through our HTTPSConnection class"""

    def https_open(self, request):
        """Issue request, see AbstractHTTPHandler.do_open."""
        connection_class = HTTPSConnection
        return self.do_open(connection_class, request)

479

480

481

class HTTPRedirectHandler(urllib2.HTTPRedirectHandler):

482

"""Handles redirect requests.

483

484

We have to implement our own scheme because we use a specific

485

Request object and because we want to implement a specific

486

policy.

487

"""

488

_debuglevel = DEBUG

489

# RFC2616 says that only read requests should be redirected

490

# without interacting with the user. But bzr use some

491

# shortcuts to optimize against roundtrips which can leads to

492

# write requests being issued before read requests of

493

# containing dirs can be redirected. So we redirect write

494

# requests in the same way which seems to respect the spirit

495

# of the RFC if not its letter.

496

497

def redirect_request(self, req, fp, code, msg, headers, newurl):

498

"""See urllib2.HTTPRedirectHandler.redirect_request"""

499

# We would have preferred to update the request instead

500

# of creating a new one, but the urllib2.Request object

501

# has a too complicated creation process to provide a

502

# simple enough equivalent update process. Instead, when

503

# redirecting, we only update the original request with a

504

# reference to the following request in the redirect

505

# chain.

506

507

# Some codes make no sense on out context and are treated

508

# as errors:

509

510

# 300: Multiple choices for different representations of

511

# the URI. Using that mechanisn with bzr will violate the

512

# protocol neutrality of Transport.

513

514

# 304: Not modified (SHOULD only occurs with conditional

515

# GETs which are not used by our implementation)

516

517

# 305: Use proxy. I can't imagine this one occurring in

518

# our context-- vila/20060909

519

520

# 306: Unused (if the RFC says so...)

521

522

# FIXME: If the code is 302 and the request is HEAD, we

523

# MAY avoid following the redirections if the intent is

524

# to check the existence, we have a hint that the file

525

# exist, now if we want to be sure, we must follow the

526

# redirection. Let's do that for now.

527

528

if code in (301, 302, 303, 307):

529

return Request(req.get_method(),newurl,

530

headers = req.headers,

531

origin_req_host = req.get_origin_req_host(),

532

unverifiable = True,

533

# TODO: It will be nice to be able to

534

# detect virtual hosts sharing the same

535

# IP address, that will allow us to

536

# share the same connection...

537

connection = None,

538

parent = req,

539

)

540

else:

541

raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

542

543

def http_error_30x(self, req, fp, code, msg, headers):

544

"""Requests the redirected to URI.

545

546

Copied from urllib2 to be able to fake_close the

547

associated connection, *before* issuing the redirected

548

request but *after* having eventually raised an error.

549

"""

550

# Some servers (incorrectly) return multiple Location headers

551

# (so probably same goes for URI). Use first header.

552

553

# TODO: Once we get rid of addinfourl objects, the

554

# following will need to be updated to use correct case

555

# for headers.

556

if 'location' in headers:

557

newurl = headers.getheaders('location')[0]

558

elif 'uri' in headers:

559

newurl = headers.getheaders('uri')[0]

560

else:

561

return

562

if self._debuglevel > 0:

563

print 'Redirected to: %s' % newurl

564

newurl = urlparse.urljoin(req.get_full_url(), newurl)

565

566

# This call succeeds or raise an error. urllib2 returns

567

# if redirect_request returns None, but our

568

# redirect_request never returns None.

569

redirected_req = self.redirect_request(req, fp, code, msg, headers,

570

newurl)

571

572

# loop detection

573

# .redirect_dict has a key url if url was previously visited.

574

if hasattr(req, 'redirect_dict'):

575

visited = redirected_req.redirect_dict = req.redirect_dict

576

if (visited.get(newurl, 0) >= self.max_repeats or

577

len(visited) >= self.max_redirections):

578

raise urllib2.HTTPError(req.get_full_url(), code,

579

self.inf_msg + msg, headers, fp)

580

else:

581

visited = redirected_req.redirect_dict = req.redirect_dict = {}

582

visited[newurl] = visited.get(newurl, 0) + 1

583

584

# We can close the fp now that we are sure that we won't

585

# use it with HTTPError.

586

fp.close()

587

# We have all we need already in the response

588

req.connection.fake_close()

589

590

return self.parent.open(redirected_req)

591

592

http_error_302 = http_error_303 = http_error_307 = http_error_30x

593

594

def http_error_301(self, req, fp, code, msg, headers):

595

response = self.http_error_30x(req, fp, code, msg, headers)

596

# If one or several 301 response occur during the

597

# redirection chain, we MUST update the original request

598

# to indicate where the URI where finally found.

599

600

original_req = req

601

while original_req.parent is not None:

602

original_req = original_req.parent

603

if original_req.redirected_to is None:

604

# Only the last occurring 301 should be taken

605

# into account i.e. the first occurring here when

606

# redirected_to has not yet been set.

607

original_req.redirected_to = redirected_url

608

return response

609

610

611

class HTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
    """Custom basic authentication handler.

    Intended to send the authentication preemptively, to avoid the
    round trip associated with the 401 error.

    NOTE(review): the http_request hook that would do so is still
    commented out below, so this class currently adds nothing to
    urllib2.HTTPBasicAuthHandler.
    """

#    def http_request(self, request):
#        """Insert an authentication header if information is available"""
#        if request.auth == 'basic' and request.password is not None:
#
#        return request

623

624

625

class HTTPErrorProcessor(urllib2.HTTPErrorProcessor):

626

"""Process HTTP error responses.

627

628

We don't really process the errors, quite the contrary

629

instead, we leave our Transport handle them.

630

"""

631

handler_order = 1000 # after all other processing

632

633

def http_response(self, request, response):

634

code, msg, hdrs = response.code, response.msg, response.info()

635

636

if code not in (200, # Ok

637

206, # Partial content

638

404, # Not found

639

):

640

response = self.parent.error('http', request, response,

641

code, msg, hdrs)

642

return response

643

644

https_response = http_response

645

646

647

class HTTPDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Translate common errors into bzr Exceptions"""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise the bzr exception matching the http error code.

        :raises errors.NoSuchFile: for a 404, the corresponding
            urllib2.HTTPError is given as extra.
        :raises errors.TransportError: for a 403.
        :raises errors.InvalidHttpResponse: for any other code.
        """
        if code == 404:
            # HTTPError lives in urllib2 and is not imported into this
            # module on its own: the bare name used to raise NameError
            # here instead of the intended NoSuchFile.
            raise errors.NoSuchFile(req.get_selector(),
                                    extra=urllib2.HTTPError(
                                        req.get_full_url(),
                                        code, msg,
                                        hdrs, fp))
        elif code == 403:
            raise errors.TransportError('Server refuses to fullfil the request')
        else:
            # TODO: A test is needed to exercise that code path
            raise errors.InvalidHttpResponse(req.get_full_url(),
                                             'Unable to handle http code %d: %s'
                                             % (code, msg))

663

664

class Opener(object):
    """A wrapper around urllib2.build_opener

    Daughter classes can override to build their own specific opener
    """
    # TODO: Provides hooks for daughter classes.

    def __init__(self,
                 connection=ConnectionHandler,
                 redirect=HTTPRedirectHandler,
                 error=HTTPErrorProcessor,):
        """Build the urllib2 opener and expose its open method.

        :param connection: the handler attaching connections to
            requests.
        :param redirect: the handler processing redirections.
        :param error: the processor filtering error responses.
        """
        self.password_manager = PasswordManager()
        # TODO: Implements the necessary wrappers for the handlers
        # still missing from the list below: urllib2.ProxyHandler,
        # urllib2.HTTPDigestAuthHandler(self.password_manager),
        # urllib2.ProxyBasicAuthHandler and
        # urllib2.ProxyDigestAuthHandler
        handlers = [
            connection,
            redirect,
            error,
            urllib2.HTTPBasicAuthHandler(self.password_manager),
            HTTPHandler,
            HTTPSHandler,
            HTTPDefaultErrorHandler,
            ]
        self._opener = urllib2.build_opener(*handlers)
        self.open = self._opener.open
        if DEBUG >= 2:
            # When dealing with handler order, it's easy to mess
            # things up, the following will help understand which
            # handler is used, when and for what.
            import pprint
            pprint.pprint(self._opener.__dict__)

696