1
# This library is free software; you can redistribute it and/or
2
# modify it under the terms of the GNU Lesser General Public
3
# License as published by the Free Software Foundation; either
4
# version 2.1 of the License, or (at your option) any later version.
6
# This library is distributed in the hope that it will be useful,
7
# but WITHOUT ANY WARRANTY; without even the implied warranty of
8
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9
# Lesser General Public License for more details.
11
# You should have received a copy of the GNU Lesser General Public
12
# License along with this library; if not, write to the
13
# Free Software Foundation, Inc.,
14
# 59 Temple Place, Suite 330,
15
# Boston, MA 02111-1307 USA
17
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
18
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
20
"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
23
>>> from keepalive import HTTPHandler
24
>>> keepalive_handler = HTTPHandler()
25
>>> opener = urllib2.build_opener(keepalive_handler)
26
>>> urllib2.install_opener(opener)
28
>>> fo = urllib2.urlopen('http://www.python.org')
30
If a connection to a given host is requested, and all of the existing
31
connections are still in use, another connection will be opened. If
32
the handler tries to use an existing connection but it fails in some
33
way, it will be closed and removed from the pool.
35
To remove the handler, simply re-run build_opener with no arguments, and
38
You can explicitly close connections by using the close_connection()
39
method of the returned file-like object (described below) or you can
40
use the handler methods:
42
close_connection(host)
46
NOTE: using the close_connection and close_all methods of the handler
47
should be done with care when using multiple threads.
48
* there is nothing that prevents another thread from creating new
49
connections immediately after connections are closed
50
* no checks are done to prevent in-use connections from being closed
52
>>> keepalive_handler.close_all()
54
EXTRA ATTRIBUTES AND METHODS
56
Upon a status of 200, the object returned has a few additional
57
attributes and methods, which should not be used if you want to
58
remain consistent with the normal urllib2-returned objects:
60
close_connection() - close the connection to the host
61
readlines() - you know, readlines()
62
status - the return status (ie 404)
63
reason - english translation of status (ie 'File not found')
65
If you want the best of both worlds, use this inside an
66
AttributeError-catching try:
68
>>> try: status = fo.status
69
>>> except AttributeError: status = None
71
Unfortunately, these are ONLY there if status == 200, so it's not
72
easy to distinguish between non-200 responses. The reason is that
73
urllib2 tries to do clever things with error codes 301, 302, 401,
74
and 407, and it wraps the object upon return.
76
For python versions earlier than 2.4, you can avoid this fancy error
77
handling by setting the module-level global HANDLE_ERRORS to zero.
78
You see, prior to 2.4, it's the HTTP Handler's job to determine what
79
to handle specially, and what to just pass up. HANDLE_ERRORS == 0
80
means "pass everything up". In python 2.4, however, this job no
81
longer belongs to the HTTP Handler and is now done by a NEW handler,
82
HTTPErrorProcessor. Here's the bottom line:
85
HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
87
HANDLE_ERRORS == 0 pass everything up, error processing is
88
left to the calling code
90
HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
91
HANDLE_ERRORS == 0 (default) pass everything up, let the
92
other handlers (specifically,
93
HTTPErrorProcessor) decide what to do
95
In practice, setting the variable either way makes little difference
96
in python 2.4, so for the most consistent behavior across versions,
97
you probably just want to use the defaults, which will give you
102
# $Id: keepalive.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
110
def DBPRINT(*args):
    """Debug printer: emit all arguments space-joined on stdout.

    Uses the parenthesized single-argument print form, which behaves
    identically under Python 2's print statement and also parses
    under Python 3.
    """
    print(' '.join(args))
113
# Pick the default error-handling mode from the interpreter version:
# before Python 2.4 this handler turns non-200 responses into errors
# itself (HANDLE_ERRORS = 1); from 2.4 on, HTTPErrorProcessor does
# that, so everything is passed up (HANDLE_ERRORS = 0).
# NOTE(review): this span is corrupted — bare line-number artifacts are
# interleaved and the `else:` introducing the string-parsing fallback
# is missing; restore from upstream keepalive.py.
if hasattr(sys, 'version_info'):
114
_python_version = sys.version_info
116
_python_version = map(int, sys.version.split()[0].split('.'))
117
if _python_version < [2, 4]: HANDLE_ERRORS = 1
118
else: HANDLE_ERRORS = 0
120
# ConnectionManager: bookkeeping for the keepalive connection pool.
# NOTE(review): corrupted span — line-number artifacts are interleaved,
# indentation is lost, and several source lines are missing (the
# docstring delimiters and the __init__ header); restore from upstream.
class ConnectionManager:
122
The connection manager must be able to:
123
* keep track of all existing
126
# one lock guards all three maps below (thread is the py2 module)
self._lock = thread.allocate_lock()
127
self._hostmap = {} # map hosts to a list of connections
128
self._connmap = {} # map connections to host
129
self._readymap = {} # map connection to ready state
131
# Register a new connection for `host`, recording its ready state in
# all three maps. NOTE(review): corrupted span — the locking lines
# that should wrap these map updates are missing from this view.
def add(self, host, connection, ready):
134
if not self._hostmap.has_key(host): self._hostmap[host] = []
135
self._hostmap[host].append(connection)
136
self._connmap[connection] = host
137
self._readymap[connection] = ready
141
# Forget `connection` entirely, dropping the host's list once empty.
# NOTE(review): corrupted span — the lock/try lines between the header
# and the map updates are missing from this view.
def remove(self, connection):
145
host = self._connmap[connection]
149
del self._connmap[connection]
150
del self._readymap[connection]
151
self._hostmap[host].remove(connection)
152
if not self._hostmap[host]: del self._hostmap[host]
156
def set_ready(self, connection, ready):
    """Mark `connection` as ready (truthy) or busy (falsy) for reuse.

    A connection that is no longer tracked (e.g. removed by another
    thread in the meantime) is silently ignored rather than re-added.
    """
    try:
        self._readymap[connection] = ready
    except KeyError:
        pass
160
# NOTE(review): corrupted span — line-number artifacts interleaved and
# several lines missing (lock handling, the `return c`/`return None`
# exits of get_ready_conn, and the if/else selecting between the two
# returns in get_all).
def get_ready_conn(self, host):
164
# hand out the first connection for `host` marked ready, flipping
# it to busy before it is returned
if self._hostmap.has_key(host):
165
for c in self._hostmap[host]:
166
if self._readymap[c]:
167
self._readymap[c] = 0
174
def get_all(self, host=None):
176
# with a host: a copy of that host's connection list;
# without: a copy of the whole host -> connections map
return list(self._hostmap.get(host, []))
178
return dict(self._hostmap)
180
# Keepalive-aware replacement for urllib2's HTTP handler; pool
# bookkeeping is delegated to a ConnectionManager instance.
# NOTE(review): corrupted span — the __init__ header line is missing.
class HTTPHandler(urllib2.HTTPHandler):
182
self._cm = ConnectionManager()
184
#### Connection Management
185
def open_connections(self):
    """Return a list of (host, connection-count) pairs for the pool,
    e.g. [('foo.com:80', 2), ('bar.org', 1)]."""
    return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
190
# NOTE(review): corrupted span — the body of close_connection's loop
# (closing and removing each connection) and the close_all def line
# are missing from this view.
def close_connection(self, host):
191
"""close connection(s) to <host>
192
host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
193
no error occurs if there is no connection to that host."""
194
for h in self._cm.get_all(host):
199
"""close all open connections"""
200
for host, conns in self._cm.get_all().items():
205
def _request_closed(self, request, host, connection):
    """Tell the pool that `request` is finished and that the
    connection is ready for another request."""
    self._cm.set_ready(connection, 1)
210
def _remove_connection(self, host, connection, close=0):
    """Drop `connection` from the pool; close the socket first when
    `close` is true."""
    if close:
        connection.close()
    self._cm.remove(connection)
214
#### Transaction Execution
215
def http_open(self, req):
    """urllib2 entry point for http:// requests; routes them through
    the pooled, keepalive-aware HTTPConnection class."""
    return self.do_open(HTTPConnection, req)
218
# Core transaction driver: try each ready pooled connection for the
# host via _reuse_connection; if none works, create a new connection,
# add it to the pool (busy), and start the request on it.
# NOTE(review): heavily corrupted span — line-number artifacts are
# interleaved and many source lines are missing (the while loop over
# ready connections, connection-removal on failure, the http_class()
# construction, response attribute wiring); restore from upstream.
def do_open(self, http_class, req):
219
host = req.get_host()
221
raise urllib2.URLError('no host given')
224
h = self._cm.get_ready_conn(host)
226
r = self._reuse_connection(h, req, host)
228
# if this response is non-None, then it worked and we're
229
# done. Break out, skipping the else block.
232
# connection is bad - possibly closed by server
233
# discard it and ask for the next free connection
236
h = self._cm.get_ready_conn(host)
238
# no (working) free connections were found. Create a new one.
240
if DEBUG: DBPRINT("creating new connection to %s (%d)" % \
242
self._cm.add(host, h, 0)
243
self._start_transaction(h, req)
245
except (socket.error, httplib.HTTPException), err:
246
raise urllib2.URLError(err)
248
# if not a persistent connection, don't try to reuse it
249
if r.will_close: self._cm.remove(h)
251
if DEBUG: DBPRINT("STATUS: %s, %s" % (r.status, r.reason))
254
r._url = req.get_full_url()
258
if r.status == 200 or not HANDLE_ERRORS:
261
return self.parent.error('http', req, r, r.status, r.reason, r.msg)
264
# NOTE(review): corrupted span — line-number artifacts interleaved and
# several lines missing (docstring close, try:, the expected-failure
# handling that returns None, the close/remove/raise in the unexpected
# branch, and the return of r); restore from upstream.
def _reuse_connection(self, h, req, host):
265
"""start the transaction with a re-used connection
266
return a response object (r) upon success or None on failure.
267
This DOES not close or remove bad connections in cases where
268
it returns. However, if an unexpected exception occurs, it
269
will close and remove the connection before re-raising.
272
self._start_transaction(h, req)
274
# note: just because we got something back doesn't mean it
275
# worked. We'll check the version below, too.
276
except (socket.error, httplib.HTTPException):
279
# adding this block just in case we've missed
280
# something we will still raise the exception, but
281
# lets try and close the connection and remove it
282
# first. We previously got into a nasty loop
283
# where an exception was uncaught, and so the
284
# connection stayed open. On the next try, the
285
# same exception was raised, etc. The tradeoff is
286
# that it's now possible this call will raise
287
# a DIFFERENT exception
288
if DEBUG: DBPRINT("unexpected exception - " \
289
"closing connection to %s (%d)" % (host, id(h)))
294
if r is None or r.version == 9:
295
# httplib falls back to assuming HTTP 0.9 if it gets a
296
# bad header back. This is most likely to happen if
297
# the socket has been closed by the server since we
298
# last used the connection.
299
if DEBUG: DBPRINT("failed to re-use connection to %s (%d)" \
303
if DEBUG: DBPRINT("re-using connection to %s (%d)" % (host, id(h)))
307
# Issue the request on connection `h`: POST with default content
# headers when the request carries data, plain GET otherwise, then
# send the opener's and the request's headers.
# NOTE(review): corrupted span — line-number artifacts interleaved and
# lines missing (try:, the if/else around has_data(), the putheader
# calls in the header loops, endheaders, and the body send).
def _start_transaction(self, h, req):
310
data = req.get_data()
311
h.putrequest('POST', req.get_selector())
312
if not req.headers.has_key('Content-type'):
313
h.putheader('Content-type',
314
'application/x-www-form-urlencoded')
315
if not req.headers.has_key('Content-length'):
316
h.putheader('Content-length', '%d' % len(data))
318
h.putrequest('GET', req.get_selector())
319
except (socket.error, httplib.HTTPException), err:
320
raise urllib2.URLError(err)
322
for args in self.parent.addheaders:
324
for k, v in req.headers.items():
330
# NOTE(review): corrupted span — line-number artifacts are interleaved
# and some comment lines are missing mid-paragraph.
class HTTPResponse(httplib.HTTPResponse):
331
# we need to subclass HTTPResponse in order to
332
# 1) add readline() and readlines() methods
333
# 2) add close_connection() methods
334
# 3) add info() and geturl() methods
336
# in order to add readline(), read must be modified to deal with a
337
# buffer. example: readline must read a buffer and then spit back
338
# one line at a time. The only real alternative is to read one
339
# BYTE at a time (ick). Once something has been read, it can't be
340
# put back (ok, maybe it can, but that's even uglier than this),
341
# so if you THEN do a normal read, you must first take stuff from
344
# the read method wraps the original to accommodate buffering,
345
# although read() never adds to the buffer.
346
# Both readline and readlines have been stolen with almost no
347
# modification from socket.py
350
# NOTE(review): corrupted span — line-number artifacts interleaved and
# lines missing (the `else:` between the two base-class __init__
# calls, the read-buffer initialization, and the close() def header
# above the _request_closed call at the bottom).
def __init__(self, sock, debuglevel=0, strict=0, method=None):
351
if method: # the httplib in python 2.3 uses the method arg
352
httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
354
httplib.HTTPResponse.__init__(self, sock, debuglevel)
355
self.fileno = sock.fileno
358
self._rbufsize = 8096
359
self._handler = None # inserted by the handler later
360
self._host = None # (same)
361
self._url = None # (same)
362
self._connection = None # (same)
364
# keep a handle on the unbuffered base-class read
_raw_read = httplib.HTTPResponse.read
371
self._handler._request_closed(self, self._host,
374
def close_connection(self):
    """Close this response's underlying connection and drop it from
    the handler's pool."""
    self._handler._remove_connection(self._host, self._connection, close=1)
384
# Buffered read/readline/readlines layered over the raw httplib read
# so that readline() can return one line at a time from _rbuf.
# NOTE(review): corrupted span — line-number artifacts interleaved and
# many lines missing (the buffer-slice fast path and returns in read,
# the EOF break and loop close in readline, its return, and the
# accumulation loop of readlines); restore from upstream.
def read(self, amt=None):
385
# the _rbuf test is only in this first if for speed. It's not
386
# logically necessary
387
if self._rbuf and not amt is None:
393
self._rbuf = self._rbuf[amt:]
396
s = self._rbuf + self._raw_read(amt)
400
def readline(self, limit=-1):
402
i = self._rbuf.find('\n')
403
while i < 0 and not (0 < limit <= len(self._rbuf)):
404
new = self._raw_read(self._rbufsize)
407
if i >= 0: i = i + len(self._rbuf)
408
self._rbuf = self._rbuf + new
409
if i < 0: i = len(self._rbuf)
411
if 0 <= limit < len(self._rbuf): i = limit
412
data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
415
def readlines(self, sizehint = 0):
419
line = self.readline()
423
if sizehint and total >= sizehint:
428
class HTTPConnection(httplib.HTTPConnection):
    # Identical to httplib's connection except that it hands back the
    # buffered, pool-aware HTTPResponse defined above.
    response_class = HTTPResponse
432
#########################################################################
434
#########################################################################
436
# Self-test: fetch `url` once with HANDLE_ERRORS on and once off and
# report the status/reason seen each way.
# NOTE(review): corrupted span — line-number artifacts interleaved and
# lines missing (the loop over HANDLE_ERRORS values, the global
# assignment, the except clause binding `e`, fo.read()/close()).
def error_handler(url):
439
keepalive_handler = HTTPHandler()
440
opener = urllib2.build_opener(keepalive_handler)
441
urllib2.install_opener(opener)
442
pos = {0: 'off', 1: 'on'}
444
print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
447
fo = urllib2.urlopen(url)
450
try: status, reason = fo.status, fo.reason
451
except AttributeError: status, reason = None, None
453
print " EXCEPTION: %s" % e
456
print " status = %s, reason = %s" % (status, reason)
458
hosts = keepalive_handler.open_connections()
459
print "open connections:", hosts
460
keepalive_handler.close_all()
466
# Self-tests: continuity (md5-compare normal vs keepalive fetches,
# including a readline-based fetch) and comp (time N fetches with the
# normal handlers vs the keepalive handler).
# NOTE(review): heavily corrupted span — both def headers are missing
# along with the md5 hashing, read loops, and fetch timing calls;
# restore from upstream.
# first fetch the file with the normal http handler
467
opener = urllib2.build_opener()
468
urllib2.install_opener(opener)
469
fo = urllib2.urlopen(url)
473
print format % ('normal urllib', m.hexdigest())
475
# now install the keepalive handler and try again
476
opener = urllib2.build_opener(HTTPHandler())
477
urllib2.install_opener(opener)
479
fo = urllib2.urlopen(url)
483
print format % ('keepalive read', m.hexdigest())
485
fo = urllib2.urlopen(url)
493
print format % ('keepalive readline', m.hexdigest())
496
print ' making %i connections to:\n %s' % (N, url)
498
sys.stdout.write(' first using the normal urllib handlers')
499
# first use normal opener
500
opener = urllib2.build_opener()
501
urllib2.install_opener(opener)
503
print ' TIME: %.3f s' % t1
505
sys.stdout.write(' now using the keepalive handler ')
506
# now install the keepalive handler and try again
507
opener = urllib2.build_opener(HTTPHandler())
508
urllib2.install_opener(opener)
510
print ' TIME: %.3f s' % t2
511
print ' improvement factor: %.2f' % (t1/t2, )
513
# Fetch `url` N times (optionally sleeping `delay` between fetches),
# record each payload length, warn on inconsistent lengths, and
# return the elapsed time.
# NOTE(review): corrupted span — the loop header, fo.read()/close(),
# the length-consistency check, and the return are missing.
def fetch(N, url, delay=0):
515
starttime = time.time()
517
if delay and i > 0: time.sleep(delay)
518
fo = urllib2.urlopen(url)
521
lens.append(len(foo))
522
diff = time.time() - starttime
528
print "WARNING: inconsistent length on read %i: %i" % (j, i)
532
# Dropped-connection check: fetch once, wait (with countdown) for the
# server to time out the idle connection, fetch again with DEBUG
# output enabled, and compare the two payloads.
# NOTE(review): corrupted span — DEBUG enable/restore, fo.read(),
# the countdown loop, and the payload comparison are missing.
def test_timeout(url):
533
global DEBUG, DBPRINT
535
def DBPRINT(*args): print ' ' + ' '.join(args)
537
print " fetching the file to establish a connection"
538
fo = urllib2.urlopen(url)
543
print " waiting %i seconds for the server to close the connection" % i
545
sys.stdout.write('\r %2i' % i)
549
sys.stderr.write('\r')
551
print " fetching the file a second time"
552
fo = urllib2.urlopen(url)
557
print ' data are identical'
559
print ' ERROR: DATA DIFFER'
566
print "checking error hander (do this on a non-200)"
567
try: error_handler(url)
569
print "exiting - exception will prevent further tests"
572
print "performing continuity test (making sure stuff isn't corrupted)"
575
print "performing speed comparison"
578
print "performing dropped-connection check"
581
if __name__ == '__main__':
588
print "%s <integer> <url>" % sys.argv[0]