~bzr-pqm/bzr/bzr.dev : contents of urlgrabber/keepalive.py at revision 324

~bzr-pqm/bzr/bzr.dev : (revision 324)

#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the 
#      Free Software Foundation, Inc., 
#      59 Temple Place, Suite 330, 
#      Boston, MA  02111-1307  USA

# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.

>>> import urllib2
>>> from keepalive import HTTPHandler
>>> keepalive_handler = HTTPHandler()
>>> opener = urllib2.build_opener(keepalive_handler)
>>> urllib2.install_opener(opener)
>>> 
>>> fo = urllib2.urlopen('http://www.python.org')

If a connection to a given host is requested, and all of the existing
connections are still in use, another connection will be opened.  If
the handler tries to use an existing connection but it fails in some
way, it will be closed and removed from the pool.

To remove the handler, simply re-run build_opener with no arguments, and
install that opener.

You can explicitly close connections by using the close_connection()
method of the returned file-like object (described below) or you can
use the handler methods:

  close_connection(host)
  close_all()
  open_connections()

NOTE: using the close_connection and close_all methods of the handler
should be done with care when using multiple threads.
  * there is nothing that prevents another thread from creating new
    connections immediately after connections are closed
  * no checks are done to prevent in-use connections from being closed

>>> keepalive_handler.close_all()

EXTRA ATTRIBUTES AND METHODS

  Upon a status of 200, the object returned has a few additional
  attributes and methods, which should not be used if you want to
  remain consistent with the normal urllib2-returned objects:

    close_connection()  -  close the connection to the host
    readlines()         -  you know, readlines()
    status              -  the return status (ie 404)
    reason              -  english translation of status (ie 'File not found')

  If you want the best of both worlds, use this inside an
  AttributeError-catching try:

  >>> try: status = fo.status
  >>> except AttributeError: status = None

  Unfortunately, these are ONLY there if status == 200, so it's not
  easy to distinguish between non-200 responses.  The reason is that
  urllib2 tries to do clever things with error codes 301, 302, 401,
  and 407, and it wraps the object upon return.

  For python versions earlier than 2.4, you can avoid this fancy error
  handling by setting the module-level global HANDLE_ERRORS to zero.
  You see, prior to 2.4, it's the HTTP Handler's job to determine what
  to handle specially, and what to just pass up.  HANDLE_ERRORS == 0
  means "pass everything up".  In python 2.4, however, this job no
  longer belongs to the HTTP Handler and is now done by a NEW handler,
  HTTPErrorProcessor.  Here's the bottom line:

    python version < 2.4
        HANDLE_ERRORS == 1  (default) pass up 200, treat the rest as
                            errors
        HANDLE_ERRORS == 0  pass everything up, error processing is
                            left to the calling code
    python version >= 2.4
        HANDLE_ERRORS == 1  pass up 200, treat the rest as errors
        HANDLE_ERRORS == 0  (default) pass everything up, let the
                            other handlers (specifically,
                            HTTPErrorProcessor) decide what to do

  In practice, setting the variable either way makes little difference
  in python 2.4, so for the most consistent behavior across versions,
  you probably just want to use the defaults, which will give you
  exceptions on errors.

"""

# $Id: keepalive.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $

import urllib2
import httplib
import socket
import thread

DEBUG = 0
def DBPRINT(*args): print ' '.join(args)

import sys
_python_version = map(int, sys.version.split()[0].split('.'))
if _python_version < [2, 4]: HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
    
class ConnectionManager:
    """
    The connection manager must be able to:
      * keep track of all existing
      """
    def __init__(self):
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        self._lock.acquire()
        try:
            if not self._hostmap.has_key(host): self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        try: self._readymap[connection] = ready
        except KeyError: pass
        
    def get_ready_conn(self, host):
        conn = None
        self._lock.acquire()
        try:
            if self._hostmap.has_key(host):
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)

class HTTPHandler(urllib2.HTTPHandler):
    def __init__(self):
        self._cm = ConnectionManager()
        
    #### Connection Management
    def open_connections(self):
        """return a list of connected hosts and the number of connections
        to each.  [('foo.com:80', 2), ('bar.org', 1)]"""
        return [(host, len(li)) for (host, li) in self._cm.get_all().items()]

    def close_connection(self, host):
        """close connection(s) to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        for h in self._cm.get_all(host):
            self._cm.remove(h)
            h.close()
        
    def close_all(self):
        """close all open connections"""
        for host, conns in self._cm.get_all().items():
            for h in conns:
                self._cm.remove(h)
                h.close()
        
    def _request_closed(self, request, host, connection):
        """tells us that this request is now closed and the the
        connection is ready for another request"""
        self._cm.set_ready(connection, 1)

    def _remove_connection(self, host, connection, close=0):
        if close: connection.close()
        self._cm.remove(connection)
        
    #### Transaction Execution
    def http_open(self, req):
        return self.do_open(HTTPConnection, req)

    def do_open(self, http_class, req):
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            h = self._cm.get_ready_conn(host)
            while h:
                r = self._reuse_connection(h, req, host)

                # if this response is non-None, then it worked and we're
                # done.  Break out, skipping the else block.
                if r: break

                # connection is bad - possibly closed by server
                # discard it and ask for the next free connection
                h.close()
                self._cm.remove(h)
                h = self._cm.get_ready_conn(host)
            else:
                # no (working) free connections were found.  Create a new one.
                h = http_class(host)
                if DEBUG: DBPRINT("creating new connection to %s (%d)" % \
                                  (host, id(h)))
                self._cm.add(host, h, 0)
                self._start_transaction(h, req)
                r = h.getresponse()
        except (socket.error, httplib.HTTPException), err:
            raise urllib2.URLError(err)
            
        # if not a persistent connection, don't try to reuse it
        if r.will_close: self._cm.remove(h)

        if DEBUG: DBPRINT("STATUS: %s, %s" % (r.status, r.reason))
        r._handler = self
        r._host = host
        r._url = req.get_full_url()
        r._connection = h
        r.code = r.status
        
        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r, r.status, r.reason, r.msg)


    def _reuse_connection(self, h, req, host):
        """start the transaction with a re-used connection
        return a response object (r) upon success or None on failure.
        This DOES not close or remove bad connections in cases where
        it returns.  However, if an unexpected exception occurs, it
        will close and remove the connection before re-raising.
        """
        try:
            self._start_transaction(h, req)
            r = h.getresponse()
            # note: just because we got something back doesn't mean it
            # worked.  We'll check the version below, too.
        except (socket.error, httplib.HTTPException):
            r = None
        except:
            # adding this block just in case we've missed
            # something we will still raise the exception, but
            # lets try and close the connection and remove it
            # first.  We previously got into a nasty loop
            # where an exception was uncaught, and so the
            # connection stayed open.  On the next try, the
            # same exception was raised, etc.  The tradeoff is
            # that it's now possible this call will raise
            # a DIFFERENT exception
            if DEBUG: DBPRINT("unexpected exception - " \
                              "closing connection to %s (%d)" % (host, id(h)))
            self._cm.remove(h)
            h.close()
            raise
                    
        if r is None or r.version == 9:
            # httplib falls back to assuming HTTP 0.9 if it gets a
            # bad header back.  This is most likely to happen if
            # the socket has been closed by the server since we
            # last used the connection.
            if DEBUG: DBPRINT("failed to re-use connection to %s (%d)" \
                              % (host, id(h)))
            r = None
        else:
            if DEBUG: DBPRINT("re-using connection to %s (%d)" % (host, id(h)))

        return r

    def _start_transaction(self, h, req):
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector())
                if not req.headers.has_key('Content-type'):
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if not req.headers.has_key('Content-length'):
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector())
        except (socket.error, httplib.HTTPException), err:
            raise urllib2.URLError(err)

        for args in self.parent.addheaders:
            h.putheader(*args)
        for k, v in req.headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)

class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accomodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py
    

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''
        self._rbufsize = 8096
        self._handler = None # inserted by the handler later
        self._host = None    # (same)
        self._url = None     # (same)
        self._connection = None # (same)

    _raw_read = httplib.HTTPResponse.read

    def close(self):
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()
        
    def info(self):
        return self.msg

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        total = 0
        list = []
        while 1:
            line = self.readline()
            if not line: break
            list.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return list


class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class
    response_class = HTTPResponse
    
#########################################################################
#####   TEST FUNCTIONS
#########################################################################

def error_handler(url):
    global HANDLE_ERRORS
    orig = HANDLE_ERRORS
    keepalive_handler = HTTPHandler()
    opener = urllib2.build_opener(keepalive_handler)
    urllib2.install_opener(opener)
    pos = {0: 'off', 1: 'on'}
    for i in (0, 1):
        print "  fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
        HANDLE_ERRORS = i
        try:
            fo = urllib2.urlopen(url)
            foo = fo.read()
            fo.close()
            try: status, reason = fo.status, fo.reason
            except AttributeError: status, reason = None, None
        except IOError, e:
            print "  EXCEPTION: %s" % e
            raise
        else:
            print "  status = %s, reason = %s" % (status, reason)
    HANDLE_ERRORS = orig
    hosts = keepalive_handler.open_connections()
    print "open connections:", hosts
    keepalive_handler.close_all()

def continuity(url):
    import md5
    format = '%25s: %s'
    
    # first fetch the file with the normal http handler
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    fo = urllib2.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5.new(foo)
    print format % ('normal urllib', m.hexdigest())

    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)

    fo = urllib2.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5.new(foo)
    print format % ('keepalive read', m.hexdigest())

    fo = urllib2.urlopen(url)
    foo = ''
    while 1:
        f = fo.readline()
        if f: foo = foo + f
        else: break
    fo.close()
    m = md5.new(foo)
    print format % ('keepalive readline', m.hexdigest())

def comp(N, url):
    print '  making %i connections to:\n  %s' % (N, url)

    sys.stdout.write('  first using the normal urllib handlers')
    # first use normal opener
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    t1 = fetch(N, url)
    print '  TIME: %.3f s' % t1

    sys.stdout.write('  now using the keepalive handler       ')
    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    t2 = fetch(N, url)
    print '  TIME: %.3f s' % t2
    print '  improvement factor: %.2f' % (t1/t2, )
    
def fetch(N, url, delay=0):
    lens = []
    starttime = time.time()
    for i in range(N):
        if delay and i > 0: time.sleep(delay)
        fo = urllib2.urlopen(url)
        foo = fo.read()
        fo.close()
        lens.append(len(foo))
    diff = time.time() - starttime

    j = 0
    for i in lens[1:]:
        j = j + 1
        if not i == lens[0]:
            print "WARNING: inconsistent length on read %i: %i" % (j, i)

    return diff

def test_timeout(url):
    global DEBUG, DBPRINT
    dbp = DBPRINT
    def DBPRINT(*args): print '    ' + ' '.join(args)
    DEBUG=1
    print "  fetching the file to establish a connection"
    fo = urllib2.urlopen(url)
    data1 = fo.read()
    fo.close()
 
    i = 20
    print "  waiting %i seconds for the server to close the connection" % i
    while i > 0:
        sys.stdout.write('\r  %2i' % i)
        sys.stdout.flush()
        time.sleep(1)
        i -= 1
    sys.stderr.write('\r')

    print "  fetching the file a second time"
    fo = urllib2.urlopen(url)
    data2 = fo.read()
    fo.close()

    if data1 == data2:
        print '  data are identical'
    else:
        print '  ERROR: DATA DIFFER'

    DEBUG=0
    DBPRINT = dbp

    
def test(url, N=10):
    print "checking error hander (do this on a non-200)"
    try: error_handler(url)
    except IOError, e:
        print "exiting - exception will prevent further tests"
        sys.exit()
    print
    print "performing continuity test (making sure stuff isn't corrupted)"
    continuity(url)
    print
    print "performing speed comparison"
    comp(N, url)
    print
    print "performing dropped-connection check"
    test_timeout(url)
    
if __name__ == '__main__':
    import time
    import sys
    try:
        N = int(sys.argv[1])
        url = sys.argv[2]
    except:
        print "%s <integer> <url>" % sys.argv[0]
    else:
        test(url, N)

195 by mbp at sourcefrog - import lovely urlgrabber library	1	# This library is free software; you can redistribute it and/or
	2	# modify it under the terms of the GNU Lesser General Public
	3	# License as published by the Free Software Foundation; either
	4	# version 2.1 of the License, or (at your option) any later version.
	5	#
	6	# This library is distributed in the hope that it will be useful,
	7	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	8	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	9	# Lesser General Public License for more details.
	10	#
	11	# You should have received a copy of the GNU Lesser General Public
	12	# License along with this library; if not, write to the
	13	# Free Software Foundation, Inc.,
	14	# 59 Temple Place, Suite 330,
	15	# Boston, MA 02111-1307 USA
	16
	17	# This file is part of urlgrabber, a high-level cross-protocol url-grabber
	18	# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
	19
	20	"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
	21
	22	>>> import urllib2
	23	>>> from keepalive import HTTPHandler
	24	>>> keepalive_handler = HTTPHandler()
	25	>>> opener = urllib2.build_opener(keepalive_handler)
	26	>>> urllib2.install_opener(opener)
	27	>>>
	28	>>> fo = urllib2.urlopen('http://www.python.org')
	29
	30	If a connection to a given host is requested, and all of the existing
	31	connections are still in use, another connection will be opened. If
	32	the handler tries to use an existing connection but it fails in some
	33	way, it will be closed and removed from the pool.
	34
	35	To remove the handler, simply re-run build_opener with no arguments, and
	36	install that opener.
	37
	38	You can explicitly close connections by using the close_connection()
	39	method of the returned file-like object (described below) or you can
	40	use the handler methods:
	41
	42	close_connection(host)
	43	close_all()
	44	open_connections()
	45
	46	NOTE: using the close_connection and close_all methods of the handler
	47	should be done with care when using multiple threads.
	48	* there is nothing that prevents another thread from creating new
	49	connections immediately after connections are closed
	50	* no checks are done to prevent in-use connections from being closed
	51
	52	>>> keepalive_handler.close_all()
	53
	54	EXTRA ATTRIBUTES AND METHODS
	55
	56	Upon a status of 200, the object returned has a few additional
	57	attributes and methods, which should not be used if you want to
	58	remain consistent with the normal urllib2-returned objects:
	59
	60	close_connection() - close the connection to the host
	61	readlines() - you know, readlines()
	62	status - the return status (ie 404)
	63	reason - english translation of status (ie 'File not found')
	64
65	If you want the best of both worlds, use this inside an
66	AttributeError-catching try:
67
68	>>> try: status = fo.status
69	>>> except AttributeError: status = None
70
71	Unfortunately, these are ONLY there if status == 200, so it's not
72	easy to distinguish between non-200 responses. The reason is that
73	urllib2 tries to do clever things with error codes 301, 302, 401,
74	and 407, and it wraps the object upon return.
75
76	For python versions earlier than 2.4, you can avoid this fancy error
77	handling by setting the module-level global HANDLE_ERRORS to zero.
78	You see, prior to 2.4, it's the HTTP Handler's job to determine what
79	to handle specially, and what to just pass up. HANDLE_ERRORS == 0
80	means "pass everything up". In python 2.4, however, this job no
81	longer belongs to the HTTP Handler and is now done by a NEW handler,
82	HTTPErrorProcessor. Here's the bottom line:
83
84	python version < 2.4
85	HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
86	errors
87	HANDLE_ERRORS == 0 pass everything up, error processing is
88	left to the calling code
89	python version >= 2.4
90	HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
91	HANDLE_ERRORS == 0 (default) pass everything up, let the
92	other handlers (specifically,
93	HTTPErrorProcessor) decide what to do
94
95	In practice, setting the variable either way makes little difference
96	in python 2.4, so for the most consistent behavior across versions,
97	you probably just want to use the defaults, which will give you
98	exceptions on errors.
99
100	"""
101
102	# $Id: keepalive.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
103
104	import urllib2
105	import httplib
106	import socket
107	import thread
108
109	DEBUG = 0
110	def DBPRINT(*args): print ' '.join(args)
111
112	import sys
113	_python_version = map(int, sys.version.split()[0].split('.'))
114	if _python_version < [2, 4]: HANDLE_ERRORS = 1
115	else: HANDLE_ERRORS = 0
116
117	class ConnectionManager:
118	"""
119	The connection manager must be able to:
120	* keep track of all existing
121	"""
122	def __init__(self):
123	self._lock = thread.allocate_lock()
124	self._hostmap = {} # map hosts to a list of connections
125	self._connmap = {} # map connections to host
126	self._readymap = {} # map connection to ready state
127
128	def add(self, host, connection, ready):
129	self._lock.acquire()
130	try:
131	if not self._hostmap.has_key(host): self._hostmap[host] = []
132	self._hostmap[host].append(connection)
133	self._connmap[connection] = host
134	self._readymap[connection] = ready
135	finally:
136	self._lock.release()
137
138	def remove(self, connection):
139	self._lock.acquire()
140	try:
141	try:
142	host = self._connmap[connection]
143	except KeyError:
144	pass
145	else:
146	del self._connmap[connection]
147	del self._readymap[connection]
148	self._hostmap[host].remove(connection)
149	if not self._hostmap[host]: del self._hostmap[host]
150	finally:
151	self._lock.release()
152
153	def set_ready(self, connection, ready):
154	try: self._readymap[connection] = ready
155	except KeyError: pass
156
157	def get_ready_conn(self, host):
158	conn = None
159	self._lock.acquire()
160	try:
161	if self._hostmap.has_key(host):
162	for c in self._hostmap[host]:
163	if self._readymap[c]:
164	self._readymap[c] = 0
165	conn = c
166	break
167	finally:
168	self._lock.release()
169	return conn
170
171	def get_all(self, host=None):
172	if host:
173	return list(self._hostmap.get(host, []))
174	else:
175	return dict(self._hostmap)
176
177	class HTTPHandler(urllib2.HTTPHandler):
178	def __init__(self):
179	self._cm = ConnectionManager()
180
181	#### Connection Management
182	def open_connections(self):
183	"""return a list of connected hosts and the number of connections
184	to each. [('foo.com:80', 2), ('bar.org', 1)]"""
185	return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
186
187	def close_connection(self, host):
188	"""close connection(s) to <host>
189	host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
190	no error occurs if there is no connection to that host."""
191	for h in self._cm.get_all(host):
192	self._cm.remove(h)
193	h.close()
194
195	def close_all(self):
196	"""close all open connections"""
197	for host, conns in self._cm.get_all().items():
198	for h in conns:
199	self._cm.remove(h)
200	h.close()
201
202	def _request_closed(self, request, host, connection):
203	"""tells us that this request is now closed and the the
204	connection is ready for another request"""
205	self._cm.set_ready(connection, 1)
206
207	def _remove_connection(self, host, connection, close=0):
208	if close: connection.close()
209	self._cm.remove(connection)
210
211	#### Transaction Execution
212	def http_open(self, req):
213	return self.do_open(HTTPConnection, req)
214
215	def do_open(self, http_class, req):
216	host = req.get_host()
217	if not host:
218	raise urllib2.URLError('no host given')
219
220	try:
221	h = self._cm.get_ready_conn(host)
222	while h:
223	r = self._reuse_connection(h, req, host)
224
225	# if this response is non-None, then it worked and we're
226	# done. Break out, skipping the else block.
227	if r: break
228
229	# connection is bad - possibly closed by server
230	# discard it and ask for the next free connection
231	h.close()
232	self._cm.remove(h)
233	h = self._cm.get_ready_conn(host)
234	else:
235	# no (working) free connections were found. Create a new one.
236	h = http_class(host)
237	if DEBUG: DBPRINT("creating new connection to %s (%d)" % \
238	(host, id(h)))
239	self._cm.add(host, h, 0)
240	self._start_transaction(h, req)
241	r = h.getresponse()
242	except (socket.error, httplib.HTTPException), err:
243	raise urllib2.URLError(err)
244
245	# if not a persistent connection, don't try to reuse it
246	if r.will_close: self._cm.remove(h)
247
248	if DEBUG: DBPRINT("STATUS: %s, %s" % (r.status, r.reason))
249	r._handler = self
250	r._host = host
251	r._url = req.get_full_url()
252	r._connection = h
253	r.code = r.status
254
255	if r.status == 200 or not HANDLE_ERRORS:
256	return r
257	else:
258	return self.parent.error('http', req, r, r.status, r.reason, r.msg)
259
260
261	def _reuse_connection(self, h, req, host):
262	"""start the transaction with a re-used connection
263	return a response object (r) upon success or None on failure.
264	This DOES not close or remove bad connections in cases where
265	it returns. However, if an unexpected exception occurs, it
266	will close and remove the connection before re-raising.
267	"""
268	try:
269	self._start_transaction(h, req)
270	r = h.getresponse()
271	# note: just because we got something back doesn't mean it
272	# worked. We'll check the version below, too.
273	except (socket.error, httplib.HTTPException):
274	r = None
275	except:
276	# adding this block just in case we've missed
277	# something we will still raise the exception, but
278	# lets try and close the connection and remove it
279	# first. We previously got into a nasty loop
280	# where an exception was uncaught, and so the
281	# connection stayed open. On the next try, the
282	# same exception was raised, etc. The tradeoff is
283	# that it's now possible this call will raise
284	# a DIFFERENT exception
285	if DEBUG: DBPRINT("unexpected exception - " \
286	"closing connection to %s (%d)" % (host, id(h)))
287	self._cm.remove(h)
288	h.close()
289	raise
290
291	if r is None or r.version == 9:
292	# httplib falls back to assuming HTTP 0.9 if it gets a
293	# bad header back. This is most likely to happen if
294	# the socket has been closed by the server since we
295	# last used the connection.
296	if DEBUG: DBPRINT("failed to re-use connection to %s (%d)" \
297	% (host, id(h)))
298	r = None
299	else:
300	if DEBUG: DBPRINT("re-using connection to %s (%d)" % (host, id(h)))
301
302	return r
303
304	def _start_transaction(self, h, req):
305	try:
306	if req.has_data():
307	data = req.get_data()
308	h.putrequest('POST', req.get_selector())
309	if not req.headers.has_key('Content-type'):
310	h.putheader('Content-type',
311	'application/x-www-form-urlencoded')
312	if not req.headers.has_key('Content-length'):
313	h.putheader('Content-length', '%d' % len(data))
314	else:
315	h.putrequest('GET', req.get_selector())
316	except (socket.error, httplib.HTTPException), err:
317	raise urllib2.URLError(err)
318
319	for args in self.parent.addheaders:
320	h.putheader(*args)
321	for k, v in req.headers.items():
322	h.putheader(k, v)
323	h.endheaders()
324	if req.has_data():
325	h.send(data)
326
327	class HTTPResponse(httplib.HTTPResponse):
328	# we need to subclass HTTPResponse in order to
329	# 1) add readline() and readlines() methods
330	# 2) add close_connection() methods
331	# 3) add info() and geturl() methods
332
333	# in order to add readline(), read must be modified to deal with a
334	# buffer. example: readline must read a buffer and then spit back
335	# one line at a time. The only real alternative is to read one
336	# BYTE at a time (ick). Once something has been read, it can't be
337	# put back (ok, maybe it can, but that's even uglier than this),
338	# so if you THEN do a normal read, you must first take stuff from
339	# the buffer.
340
341	# the read method wraps the original to accomodate buffering,
342	# although read() never adds to the buffer.
343	# Both readline and readlines have been stolen with almost no
344	# modification from socket.py
345
346
347	def __init__(self, sock, debuglevel=0, strict=0, method=None):
348	if method: # the httplib in python 2.3 uses the method arg
349	httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
350	else: # 2.2 doesn't
351	httplib.HTTPResponse.__init__(self, sock, debuglevel)
352	self.fileno = sock.fileno
353	self.code = None
354	self._rbuf = ''
355	self._rbufsize = 8096
356	self._handler = None # inserted by the handler later
357	self._host = None # (same)
358	self._url = None # (same)
359	self._connection = None # (same)
360
361	_raw_read = httplib.HTTPResponse.read
362
363	def close(self):
364	if self.fp:
365	self.fp.close()
366	self.fp = None
367	if self._handler:
368	self._handler._request_closed(self, self._host,
369	self._connection)
370
371	def close_connection(self):
372	self._handler._remove_connection(self._host, self._connection, close=1)
373	self.close()
374
375	def info(self):
376	return self.msg
377
378	def geturl(self):
379	return self._url
380
381	def read(self, amt=None):
382	# the _rbuf test is only in this first if for speed. It's not
383	# logically necessary
384	if self._rbuf and not amt is None:
385	L = len(self._rbuf)
386	if amt > L:
387	amt -= L
388	else:
389	s = self._rbuf[:amt]
390	self._rbuf = self._rbuf[amt:]
391	return s
392
393	s = self._rbuf + self._raw_read(amt)
394	self._rbuf = ''
395	return s
396
397	def readline(self, limit=-1):
398	data = ""
399	i = self._rbuf.find('\n')
400	while i < 0 and not (0 < limit <= len(self._rbuf)):
401	new = self._raw_read(self._rbufsize)
402	if not new: break
403	i = new.find('\n')
404	if i >= 0: i = i + len(self._rbuf)
405	self._rbuf = self._rbuf + new
406	if i < 0: i = len(self._rbuf)
407	else: i = i+1
408	if 0 <= limit < len(self._rbuf): i = limit
409	data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
410	return data
411
412	def readlines(self, sizehint = 0):
413	total = 0
414	list = []
415	while 1:
416	line = self.readline()
417	if not line: break
418	list.append(line)
419	total += len(line)
420	if sizehint and total >= sizehint:
421	break
422	return list
423
424
425	class HTTPConnection(httplib.HTTPConnection):
426	# use the modified response class
427	response_class = HTTPResponse
428
429	#########################################################################
430	##### TEST FUNCTIONS
431	#########################################################################
432
433	def error_handler(url):
434	global HANDLE_ERRORS
435	orig = HANDLE_ERRORS
436	keepalive_handler = HTTPHandler()
437	opener = urllib2.build_opener(keepalive_handler)
438	urllib2.install_opener(opener)
439	pos = {0: 'off', 1: 'on'}
440	for i in (0, 1):
441	print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
442	HANDLE_ERRORS = i
443	try:
444	fo = urllib2.urlopen(url)
445	foo = fo.read()
446	fo.close()
447	try: status, reason = fo.status, fo.reason
448	except AttributeError: status, reason = None, None
449	except IOError, e:
450	print " EXCEPTION: %s" % e
451	raise
452	else:
453	print " status = %s, reason = %s" % (status, reason)
454	HANDLE_ERRORS = orig
455	hosts = keepalive_handler.open_connections()
456	print "open connections:", hosts
457	keepalive_handler.close_all()
458
459	def continuity(url):
460	import md5
461	format = '%25s: %s'
462
463	# first fetch the file with the normal http handler
464	opener = urllib2.build_opener()
465	urllib2.install_opener(opener)
466	fo = urllib2.urlopen(url)
467	foo = fo.read()
468	fo.close()
469	m = md5.new(foo)
470	print format % ('normal urllib', m.hexdigest())
471
472	# now install the keepalive handler and try again
473	opener = urllib2.build_opener(HTTPHandler())
474	urllib2.install_opener(opener)
475
476	fo = urllib2.urlopen(url)
477	foo = fo.read()
478	fo.close()
479	m = md5.new(foo)
480	print format % ('keepalive read', m.hexdigest())
481
482	fo = urllib2.urlopen(url)
483	foo = ''
484	while 1:
485	f = fo.readline()
486	if f: foo = foo + f
487	else: break
488	fo.close()
489	m = md5.new(foo)
490	print format % ('keepalive readline', m.hexdigest())
491
492	def comp(N, url):
493	print ' making %i connections to:\n %s' % (N, url)
494
495	sys.stdout.write(' first using the normal urllib handlers')
496	# first use normal opener
497	opener = urllib2.build_opener()
498	urllib2.install_opener(opener)
499	t1 = fetch(N, url)
500	print ' TIME: %.3f s' % t1
501
502	sys.stdout.write(' now using the keepalive handler ')
503	# now install the keepalive handler and try again
504	opener = urllib2.build_opener(HTTPHandler())
505	urllib2.install_opener(opener)
506	t2 = fetch(N, url)
507	print ' TIME: %.3f s' % t2
508	print ' improvement factor: %.2f' % (t1/t2, )
509
510	def fetch(N, url, delay=0):
511	lens = []
512	starttime = time.time()
513	for i in range(N):
514	if delay and i > 0: time.sleep(delay)
515	fo = urllib2.urlopen(url)
516	foo = fo.read()
517	fo.close()
518	lens.append(len(foo))
519	diff = time.time() - starttime
520
521	j = 0
522	for i in lens[1:]:
523	j = j + 1
524	if not i == lens[0]:
525	print "WARNING: inconsistent length on read %i: %i" % (j, i)
526
527	return diff
528
529	def test_timeout(url):
530	global DEBUG, DBPRINT
531	dbp = DBPRINT
532	def DBPRINT(*args): print ' ' + ' '.join(args)
533	DEBUG=1
534	print " fetching the file to establish a connection"
535	fo = urllib2.urlopen(url)
536	data1 = fo.read()
537	fo.close()
538
539	i = 20
540	print " waiting %i seconds for the server to close the connection" % i
541	while i > 0:
542	sys.stdout.write('\r %2i' % i)
543	sys.stdout.flush()
544	time.sleep(1)
545	i -= 1
546	sys.stderr.write('\r')
547
548	print " fetching the file a second time"
549	fo = urllib2.urlopen(url)
550	data2 = fo.read()
551	fo.close()
552
553	if data1 == data2:
554	print ' data are identical'
555	else:
556	print ' ERROR: DATA DIFFER'
557
558	DEBUG=0
559	DBPRINT = dbp
560
561
562	def test(url, N=10):
563	print "checking error hander (do this on a non-200)"
564	try: error_handler(url)
565	except IOError, e:
566	print "exiting - exception will prevent further tests"
567	sys.exit()
568	print
569	print "performing continuity test (making sure stuff isn't corrupted)"
570	continuity(url)
571	print
572	print "performing speed comparison"
573	comp(N, url)
574	print
575	print "performing dropped-connection check"
576	test_timeout(url)
577
578	if __name__ == '__main__':
579	import time
580	import sys
581	try:
582	N = int(sys.argv[1])
583	url = sys.argv[2]
584	except:
585	print "%s <integer> <url>" % sys.argv[0]
586	else:
587	test(url, N)