~bzr-pqm/bzr/bzr.dev

1185.1.29 by Robert Collins
merge merge tweaks from aaron, which includes latest .dev
1
#! /usr/bin/python
2
3
# $Id: http_client.py 271 2004-10-09 10:50:59Z fredrik $
4
# a simple asynchronous http client (based on SimpleAsyncHTTP.py from
5
# "Python Standard Library" by Fredrik Lundh, O'Reilly 2001)
6
#
7
# HTTP/1.1 and GZIP support added in January 2003 by Fredrik Lundh.
8
#
9
# changes:
10
# 2004-08-26 fl   unified http callback
11
# 2004-10-09 fl   factored out gzip_consumer support
12
# 2005-07-08 mbp  experimental support for keepalive connections
13
#
14
# Copyright (c) 2001-2004 by Fredrik Lundh.  All rights reserved.
15
#
16
17
18
19
"""async/pipelined http client
20
21
Use
22
===
23
24
Users of this library pass in URLs they want to see, and consumer
25
objects that will receive the results at some point in the future.
26
Any number of requests may be queued up, and more may be added while
27
the download is in progress.
28
29
Requests can be both superscalar and superpipelined.  That is to say,
30
for each server there can be multiple sockets open, and each socket
31
may have more than one request in flight.
32
33
Design
34
======
35
36
There is a single DownloadManager, and a connection object for each
37
open socket.
38
39
Request/consumer pairs are maintained in queues.  Each connection has
40
a list of transmitted requests whose response has not yet been
41
received.  There is also a per-server list of requests that have not
42
yet been submitted.
43
44
When a connection is ready to transmit a new request, it takes one
45
from the unsubmitted list, sends the request, and adds the request to
46
its unfulfilled list.  This should happen when the connection has
47
space for more transmissions or when a new request is added by the
48
user.  If the connection terminates with unfulfilled requests they are
49
put back onto the unsubmitted list, to be retried elsewhere.
50
51
Because responses come back precisely in order, the connection always
52
knows what it should expect next: the response for the next
53
unfulfilled request.
54
"""
55
56
# Note that (as of ubuntu python 2.4.1) every socket.connect() call
57
# with a hostname does a remote DNS resolution, which is pretty sucky.
58
# Shouldn't there be a cache in glibc?  We should probably cache the
59
# address in, say, the DownloadManager.
60
61
# TODO: A default consumer operation that writes the received data
62
# into a file; by default the file is named the same as the last
63
# component of the URL.
64
65
# TODO: A utility function that is given a list of URLs, and downloads
66
# them all parallel/pipelined.  If any fail, it raises an exception
67
# (and discards the rest), or perhaps can be told to continue anyhow.
68
# The content is written into temporary files.  It returns a list of
69
# readable file objects.
70
71
# TODO: If we try pipelined or keepalive and the connection drop out
72
# then retry the request on a new connection; eventually we should perhaps
73
# learn that a given host or network just won't allow keepalive.
74
75
76
import asyncore
import socket
import string
import sys
import time
import StringIO
import mimetools
import urllib
import urlparse
import logging

# Send debug output to a file so it doesn't mix with whatever the
# caller prints on the terminal while downloads run.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename='/tmp/http_client.log',
                    filemode='w')

logger = logging.getLogger('bzr.http_client')

# Module-level shorthands for the logger methods used throughout.
debug = logger.debug
info = logger.info
error = logger.error
91
92
93
##
94
# Close connection.   Request handlers can raise this exception to
95
# indicate that the connection should be closed.
96
97
class CloseConnection(Exception):
    """Raised by request handlers to ask that the connection be closed."""
99
100
##
101
# Redirect connection.  Request handlers can raise this exception to
102
# indicate that a new request should be issued.
103
104
class Redirect(CloseConnection):
    """Raised by request handlers to ask that a new request be issued.

    location is the URL the new request should be sent to.
    """
    def __init__(self, location):
        self.location = location
107
108
109
class DownloadManager(object):
    """Handles pipelined/overlapped downloads.

    Pass in a series of URLs with handlers to receive the response.
    This object will spread the requests over however many sockets
    seem useful.

    queued_requests
        (url, consumer) pairs not yet assigned to any channel.

    channels
        Open HttpChannel objects currently serving requests.
    """
    def __init__(self):
        self.queued_requests = []
        self.channels = []
        # Experimental knobs: allow several in-flight requests per
        # socket, and reuse of a socket across requests.
        self.try_pipelined = False
        self.try_keepalive = False
        self.max_channels = 5

    def enqueue(self, url, consumer):
        """Queue a request; consumer receives feed()/content_complete() calls."""
        self.queued_requests.append((url, consumer))
        self._wake_up_channel()

    def _channel_closed(self, channel):
        """Called by the channel when its socket closes."""
        self.channels.remove(channel)
        if self.queued_requests:
            # might recreate a channel to serve the remaining requests
            self._wake_up_channel()

    def _make_channel(self):
        # TODO: derive host/port from the request URL instead of
        # hard-coding a locally-forwarded port.
        return HttpChannel('127.0.0.1', 8000, self)  # forwarded

    def _wake_up_channel(self):
        """Try to wake up one channel to send the newly-added request.

        There may be more than one request pending, and this may cause
        more than one channel to take requests.  That's OK; some of
        them may be frustrated.
        """
        # NOTE: 'shuffle' was previously imported here but never used.
        from random import choice

        # first, wake up any idle channels
        done = False
        for ch in self.channels:
            if not ch.sent_requests:
                ch.take_one()
                done = True
        if done:
            debug("woke existing idle channel(s)")
            return

        if len(self.channels) < self.max_channels:
            newch = self._make_channel()
            self.channels.append(newch)
            newch.take_one()
            debug("created new channel")
            return

        if self.try_pipelined:
            # ask one existing busy channel to take it
            debug("woke busy channel")
            choice(self.channels).take_one()

        # otherwise the request waits until some channel goes idle

    def run(self):
        """Run until all outstanding requests have been served."""
        asyncore.loop()
195
196
197
198
class Response(object):
    """Bag of attributes describing an in-flight HTTP response."""
200
201
202
203
def _parse_response_http10(header):
    """Parse an HTTP/1.0 response header block into a Response.

    header is the raw header text: status line plus header lines,
    without the terminating blank line.  Returns a Response with
    status, header fields and content-length bookkeeping filled in.

    Raises NotImplementedError for anything this experimental client
    cannot yet handle: any transfer-encoding, a non-200 status, or a
    missing/zero content-length (without a length there is no way to
    know where the body ends short of connection close).
    """
    from cStringIO import StringIO

    fp = StringIO(header)
    r = Response()

    # e.g. ['HTTP/1.0', '200', 'OK']
    r.status = fp.readline().split(" ", 2)
    r.headers = mimetools.Message(fp)

    # we can only(?) expect to do keepalive if we got either a
    # content-length or chunked encoding; otherwise there's no way to know
    # when the content ends apart from through the connection close
    r.content_type = r.headers.get("content-type")
    try:
        r.content_length = int(r.headers.get("content-length"))
    except (ValueError, TypeError):
        r.content_length = None
    debug("seen content length of %r" % r.content_length)

    r.transfer_encoding = r.headers.get("transfer-encoding")
    r.content_encoding = r.headers.get("content-encoding")
    r.connection_reply = r.headers.get("connection")

    # TODO: pass status code to consumer?

    # (This check was accidentally duplicated before.)
    if r.transfer_encoding:
        raise NotImplementedError()

    if int(r.status[1]) != 200:
        debug("can't handle response status %r" % r.status)
        raise NotImplementedError()

    if r.content_length is None:
        raise NotImplementedError()

    if r.content_length == 0:
        raise NotImplementedError()

    # bytes of body still expected on the socket
    r.content_remaining = r.content_length

    return r
247
248
249
    
250
    
251
        
252
253
254
class HttpChannel(asyncore.dispatcher_with_send):
    """One http socket, pipelining if possible."""
    # asynchronous http client

    user_agent = "http_client.py 1.3ka (based on effbot)"

    proxies = urllib.getproxies()

    def __init__(self, ip_host, ip_port, manager):
        asyncore.dispatcher_with_send.__init__(self)
        self.manager = manager

        # if a response header has been seen, this holds it
        self.response = None

        # unconsumed bytes received from the socket
        self.data = ""

        self.chunk_size = None

        self.timestamp = time.time()

        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        debug('connecting...')
        self.connect((ip_host, ip_port))

        # sent_requests holds (url, consumer) pairs whose response has
        # not yet been fully received
        self.sent_requests = []

        self._outbuf = ''

    def __repr__(self):
        return 'HttpChannel(local_port=%r)' % (self.getsockname(),)

    def is_idle(self):
        return (not self.sent_requests)

    def handle_connect(self):
        debug("connected")
        self.take_one()

    def take_one(self):
        """Accept one request from the manager if possible."""
        # Limit in-flight requests: up to 5 when pipelining, else 1.
        if self.manager.try_pipelined:
            if len(self.sent_requests) > 4:
                return
        else:
            if len(self.sent_requests) > 0:
                return

        try:
            url, consumer = self.manager.queued_requests.pop(0)
            debug('request accepted by channel')
        except IndexError:
            return

        # TODO: If there are too many already in flight, don't take one.
        # TODO: If the socket's not writable (tx buffer full), don't take.
        self._push_request_http10(url, consumer)

    def _push_request_http10(self, url, consumer):
        """Send a request, and add it to the outstanding queue."""
        # TODO: check the url requested is appropriate for this connection

        # TODO: If there are too many requests outstanding or (less likely) the
        # connection fails, queue it for later use.

        # TODO: Keep track of requests that have been sent but not yet fulfilled,
        # because we might need to retransmit them if the connection fails. (Or
        # should the caller do that?)

        request = self._form_request_http10(url)
        debug('send request for %s from %r' % (url, self))

        # dispatcher_with_send handles buffering the data until it can
        # be written, and hooks handle_write.
        self.send(request)

        self.sent_requests.append((url, consumer))

    def _form_request_http10(self, url):
        """Build the request text for url, ending with a blank line."""
        # TODO: get right vhost name
        request = [
            "GET %s HTTP/1.0" % (url),
            "Host: www.bazaar-ng.org",
            ]

        if self.manager.try_keepalive or self.manager.try_pipelined:
            request.extend([
                "Keep-Alive: 60",
                "Connection: keep-alive",
                ])

        # make sure to include a user agent
        for header in request:
            if string.lower(header).startswith("user-agent:"):
                break
        else:
            request.append("User-Agent: %s" % self.user_agent)

        return string.join(request, "\r\n") + "\r\n\r\n"

    def handle_read(self):
        """Consume incoming bytes: parse headers, then feed body to consumers.

        Responses arrive strictly in the order requests were sent, so
        the head of sent_requests always identifies the next response.
        """
        data = self.recv(2048)

        self.data = self.data + data

        if len(data):
            debug('got %d bytes from socket' % len(data))
        else:
            debug('server closed connection')

        while self.data:
            consumer = self.sent_requests[0][1]
            if not self.response:
                # do not have a full response header yet

                # check if we've seen a full header
                debug('getting header for %s' % self.sent_requests[0][0])

                header = self.data.split("\r\n\r\n", 1)
                if len(header) <= 1:
                    # wait for more data to complete the header
                    return
                header, self.data = header

                self.response = _parse_response_http10(header)
                self.content_remaining = self.response.content_length

            if not self.data:
                return

            # we now know how many (more) content bytes we have, and how much
            # is in the data buffer. there are two main possibilities:
            # too much data, and some must be left behind containing the next
            # response headers, or too little, or possibly just right

            want = self.content_remaining
            if want > 0:
                got_data = self.data[:want]
                self.data = self.data[want:]

                assert got_data

                self.content_remaining -= len(got_data)

                debug('pass back %d bytes of %s, %d remain'
                      % (len(got_data),
                         self.sent_requests[0][0],
                         self.content_remaining))
                # Bug fix: this previously fed the raw recv() buffer
                # ('data'), which could contain bytes beyond this
                # response's body (e.g. the next response's header) and
                # could double-deliver earlier buffered bytes.  Feed
                # only the slice belonging to this response.
                consumer.feed(got_data)

            if self.content_remaining == 0:
                del self.sent_requests[0]

                debug('content complete')
                consumer.content_complete()

                # reset lots of things and try to get the next response header
                if self.response.connection_reply == 'close':
                    debug('server requested close')
                    self.manager._channel_closed(self)
                    self.close()
                elif not self.manager.try_keepalive:
                    debug('no keepalive for this socket')
                    self.manager._channel_closed(self)
                    self.close()
                else:
                    debug("ready for next header...")
                    self.take_one()
                self.response = None

    def handle_close(self):
        debug('async told us of close on %r' % self)
        # if there are outstanding requests should probably reopen and
        # retransmit, but if we're not making any progress then give up
        self.manager._channel_closed(self)
        self.close()
442
443
444
class DummyConsumer:
    """Consumer that saves each downloaded URL under /tmp/download/.

    The output filename is the last path component of the URL; a
    progress bar is advanced as each download completes.
    """
    def __init__(self, url, pb):
        self.url = url
        self.outf = None
        self._pb = pb

    def feed(self, data):
        # Open the output file lazily, on the first chunk of content.
        if self.outf is None:
            name = self.url[self.url.rindex('/')+1:]
            self.outf = file('/tmp/download/' + name, 'wb')
        self.outf.write(data)

    def error(self, err_info):
        # Report the failure and abort the whole process.
        import traceback
        error('error reported to consumer')
        traceback.print_exception(err_info[0], err_info[1], err_info[2])
        sys.exit(1)

    def content_complete(self):
        info('content complete from %s' % self.url)
        self.outf.close()
        self.outf = None
        # using last_cnt is cheating
        self._pb.update('downloading inventory',
                        self._pb.last_cnt+1,
                        self._pb.last_total)
472
473
474
475
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    mgr = DownloadManager()

    from bzrlib.branch import Branch
    from bzrlib.progress import ProgressBar

    pb = ProgressBar()
    history = Branch('/home/mbp/work/bzr').revision_history()
    pb.update('downloading inventories', 0, len(history))

    # Queue one inventory fetch per revision, then let asyncore drive
    # them all to completion.
    for rev_id in history:
        url = ('http://www.bazaar-ng.org/bzr/bzr.dev/.bzr/inventory-store/'
               + rev_id + '.gz')
        mgr.enqueue(url, DummyConsumer(url, pb))

    mgr.run()
493
    
494
495
496
    
497
#     for url in ['http://www.bazaar-ng.org/',
498
#                 'http://www.bazaar-ng.org/tutorial.html',
499
#                 'http://www.bazaar-ng.org/download.html',
500
#                 'http://www.bazaar-ng.org/bzr/bzr.dev/.bzr/revision-store/mbp@hope-20050415013653-3b3c9c3d33fae0a6.gz',
501
#                 ]: