137
class _SFTPReadvHelper(object):
138
"""A class to help with managing the state of a readv request."""
140
# See _get_requests for an explanation.
141
_max_request_size = 32768
143
def __init__(self, original_offsets, relpath, _report_activity):
144
"""Create a new readv helper.
146
:param original_offsets: The original requests given by the caller of
148
:param relpath: The name of the file (if known)
149
:param _report_activity: A Transport._report_activity bound method,
150
to be called as data arrives.
152
self.original_offsets = list(original_offsets)
153
self.relpath = relpath
154
self._report_activity = _report_activity
156
def _get_requests(self):
157
"""Break up the offsets into individual requests over sftp.
159
The SFTP spec only requires implementers to support 32kB requests. We
160
could try something larger (openssh supports 64kB), but then we have to
161
handle requests that fail.
162
So instead, we just break up our maximum chunks into 32kB chunks, and
163
asyncronously requests them.
164
Newer versions of paramiko would do the chunking for us, but we want to
165
start processing results right away, so we do it ourselves.
167
# TODO: Because we issue async requests, we don't 'fudge' any extra
168
# data. I'm not 100% sure that is the best choice.
170
# The first thing we do, is to collapse the individual requests as much
171
# as possible, so we don't issues requests <32kB
172
sorted_offsets = sorted(self.original_offsets)
173
coalesced = list(ConnectedTransport._coalesce_offsets(sorted_offsets,
174
limit=0, fudge_factor=0))
176
for c_offset in coalesced:
177
start = c_offset.start
178
size = c_offset.length
180
# Break this up into 32kB requests
182
next_size = min(size, self._max_request_size)
183
requests.append((start, next_size))
186
if 'sftp' in debug.debug_flags:
187
mutter('SFTP.readv(%s) %s offsets => %s coalesced => %s requests',
188
self.relpath, len(sorted_offsets), len(coalesced),
192
def request_and_yield_offsets(self, fp):
193
"""Request the data from the remote machine, yielding the results.
195
:param fp: A Paramiko SFTPFile object that supports readv.
196
:return: Yield the data requested by the original readv caller, one by
199
requests = self._get_requests()
200
offset_iter = iter(self.original_offsets)
201
cur_offset, cur_size = offset_iter.next()
202
# paramiko .readv() yields strings that are in the order of the requests
203
# So we track the current request to know where the next data is
204
# being returned from.
210
# This is used to buffer chunks which we couldn't process yet
211
# It is (start, end, data) tuples.
213
# Create an 'unlimited' data stream, so we stop based on requests,
214
# rather than just because the data stream ended. This lets us detect
216
data_stream = itertools.chain(fp.readv(requests),
217
itertools.repeat(None))
218
for (start, length), data in itertools.izip(requests, data_stream):
220
if cur_coalesced is not None:
221
raise errors.ShortReadvError(self.relpath,
222
start, length, len(data))
223
if len(data) != length:
224
raise errors.ShortReadvError(self.relpath,
225
start, length, len(data))
226
self._report_activity(length, 'read')
228
# This is the first request, just buffer it
229
buffered_data = [data]
230
buffered_len = length
232
elif start == last_end:
233
# The data we are reading fits neatly on the previous
234
# buffer, so this is all part of a larger coalesced range.
235
buffered_data.append(data)
236
buffered_len += length
238
# We have an 'interrupt' in the data stream. So we know we are
239
# at a request boundary.
241
# We haven't consumed the buffer so far, so put it into
242
# data_chunks, and continue.
243
buffered = ''.join(buffered_data)
244
data_chunks.append((input_start, buffered))
246
buffered_data = [data]
247
buffered_len = length
248
last_end = start + length
249
if input_start == cur_offset and cur_size <= buffered_len:
250
# Simplify the next steps a bit by transforming buffered_data
251
# into a single string. We also have the nice property that
252
# when there is only one string ''.join([x]) == x, so there is
254
buffered = ''.join(buffered_data)
255
# Clean out buffered data so that we keep memory
259
# TODO: We *could* also consider the case where cur_offset is in
260
# in the buffered range, even though it doesn't *start*
261
# the buffered range. But for packs we pretty much always
262
# read in order, so you won't get any extra data in the
264
while (input_start == cur_offset
265
and (buffered_offset + cur_size) <= buffered_len):
266
# We've buffered enough data to process this request, spit it
268
cur_data = buffered[buffered_offset:buffered_offset + cur_size]
269
# move the direct pointer into our buffered data
270
buffered_offset += cur_size
271
# Move the start-of-buffer pointer
272
input_start += cur_size
273
# Yield the requested data
274
yield cur_offset, cur_data
275
cur_offset, cur_size = offset_iter.next()
276
# at this point, we've consumed as much of buffered as we can,
277
# so break off the portion that we consumed
278
if buffered_offset == len(buffered_data):
279
# No tail to leave behind
283
buffered = buffered[buffered_offset:]
284
buffered_data = [buffered]
285
buffered_len = len(buffered)
287
buffered = ''.join(buffered_data)
289
data_chunks.append((input_start, buffered))
291
if 'sftp' in debug.debug_flags:
292
mutter('SFTP readv left with %d out-of-order bytes',
293
sum(map(lambda x: len(x[1]), data_chunks)))
294
# We've processed all the readv data, at this point, anything we
295
# couldn't process is in data_chunks. This doesn't happen often, so
296
# this code path isn't optimized
297
# We use an interesting process for data_chunks
298
# Specifically if we have "bisect_left([(start, len, entries)],
300
# If start == qstart, then we get the specific node. Otherwise we
301
# get the previous node
303
idx = bisect.bisect_left(data_chunks, (cur_offset,))
304
if idx < len(data_chunks) and data_chunks[idx][0] == cur_offset:
305
# The data starts here
306
data = data_chunks[idx][1][:cur_size]
308
# The data is in a portion of a previous page
310
sub_offset = cur_offset - data_chunks[idx][0]
311
data = data_chunks[idx][1]
312
data = data[sub_offset:sub_offset + cur_size]
314
# We are missing the page where the data should be found,
317
if len(data) != cur_size:
318
raise AssertionError('We must have miscalulated.'
319
' We expected %d bytes, but only found %d'
320
% (cur_size, len(data)))
321
yield cur_offset, data
322
cur_offset, cur_size = offset_iter.next()
325
134
class SFTPTransport(ConnectedTransport):
326
135
"""Transport implementation for SFTP access."""
473
def _sftp_readv(self, fp, offsets, relpath):
261
def _sftp_readv(self, fp, offsets, relpath='<unknown>'):
474
262
"""Use the readv() member of fp to do async readv.
476
Then read them using paramiko.readv(). paramiko.readv()
264
And then read them using paramiko.readv(). paramiko.readv()
477
265
does not support ranges > 64K, so it caps the request size, and
478
just reads until it gets all the stuff it wants.
266
just reads until it gets all the stuff it wants
480
helper = _SFTPReadvHelper(offsets, relpath, self._report_activity)
481
return helper.request_and_yield_offsets(fp)
268
offsets = list(offsets)
269
sorted_offsets = sorted(offsets)
271
# The algorithm works as follows:
272
# 1) Coalesce nearby reads into a single chunk
273
# This generates a list of combined regions, the total size
274
# and the size of the sub regions. This coalescing step is limited
275
# in the number of nearby chunks to combine, and is allowed to
276
# skip small breaks in the requests. Limiting it makes sure that
277
# we can start yielding some data earlier, and skipping means we
278
# make fewer requests. (Beneficial even when using async)
279
# 2) Break up this combined regions into chunks that are smaller
280
# than 64KiB. Technically the limit is 65536, but we are a
281
# little bit conservative. This is because sftp has a maximum
282
# return chunk size of 64KiB (max size of an unsigned short)
283
# 3) Issue a readv() to paramiko to create an async request for
285
# 4) Read in the data as it comes back, until we've read one
286
# continuous section as determined in step 1
287
# 5) Break up the full sections into hunks for the original requested
288
# offsets. And put them in a cache
289
# 6) Check if the next request is in the cache, and if it is, remove
290
# it from the cache, and yield its data. Continue until no more
291
# entries are in the cache.
292
# 7) loop back to step 4 until all data has been read
294
# TODO: jam 20060725 This could be optimized one step further, by
295
# attempting to yield whatever data we have read, even before
296
# the first coallesced section has been fully processed.
298
# When coalescing for use with readv(), we don't really need to
299
# use any fudge factor, because the requests are made asynchronously
300
coalesced = list(self._coalesce_offsets(sorted_offsets,
301
limit=self._max_readv_combine,
305
for c_offset in coalesced:
306
start = c_offset.start
307
size = c_offset.length
309
# We need to break this up into multiple requests
311
next_size = min(size, self._max_request_size)
312
requests.append((start, next_size))
316
mutter('SFTP.readv() %s offsets => %s coalesced => %s requests',
317
len(offsets), len(coalesced), len(requests))
319
# Queue the current read until we have read the full coalesced section
322
cur_coalesced_stack = iter(coalesced)
323
cur_coalesced = cur_coalesced_stack.next()
325
# Cache the results, but only until they have been fulfilled
327
# turn the list of offsets into a stack
328
offset_stack = iter(offsets)
329
cur_offset_and_size = offset_stack.next()
331
for data in fp.readv(requests):
333
cur_data_len += len(data)
335
if cur_data_len < cur_coalesced.length:
337
if cur_data_len != cur_coalesced.length:
338
raise AssertionError(
339
"Somehow we read too much: %s != %s"
340
% (cur_data_len, cur_coalesced.length))
341
all_data = ''.join(cur_data)
345
for suboffset, subsize in cur_coalesced.ranges:
346
key = (cur_coalesced.start+suboffset, subsize)
347
data_map[key] = all_data[suboffset:suboffset+subsize]
349
# Now that we've read some data, see if we can yield anything back
350
while cur_offset_and_size in data_map:
351
this_data = data_map.pop(cur_offset_and_size)
352
yield cur_offset_and_size[0], this_data
353
cur_offset_and_size = offset_stack.next()
355
# We read a coalesced entry, so mark it as done
357
# Now that we've read all of the data for this coalesced section
359
cur_coalesced = cur_coalesced_stack.next()
361
if cur_coalesced is not None:
362
raise errors.ShortReadvError(relpath, cur_coalesced.start,
363
cur_coalesced.length, len(data))
483
365
def put_file(self, relpath, f, mode=None):