~bzr-pqm/bzr/bzr.dev

5089.1.1 by Martin Pool
Fix typo in ReadVFile.readline (thanks mnordhoff)
1
# Copyright (C) 2007, 2009, 2010 Canonical Ltd
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
16
17
"""Container format for Bazaar data.
18
2916.2.13 by Andrew Bennetts
Improve some docstrings.
19
"Containers" and "records" are described in
20
doc/developers/container-format.txt.
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
21
"""
22
2661.2.2 by Robert Collins
* ``bzrlib.pack.make_readv_reader`` allows readv based access to pack
23
from cStringIO import StringIO
2506.5.2 by Andrew Bennetts
Raise InvalidRecordError on invalid names.
24
import re
25
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
26
from bzrlib import errors
27
28
2535.3.26 by Andrew Bennetts
Revert merge of container-format changes rejected for bzr.dev (i.e. undo andrew.bennetts@canonical.com-20070717044423-cetp5spep142xsr4).
29
# Format marker: the first line of a container.  ContainerSerialiser.begin
# emits it followed by a newline, and the readers compare the first line they
# read against it.
FORMAT_ONE = "Bazaar pack format 1 (introduced in 0.18)"


# Matches a single whitespace character (tab, newline, vertical tab, form
# feed, carriage return or space).  _check_name uses it to reject names.
_whitespace_re = re.compile('[\t\n\x0b\x0c\r ]')
2506.5.2 by Andrew Bennetts
Raise InvalidRecordError on invalid names.
33
34
35
def _check_name(name):
    """Run a cheap sanity check over a single record name.

    The only rule enforced at present is that the name must not contain any
    whitespace character (as matched by ``_whitespace_re``).

    :param name: the name to check.
    :raises InvalidRecordError: if name is not valid.
    :seealso: _check_name_encoding
    """
    whitespace_match = _whitespace_re.search(name)
    if whitespace_match is None:
        # No whitespace anywhere in the name: nothing to complain about.
        return
    raise errors.InvalidRecordError("%r is not a valid name." % (name,))
46
47
2506.6.1 by Andrew Bennetts
Return a callable instead of a str from read, and add more validation.
48
def _check_name_encoding(name):
    """Check that 'name' is valid UTF-8.

    This is separate from _check_name because UTF-8 decoding is relatively
    expensive, and we usually want to avoid it.

    :param name: the encoded name; it must provide a ``decode`` method.
    :raises InvalidRecordError: if name is not valid UTF-8.
    """
    try:
        # Only the success or failure of the decode matters; the decoded
        # value itself is discarded.
        name.decode('utf-8')
    except UnicodeDecodeError as e:
        # "except X as e" is accepted by Python 2.6+ and Python 3, unlike the
        # old "except X, e" spelling.
        raise errors.InvalidRecordError(str(e))
60
61
2916.2.5 by Andrew Bennetts
Extract a ContainerSerialiser class from ContainerWriter.
62
class ContainerSerialiser(object):
    """Stateless helper that produces the byte strings making up a container.

    Each method simply returns bytes; nothing is written anywhere.  See
    ContainerWriter for an interface that writes to a callable and tracks
    offsets.
    """

    def begin(self):
        """Return the bytes that open a container: the format line."""
        return FORMAT_ONE + "\n"

    def end(self):
        """Return the bytes that close a container: the end marker."""
        return "E"

    def bytes_header(self, length, names):
        """Return the header for a Bytes record.

        The header is the kind marker ``B``, the body length on its own
        line, one line per name tuple (its parts joined by NUL bytes), and a
        terminating empty line.

        :param length: the length of the record body.
        :param names: an iterable of name tuples.
        :raises InvalidRecordError: if any name part fails _check_name.
        """
        # Kind marker immediately followed by the length line.
        sections = ["B", str(length) + "\n"]
        for name_tuple in names:
            # Every part is validated before the tuple is serialised.
            for part in name_tuple:
                _check_name(part)
            sections.append('\x00'.join(name_tuple) + "\n")
        # An empty line marks the end of the headers.
        sections.append("\n")
        return ''.join(sections)

    def bytes_record(self, bytes, names):
        """Return the complete serialised Bytes record: header plus body.

        If the content may be large, prefer building the header with
        bytes_header and streaming the content out separately.

        :param bytes: the record body.
        :param names: an iterable of name tuples.
        """
        header = self.bytes_header(len(bytes), names)
        return header + bytes
103
2916.2.5 by Andrew Bennetts
Extract a ContainerSerialiser class from ContainerWriter.
104
2506.3.1 by Andrew Bennetts
More progress:
105
class ContainerWriter(object):
    """Writes a container through a caller-supplied write callable.

    :attribute records_written: The number of user records added to the
        container. This does not count the prelude or suffix of the container
        introduced by the begin() and end() methods.
    :attribute current_offset: The total number of bytes passed to
        write_func so far.
    """

    # Records smaller than this are joined with their header and written in
    # one call; larger ones are written as two calls so the body is not
    # copied.  Trades memory usage and copying against the number of IO ops.
    _JOIN_WRITES_THRESHOLD = 100000

    def __init__(self, write_func):
        """Constructor.

        :param write_func: a callable that will be called when this
            ContainerWriter needs to write some bytes.
        """
        self._serialiser = ContainerSerialiser()
        self._write_func = write_func
        self.records_written = 0
        self.current_offset = 0

    def begin(self):
        """Begin writing a container."""
        prelude = self._serialiser.begin()
        self.write_func(prelude)

    def write_func(self, bytes):
        """Write bytes via the wrapped callable and advance current_offset."""
        self._write_func(bytes)
        self.current_offset += len(bytes)

    def end(self):
        """Finish writing a container."""
        suffix = self._serialiser.end()
        self.write_func(suffix)

    def add_bytes_record(self, bytes, names):
        """Add a Bytes record with the given names.

        :param bytes: The bytes to insert.
        :param names: The names to give the inserted bytes. Each name is
            a tuple of bytestrings. The bytestrings may not contain
            whitespace.
        :return: An offset, length tuple. The offset is the offset
            of the record within the container, and the length is the
            length of data that will need to be read to reconstitute the
            record. These offset and length can only be used with the pack
            interface - they might be offset by headers or other such details
            and thus are only suitable for use by a ContainerReader.
        """
        start_offset = self.current_offset
        body_length = len(bytes)
        # The header is serialised (and the names validated) before anything
        # is written.
        header = self._serialiser.bytes_header(body_length, names)
        if body_length >= self._JOIN_WRITES_THRESHOLD:
            # Large body: two writes, no concatenation.
            self.write_func(header)
            self.write_func(bytes)
        else:
            self.write_func(header + bytes)
        self.records_written += 1
        # Memo of where the record was written, to allow random access.
        return start_offset, self.current_offset - start_offset
165
2506.3.1 by Andrew Bennetts
More progress:
166
2661.2.2 by Robert Collins
* ``bzrlib.pack.make_readv_reader`` allows readv based access to pack
167
class ReadVFile(object):
    """Adapt a readv result iterator to a file like protocol.

    The readv result must support the iterator protocol returning (offset,
    data_bytes) pairs.  Each pair is treated as one "hunk"; neither read nor
    readline will cross from one hunk into the next.
    """

    # XXX: This could be a generic transport class, as other code may want to
    # gradually consume the readv result.

    def __init__(self, readv_result):
        """Construct a new ReadVFile wrapper.

        :seealso: make_readv_reader

        :param readv_result: the most recent readv result - list or generator
        """
        # readv can return a sequence or an iterator, but we require an
        # iterator to know how much has been consumed.
        self.readv_result = iter(readv_result)
        # File-like view of the current hunk; None until the first read.
        self._string = None

    def _next(self):
        """Advance to the next hunk if the current one is fully consumed.

        StopIteration from the underlying iterator is not caught here.
        """
        if (self._string is None or
            self._string.tell() == self._string_length):
            # The builtin next() works on any iterator under both Python 2.6+
            # and Python 3, unlike calling the .next() method directly.
            offset, data = next(self.readv_result)
            self._string_length = len(data)
            self._string = StringIO(data)

    def read(self, length):
        """Read exactly length bytes from the current hunk.

        :raises BzrError: if the current hunk has fewer than length bytes
            remaining.
        """
        self._next()
        result = self._string.read(length)
        if len(result) < length:
            raise errors.BzrError('wanted %d bytes but next '
                'hunk only contains %d: %r...' %
                (length, len(result), result[:20]))
        return result

    def readline(self):
        """Read one line from the current hunk.

        Note that readline will not cross readv segments.

        :raises BzrError: if the hunk ends before a newline is found.
        """
        self._next()
        result = self._string.readline()
        # endswith() rather than result[-1]: an empty hunk yields an empty
        # result, and indexing it would raise IndexError instead of
        # reporting the short read.
        if (self._string.tell() == self._string_length
            and not result.endswith('\n')):
            raise errors.BzrError('short readline in the readvfile hunk: %r'
                % (result, ))
        return result
214
215
216
def make_readv_reader(transport, filename, requested_records):
    """Build a ContainerReader that reads only the selected records.

    The format line at the start of the pack is always requested first,
    followed by the requested records in the order given.

    :param transport: The transport the pack file is located on.
    :param filename: The filename of the pack file.
    :param requested_records: The record offset, length tuples as returned
        by add_bytes_record for the desired records.
    """
    # The format marker plus its trailing newline.
    format_block = (0, len(FORMAT_ONE)+1)
    readv_blocks = [format_block] + list(requested_records)
    readv_result = transport.readv(filename, readv_blocks)
    return ContainerReader(ReadVFile(readv_result))
229
230
2506.3.1 by Andrew Bennetts
More progress:
231
class BaseReader(object):
    """Shared plumbing for readers that pull data from a file-like object."""

    def __init__(self, source_file):
        """Constructor.

        :param source_file: a file-like object with `read` and `readline`
            methods.
        """
        self._source = source_file

    def reader_func(self, length=None):
        """Return the result of ``read(length)`` on the source file."""
        return self._source.read(length)

    def _read_line(self):
        """Read one line from the source and return it without its newline.

        :raises UnexpectedEndOfContainerError: if the line read does not end
            with a newline.
        """
        raw_line = self._source.readline()
        if raw_line.endswith('\n'):
            return raw_line.rstrip('\n')
        raise errors.UnexpectedEndOfContainerError()
2506.3.1 by Andrew Bennetts
More progress:
249
250
251
class ContainerReader(BaseReader):
    """A class for reading Bazaar's container format."""

    def iter_records(self):
        """Iterate over the container, yielding each record as it is read.

        The format line is read and checked immediately, before the iterator
        is returned.

        Each yielded record is the result of ``BytesRecordReader.read``: a
        2-tuple of (names, callable), where the callable takes one argument,
        ``max_length``.

        You **must not** call the callable after advancing the iterator to the
        next record.  That is, this code is invalid::

            record_iter = container.iter_records()
            names1, callable1 = record_iter.next()
            names2, callable2 = record_iter.next()
            bytes1 = callable1(None)

        As it will give incorrect results and invalidate the state of the
        ContainerReader.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: BytesRecordReader.read
        """
        self._read_format()
        return self._iter_records()

    def iter_record_objects(self):
        """Iterate over the container, yielding a reader object per record.

        The format line is read and checked immediately, before the iterator
        is returned.

        Each yielded record will be an object with ``read`` and ``validate``
        methods.  Like with iter_records, it is not safe to use a record object
        after advancing the iterator to yield next record.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: iter_records
        """
        self._read_format()
        return self._iter_record_objects()

    def _iter_records(self):
        """Yield ``record.read()`` for each record object in the stream."""
        for record_object in self._iter_record_objects():
            yield record_object.read()

    def _iter_record_objects(self):
        """Yield a BytesRecordReader per record until the end marker."""
        while True:
            kind = self.reader_func(1)
            if kind == 'E':
                # End marker.  There are no more records.
                return
            if kind == 'B':
                # Bytes record.
                yield BytesRecordReader(self._source)
                continue
            if kind == '':
                # End of stream encountered, but no End Marker record seen, so
                # this container is incomplete.
                raise errors.UnexpectedEndOfContainerError()
            # Unknown record type.
            raise errors.UnknownRecordTypeError(kind)

    def _read_format(self):
        """Read the first line and check that it is the known format marker.

        :raises UnknownContainerFormatError: if the line is not FORMAT_ONE.
        """
        format_line = self._read_line()
        if format_line == FORMAT_ONE:
            return
        raise errors.UnknownContainerFormatError(format_line)

    def validate(self):
        """Validate this container and its records.

        Validating consumes the data stream just like iter_records and
        iter_record_objects, so you cannot call it after
        iter_records/iter_record_objects.

        :raises ContainerError: if something is invalid.
        """
        seen_names = set()
        for record_names, read_body in self.iter_records():
            # Consume the whole body of this record.
            read_body(None)
            for name_tuple in record_names:
                for part in name_tuple:
                    _check_name_encoding(part)
                # Check that the name is unique.  Note that Python will refuse
                # to decode non-shortest forms of UTF-8 encoding, so there is no
                # risk that the same unicode string has been encoded two
                # different ways.
                if name_tuple in seen_names:
                    raise errors.DuplicateRecordNameError(name_tuple[0])
                seen_names.add(name_tuple)
        # Nothing may follow the end marker.
        trailing = self.reader_func(1)
        if trailing != '':
            raise errors.ContainerHasExcessDataError(trailing)
347
2506.3.1 by Andrew Bennetts
More progress:
348
349
class BytesRecordReader(BaseReader):
    """Reads a single Bytes record (after its kind marker) from the source."""

    def read(self):
        """Read this record.

        You can either validate or read a record, you can't do both.

        :returns: A tuple of (names, callable).  names is a list of name
            tuples.  The callable can be called repeatedly to obtain the
            bytes for the record, with a max_length argument.  If max_length
            is None, returns all the bytes.  Because records can be
            arbitrarily large, using None is not recommended unless you have
            reason to believe the content will fit in memory.
        :raises InvalidRecordError: if the length line is not an integer or
            a name fails _check_name.
        """
        # First header line: the content length.
        raw_length = self._read_line()
        try:
            body_length = int(raw_length)
        except ValueError:
            raise errors.InvalidRecordError(
                "%r is not a valid length." % (raw_length,))

        # Following lines: one name tuple per line, up to an empty line.
        names = []
        for name_line in iter(self._read_line, ''):
            parts = tuple(name_line.split('\x00'))
            for part in parts:
                _check_name(part)
            names.append(parts)

        self._remaining_length = body_length
        return names, self._content_reader

    def _content_reader(self, max_length):
        """Return up to max_length bytes of the remaining record body.

        :param max_length: the maximum number of bytes to return, or None
            for everything that remains.
        :raises UnexpectedEndOfContainerError: if the source returns a
            different number of bytes than was asked for.
        """
        remaining = self._remaining_length
        wanted = remaining if max_length is None else min(max_length, remaining)
        # The counter is updated before the read is attempted.
        self._remaining_length = remaining - wanted
        data = self.reader_func(wanted)
        if len(data) != wanted:
            raise errors.UnexpectedEndOfContainerError()
        return data

    def validate(self):
        """Validate this record.

        You can either validate or read, you can't do both.

        :raises ContainerError: if this record is invalid.
        """
        record_names, read_body = self.read()
        for name_tuple in record_names:
            for part in name_tuple:
                _check_name_encoding(part)
        # Consume the body so a short record is detected.
        read_body(None)
407
2916.2.1 by Andrew Bennetts
Initial implementation of a 'push' parser for the container format.
408
409
class ContainerPushParser(object):
    """A "push" parser for container format 1.

    It accepts bytes via the ``accept_bytes`` method, and parses them into
    records which can be retrieved via the ``read_pending_records`` method.

    :attribute finished: True once the end marker has been parsed.
    """

    def __init__(self):
        # Bytes received but not yet consumed by the state machine.
        self._buffer = ''
        # The bound method that handles the current parser state.
        self._state_handler = self._state_expecting_format_line
        # Completed (names, body_bytes) records not yet handed to the caller.
        self._parsed_records = []
        self._reset_current_record()
        self.finished = False

    def _reset_current_record(self):
        """Forget the length and names of the record being parsed."""
        self._current_record_length = None
        self._current_record_names = []

    def accept_bytes(self, bytes):
        """Add bytes to the buffer and parse as much of it as possible.

        :param bytes: the next chunk of the serialised container.
        :raises ContainerError: if the data is not a valid container (see
            the individual state handlers).
        """
        self._buffer += bytes
        # Keep iterating the state machine until it stops consuming bytes from
        # the buffer.  A state change also counts as progress, because a
        # handler can move on without consuming anything (e.g. the body of a
        # zero-length record).
        last_buffer_length = None
        cur_buffer_length = len(self._buffer)
        last_state_handler = None
        while (cur_buffer_length != last_buffer_length
               or last_state_handler != self._state_handler):
            last_buffer_length = cur_buffer_length
            last_state_handler = self._state_handler
            self._state_handler()
            cur_buffer_length = len(self._buffer)

    def read_pending_records(self, max=None):
        """Remove and return records that have been completely parsed.

        :param max: the maximum number of records to return, or None (the
            default) to return all of them.  ``max=0`` returns an empty list
            and leaves the pending records untouched.
        :return: a list of (names, body_bytes) tuples.
        """
        # Compare against None rather than testing truthiness, so that an
        # explicit limit of 0 is honoured instead of meaning "everything".
        if max is None:
            records = self._parsed_records
            self._parsed_records = []
            return records
        records = self._parsed_records[:max]
        del self._parsed_records[:max]
        return records

    def _consume_line(self):
        """Take a line out of the buffer, and return the line.

        If a newline byte is not found in the buffer, the buffer is
        unchanged and this returns None instead.
        """
        newline_pos = self._buffer.find('\n')
        if newline_pos != -1:
            line = self._buffer[:newline_pos]
            self._buffer = self._buffer[newline_pos+1:]
            return line
        else:
            return None

    def _state_expecting_format_line(self):
        """Parse the format line, once a complete line is buffered."""
        line = self._consume_line()
        if line is not None:
            if line != FORMAT_ONE:
                raise errors.UnknownContainerFormatError(line)
            self._state_handler = self._state_expecting_record_type

    def _state_expecting_record_type(self):
        """Parse the one-byte record kind marker ('B' or 'E')."""
        if len(self._buffer) >= 1:
            record_type = self._buffer[0]
            self._buffer = self._buffer[1:]
            if record_type == 'B':
                self._state_handler = self._state_expecting_length
            elif record_type == 'E':
                self.finished = True
                self._state_handler = self._state_expecting_nothing
            else:
                raise errors.UnknownRecordTypeError(record_type)

    def _state_expecting_length(self):
        """Parse the body-length line of a Bytes record."""
        line = self._consume_line()
        if line is not None:
            try:
                self._current_record_length = int(line)
            except ValueError:
                raise errors.InvalidRecordError(
                    "%r is not a valid length." % (line,))
            self._state_handler = self._state_expecting_name

    def _state_expecting_name(self):
        """Parse one name line; an empty line ends the list of names."""
        encoded_name_parts = self._consume_line()
        if encoded_name_parts == '':
            self._state_handler = self._state_expecting_body
        elif encoded_name_parts:
            name_parts = tuple(encoded_name_parts.split('\x00'))
            for name_part in name_parts:
                _check_name(name_part)
            self._current_record_names.append(name_parts)
        # Otherwise _consume_line returned None: no complete line is buffered
        # yet, so wait for more bytes.

    def _state_expecting_body(self):
        """Collect the record body once enough bytes are buffered."""
        if len(self._buffer) >= self._current_record_length:
            body_bytes = self._buffer[:self._current_record_length]
            self._buffer = self._buffer[self._current_record_length:]
            record = (self._current_record_names, body_bytes)
            self._parsed_records.append(record)
            self._reset_current_record()
            self._state_handler = self._state_expecting_record_type

    def _state_expecting_nothing(self):
        """State after the end marker: any further bytes are left unparsed."""
        pass

    def read_size_hint(self):
        """Return a suggested number of bytes to feed to accept_bytes next.

        This is 16384, or the number of bytes still missing from the current
        record body if that is larger.
        """
        hint = 16384
        if self._state_handler == self._state_expecting_body:
            remaining = self._current_record_length - len(self._buffer)
            if remaining < 0:
                remaining = 0
            return max(hint, remaining)
        return hint
2916.2.8 by Andrew Bennetts
Add bzrlib.pack.iter_records_from_file.
524
525
526
def iter_records_from_file(source_file):
    """Yield each record of the container read from source_file.

    The file is read in chunks sized by the parser's read_size_hint and fed
    to a ContainerPushParser; each parsed (names, body_bytes) record is
    yielded as soon as it is complete.  Iteration stops once the container's
    end marker has been parsed.

    :param source_file: a file-like object with a ``read(size)`` method that
        returns an empty string at end of file.
    :raises UnexpectedEndOfContainerError: if source_file is exhausted
        before the end marker has been seen.
    :raises ContainerError: if the parser detects any other corruption.
    """
    parser = ContainerPushParser()
    while True:
        data = source_file.read(parser.read_size_hint())
        parser.accept_bytes(data)
        for record in parser.read_pending_records():
            yield record
        if parser.finished:
            break
        if not data:
            # End of file without an end marker.  Without this check the
            # loop would spin forever, as every further read() returns
            # nothing and the parser can never finish.
            raise errors.UnexpectedEndOfContainerError()
2916.2.1 by Andrew Bennetts
Initial implementation of a 'push' parser for the container format.
535