~bzr-pqm/bzr/bzr.dev

2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
1
# Copyright (C) 2007 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Container format for Bazaar data.
18
19
"Containers" and "records" are described in doc/developers/container-format.txt.
20
"""
21
2661.2.2 by Robert Collins
* ``bzrlib.pack.make_readv_reader`` allows readv based access to pack
22
from cStringIO import StringIO
2506.5.2 by Andrew Bennetts
Raise InvalidRecordError on invalid names.
23
import re
24
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
25
from bzrlib import errors
26
27
2506.2.10 by Andrew Bennetts
Add '(introduced in 0.18)' to pack format string.
28
# Format marker for version 1 containers.  ContainerWriter.begin() writes this
# string followed by a newline, and ContainerReader._read_format() compares the
# first line of the stream against it.
FORMAT_ONE = "Bazaar pack format 1 (introduced in 0.18)"
2506.2.1 by Andrew Bennetts
Start implementing container format reading and writing.
29
30
2506.5.2 by Andrew Bennetts
Raise InvalidRecordError on invalid names.
31
# Matches a single tab, newline, vertical tab, form feed, carriage return or
# space; _check_name() uses it to reject names containing whitespace.
_whitespace_re = re.compile('[\t\n\x0b\x0c\r ]')
32
33
34
def _check_name(name):
    """Perform a cheap sanity check on a record name.

    The only rule enforced at present is that the name must not contain any
    whitespace character.

    :param name: the name (a bytestring) to check.
    :raises InvalidRecordError: if name is not valid.
    :seealso: _check_name_encoding
    """
    whitespace_match = _whitespace_re.search(name)
    if whitespace_match is None:
        # No whitespace anywhere in the name: nothing to complain about.
        return
    raise errors.InvalidRecordError("%r is not a valid name." % (name,))
45
46
2506.6.1 by Andrew Bennetts
Return a callable instead of a str from read, and add more validation.
47
def _check_name_encoding(name):
48
    """Check that 'name' is valid UTF-8.
49
    
50
    This is separate from _check_name because UTF-8 decoding is relatively
51
    expensive, and we usually want to avoid it.
52
53
    :raises InvalidRecordError: if name is not valid UTF-8.
54
    """
55
    try:
56
        name.decode('utf-8')
57
    except UnicodeDecodeError, e:
58
        raise errors.InvalidRecordError(str(e))
59
60
2506.3.1 by Andrew Bennetts
More progress:
61
class ContainerWriter(object):
    """A class for writing containers.

    Every byte string handed to the underlying write callable is counted in
    ``current_offset``, which is what allows add_bytes_record to report where
    each record was written.
    """

    def __init__(self, write_func):
        """Constructor.

        :param write_func: a callable that will be called when this
            ContainerWriter needs to write some bytes.
        """
        self._write_func = write_func
        # Total number of bytes passed to write_func so far.
        self.current_offset = 0

    def begin(self):
        """Begin writing a container."""
        self.write_func(FORMAT_ONE + "\n")

    def write_func(self, bytes):
        """Write bytes via the wrapped callable and advance current_offset.

        :param bytes: the byte string to write.
        """
        self._write_func(bytes)
        self.current_offset += len(bytes)

    def end(self):
        """Finish writing a container."""
        self.write_func("E")

    def add_bytes_record(self, bytes, names):
        """Add a Bytes record with the given names.

        All names are validated before anything is written, so an invalid
        name never leaves a half-written record in the output.

        :param bytes: The bytes to insert.
        :param names: The names to give the inserted bytes. Each name is
            a tuple of bytestrings. The bytestrings may not contain
            whitespace.
        :return: An offset, length tuple. The offset is the offset
            of the record within the container, and the length is the
            length of data that will need to be read to reconstitute the
            record. These offset and length can only be used with the pack
            interface - they might be offset by headers or other such details
            and thus are only suitable for use by a ContainerReader.
        :raises InvalidRecordError: if any name is not valid; nothing has
            been written in that case.
        """
        # Validate and serialise every name up front.  Doing this before the
        # first write means a bad name cannot corrupt the output stream.
        name_lines = []
        for name_tuple in names:
            for name in name_tuple:
                _check_name(name)
            name_lines.append('\x00'.join(name_tuple) + "\n")
        current_offset = self.current_offset
        # Kind marker
        self.write_func("B")
        # Length
        self.write_func(str(len(bytes)) + "\n")
        # Names
        for name_line in name_lines:
            self.write_func(name_line)
        # End of headers
        self.write_func("\n")
        # Finally, the contents.
        self.write_func(bytes)
        # return a memo of where we wrote data to allow random access.
        return current_offset, self.current_offset - current_offset
2506.3.1 by Andrew Bennetts
More progress:
117
118
2661.2.2 by Robert Collins
* ``bzrlib.pack.make_readv_reader`` allows readv based access to pack
119
class ReadVFile(object):
    """Adapt a readv result iterator to a file like protocol.

    Each item pulled from the readv result is unpacked as a 2-tuple whose
    second element is the data of one hunk; the first element is not used
    here.  Reads never span two hunks.
    """

    def __init__(self, readv_result):
        """Constructor.

        :param readv_result: an object whose ``next()`` method returns the
            successive 2-tuples of a readv result.
        """
        self.readv_result = readv_result
        # the most recent readv result block
        self._string = None
        # length of the data held in self._string
        self._string_length = 0

    def _next(self):
        """Load the next hunk if there is no current one or it is used up."""
        if (self._string is None or
            self._string.tell() == self._string_length):
            _unused, data = self.readv_result.next()
            self._string_length = len(data)
            self._string = StringIO(data)

    def read(self, length):
        """Read exactly length bytes from the current hunk.

        :raises BzrError: if the current hunk holds fewer than length
            unread bytes.
        """
        self._next()
        result = self._string.read(length)
        if len(result) < length:
            raise errors.BzrError('request for too much data from a readv hunk.')
        return result

    def readline(self):
        """Note that readline will not cross readv segments.

        :raises BzrError: if the current hunk ends before a newline is found.
        """
        self._next()
        result = self._string.readline()
        # endswith (rather than result[-1]) so that an empty hunk is reported
        # as a short readline instead of blowing up with IndexError.
        if (self._string.tell() == self._string_length
                and not result.endswith('\n')):
            raise errors.BzrError('short readline in the readvfile hunk.')
        return result
148
149
150
def make_readv_reader(transport, filename, requested_records):
    """Build a ContainerReader that only reads the requested records.

    :param transport: The transport the pack file is located on.
    :param filename: The filename of the pack file.
    :param requested_records: The record offset, length tuples as returned
        by add_bytes_record for the desired records.
    :return: a ContainerReader wrapping a ReadVFile over the readv result.
    """
    # The format line (plus its trailing newline) is always fetched first,
    # ahead of the requested records.
    header_block = (0, len(FORMAT_ONE) + 1)
    blocks = [header_block] + list(requested_records)
    hunks = transport.readv(filename, blocks)
    return ContainerReader(ReadVFile(hunks))
163
164
2506.3.1 by Andrew Bennetts
More progress:
165
class BaseReader(object):
    """Common helpers for readers that pull data from a file-like source."""

    def __init__(self, source_file):
        """Constructor.

        :param source_file: a file-like object with `read` and `readline`
            methods.
        """
        self._source = source_file

    def reader_func(self, length=None):
        """Read from the source, passing length straight to its `read`."""
        source = self._source
        return source.read(length)

    def _read_line(self):
        """Read one line from the source and return it without its newline.

        :raises UnexpectedEndOfContainerError: if the line read does not end
            with a newline.
        """
        raw_line = self._source.readline()
        if raw_line.endswith('\n'):
            return raw_line.rstrip('\n')
        raise errors.UnexpectedEndOfContainerError()
2506.3.1 by Andrew Bennetts
More progress:
183
184
185
class ContainerReader(BaseReader):
    """Reader for streams in Bazaar's container format."""

    def iter_records(self):
        """Iterate over the container, yielding each record as it is read.

        Each yielded record will be a 2-tuple of (names, callable), where
        names is a ``list`` and callable is a function that takes one
        argument, ``max_length``, and returns bytes of the record.

        You **must not** call the callable after advancing the iterator to the
        next record.  That is, this code is invalid::

            record_iter = container.iter_records()
            names1, callable1 = record_iter.next()
            names2, callable2 = record_iter.next()
            bytes1 = callable1(None)

        As it will give incorrect results and invalidate the state of the
        ContainerReader.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: ContainerReader.read
        """
        # The format line is checked eagerly, before the generator is handed
        # back to the caller.
        self._read_format()
        return self._iter_records()

    def iter_record_objects(self):
        """Iterate over the container, yielding each record as it is read.

        Each yielded record will be an object with ``read`` and ``validate``
        methods.  Like with iter_records, it is not safe to use a record object
        after advancing the iterator to yield the next record.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: iter_records
        """
        self._read_format()
        return self._iter_record_objects()

    def _iter_records(self):
        """Yield the result of ``read()`` for every record object."""
        for record_object in self._iter_record_objects():
            yield record_object.read()

    def _iter_record_objects(self):
        """Yield a record reader per record until the end marker is seen."""
        while True:
            kind = self.reader_func(1)
            if kind == 'E':
                # End marker.  There are no more records.
                return
            if kind == '':
                # End of stream encountered, but no End Marker record seen, so
                # this container is incomplete.
                raise errors.UnexpectedEndOfContainerError()
            if kind != 'B':
                # Unknown record type.
                raise errors.UnknownRecordTypeError(kind)
            # Bytes record.
            yield BytesRecordReader(self._source)

    def _read_format(self):
        """Read the first line and check that it names a known format."""
        format_line = self._read_line()
        if format_line != FORMAT_ONE:
            raise errors.UnknownContainerFormatError(format_line)

    def validate(self):
        """Validate this container and its records.

        Validating consumes the data stream just like iter_records and
        iter_record_objects, so you cannot call it after
        iter_records/iter_record_objects.

        :raises ContainerError: if something is invalid.
        """
        seen_names = set()
        for names, read_content in self.iter_records():
            # Consume the whole body so the stream is positioned at the next
            # record.
            read_content(None)
            for name_tuple in names:
                for name_part in name_tuple:
                    _check_name_encoding(name_part)
                # Check that the name is unique.  Note that Python will refuse
                # to decode non-shortest forms of UTF-8 encoding, so there is no
                # risk that the same unicode string has been encoded two
                # different ways.
                if name_tuple in seen_names:
                    raise errors.DuplicateRecordNameError(name_tuple)
                seen_names.add(name_tuple)
        # Nothing may follow the end marker.
        trailing = self.reader_func(1)
        if trailing != '':
            raise errors.ContainerHasExcessDataError(trailing)
281
2506.3.1 by Andrew Bennetts
More progress:
282
283
class BytesRecordReader(BaseReader):
    """Reader for a single Bytes record.

    Reading starts at the record's length line.
    """

    def read(self):
        """Read this record.

        You can either validate or read a record, you can't do both.

        :returns: A tuple of (names, callable).  The callable can be called
            repeatedly to obtain the bytes for the record, with a max_length
            argument.  If max_length is None, returns all the bytes.  Because
            records can be arbitrarily large, using None is not recommended
            unless you have reason to believe the content will fit in memory.
        :raises InvalidRecordError: if the length line is not a non-negative
            integer, or if a name is not valid.
        :raises UnexpectedEndOfContainerError: if the stream ends inside the
            record headers.
        """
        # Read the content length.
        length_line = self._read_line()
        try:
            length = int(length_line)
        except ValueError:
            length = None
        # int() happily parses a leading minus sign.  A negative length would
        # later be passed to read() as a negative size, so reject it here as
        # an invalid record instead.
        if length is None or length < 0:
            raise errors.InvalidRecordError(
                "%r is not a valid length." % (length_line,))

        # Read the list of names.
        names = []
        while True:
            name_line = self._read_line()
            if name_line == '':
                # An empty line terminates the headers.
                break
            name_tuple = tuple(name_line.split('\x00'))
            for name in name_tuple:
                _check_name(name)
            names.append(name_tuple)

        self._remaining_length = length
        return names, self._content_reader

    def _content_reader(self, max_length):
        """Return up to max_length bytes of the record's remaining content.

        :param max_length: the maximum number of bytes to return, or None to
            return everything that remains.
        :raises UnexpectedEndOfContainerError: if the source yields fewer
            bytes than requested.
        """
        if max_length is None:
            length_to_read = self._remaining_length
        else:
            length_to_read = min(max_length, self._remaining_length)
        self._remaining_length -= length_to_read
        bytes = self.reader_func(length_to_read)
        if len(bytes) != length_to_read:
            raise errors.UnexpectedEndOfContainerError()
        return bytes

    def validate(self):
        """Validate this record.

        You can either validate or read, you can't do both.

        :raises ContainerError: if this record is invalid.
        """
        names, read_bytes = self.read()
        for name_tuple in names:
            for name in name_tuple:
                _check_name_encoding(name)
        # Consume the body so a truncated record is detected.
        read_bytes(None)
341