~bzr-pqm/bzr/bzr.dev

4763.2.4 by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry.
1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
4241.6.1 by Ian Clatworthy
chk_map code from brisbane-core
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17
"""Serializer object for CHK based inventory storage."""
18
19
from bzrlib import (
4398.5.2 by John Arbash Meinel
Merge the chk serializer, and update it for the new bencode locations.
20
    bencode,
4290.1.1 by Jelmer Vernooij
Add simple revision serializer based on RIO.
21
    cache_utf8,
4241.6.1 by Ian Clatworthy
chk_map code from brisbane-core
22
    inventory,
4290.1.1 by Jelmer Vernooij
Add simple revision serializer based on RIO.
23
    revision as _mod_revision,
4241.6.1 by Ian Clatworthy
chk_map code from brisbane-core
24
    xml6,
4543.2.7 by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5
25
    xml7,
4241.6.1 by Ian Clatworthy
chk_map code from brisbane-core
26
    )
27
4398.5.9 by John Arbash Meinel
it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')
28
29
def _validate_properties(props, _decode=cache_utf8._utf8_decode):
    """Decode every value of a revision-properties dict, in place.

    :param props: A dict of properties. Each value is passed to ``_decode``
        and replaced by the first element of the result.
    :param _decode: Decoder callable, bound as a default argument so the
        lookup happens once at definition time. It must return a sequence
        whose first item is the decoded value.
    :return: The same ``props`` object, with its values replaced.
    """
    # TODO: we really want an 'isascii' check for key
    # Cast the utf8 properties into Unicode 'in place'. Only values of
    # existing keys are rebound, so the dict does not change size while it
    # is being iterated.
    for key, value in props.iteritems():
        props[key] = _decode(value)[0]
    return props
4398.5.5 by John Arbash Meinel
Update the CHK Serializer to do lots more validation.
35
36
37
def _is_format_10(value):
38
    if value != 10:
39
        raise ValueError('Format number was not recognized, expected 10 got %d'
40
                         % (value,))
41
    return 10
42
43
4290.1.12 by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer.
44
class BEncodeRevisionSerializer1(object):
    """Simple revision serializer based around bencode.

    Revisions are written as a bencoded, ordered list of (key, value) pairs
    and read back by validating each pair against ``_schema``.
    """

    squashes_xml_invalid_characters = False

    # Maps {key:(Revision attribute, bencode_type, validator)}
    # This tells us what kind we expect bdecode to create, what variable on
    # Revision we should be using, and a function to call to validate/transform
    # the type.
    # TODO: add a 'validate_utf8' for things like revision_id and file_id
    #       and a validator for parent-ids
    _schema = {'format': (None, int, _is_format_10),
               'committer': ('committer', str, cache_utf8.decode),
               'timezone': ('timezone', int, None),
               'timestamp': ('timestamp', str, float),
               'revision-id': ('revision_id', str, None),
               'parent-ids': ('parent_ids', list, None),
               'inventory-sha1': ('inventory_sha1', str, None),
               'message': ('message', str, cache_utf8.decode),
               'properties': ('properties', dict, _validate_properties),
    }

    def write_revision_to_string(self, rev):
        """Serialize a revision to a bencoded string.

        :param rev: The revision to serialize. Its committer, timezone,
            properties, timestamp, revision_id, parent_ids, inventory_sha1
            and message attributes are read.
        :return: The result of ``bencode.bencode`` on the ordered list of
            (key, value) pairs.
        """
        encode_utf8 = cache_utf8._utf8_encode
        # Use a list of tuples rather than a dict
        # This lets us control the ordering, so that we are able to create
        # smaller deltas
        ret = [
            ("format", 10),
            ("committer", encode_utf8(rev.committer)[0]),
        ]
        # 'timezone' is optional on disk; it is omitted when unset.
        if rev.timezone is not None:
            ret.append(("timezone", rev.timezone))
        # For bzr revisions, the most common property is just 'branch-nick'
        # which changes infrequently.
        revprops = {}
        for key, value in rev.properties.iteritems():
            revprops[key] = encode_utf8(value)[0]
        ret.append(('properties', revprops))
        ret.extend([
            # The timestamp is stored as text with millisecond precision.
            ("timestamp", "%.3f" % rev.timestamp),
            ("revision-id", rev.revision_id),
            ("parent-ids", rev.parent_ids),
            ("inventory-sha1", rev.inventory_sha1),
            ("message", encode_utf8(rev.message)[0]),
        ])
        return bencode.bencode(ret)

    def write_revision(self, rev, f):
        """Serialize ``rev`` and write the resulting string to ``f``.

        :param rev: The revision to serialize.
        :param f: An object with a ``write`` method.
        """
        f.write(self.write_revision_to_string(rev))

    def read_revision_from_string(self, text):
        """Parse a bencoded revision text into a Revision object.

        :param text: The serialized revision.
        :return: A ``_mod_revision.Revision`` built from the decoded fields.
        :raises ValueError: If the decoded text is not a list, if a value is
            not of the type the schema expects, or if expected keys are
            missing. Validators may raise it too (for example for a format
            number other than 10).
        :raises KeyError: If the text contains a key that is not in the
            schema.
        """
        # TODO: consider writing a Revision decoder, rather than using the
        #       generic bencode decoder
        #       However, to decode all 25k revisions of bzr takes approx 1.3s
        #       If we remove all extra validation that goes down to about 1.2s.
        #       Of that time, probably 0.6s is spend in bencode.bdecode().
        #       Regardless 'time bzr log' of everything is 7+s, so 1.3s to
        #       extract revision texts isn't a majority of time.
        ret = bencode.bdecode(text)
        if not isinstance(ret, list):
            raise ValueError("invalid revision text")
        schema = self._schema
        # timezone is allowed to be missing, but should be set
        bits = {'timezone': None}
        for key, value in ret:
            # Will raise KeyError if the key is not a valid part of the
            # schema.
            # NOTE: a key that is given 2 times is NOT detected here; the
            # later value silently replaces the earlier one.
            var_name, expected_type, validator = schema[key]
            # Exact class match on purpose: subclasses are rejected too.
            if value.__class__ is not expected_type:
                raise ValueError('key %s did not conform to the expected type'
                                 ' %s, but was %s'
                                 % (key, expected_type, type(value)))
            if validator is not None:
                value = validator(value)
            bits[var_name] = value
        # Every schema entry maps to a distinct name in bits ('format' maps
        # to None), so a size mismatch means at least one key was absent.
        if len(bits) != len(schema):
            missing = [key for key, (var_name, _, _) in schema.iteritems()
                       if var_name not in bits]
            raise ValueError('Revision text was missing expected keys %s.'
                             ' text %r' % (missing, text))
        del bits[None]  # Get rid of 'format' since it doesn't get mapped
        rev = _mod_revision.Revision(**bits)
        return rev

    def read_revision(self, f):
        """Read all of ``f`` and parse it as a serialized revision.

        :param f: An object with a ``read`` method.
        :return: The result of ``read_revision_from_string``.
        """
        return self.read_revision_from_string(f.read())
132
133
4543.2.7 by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5
134
class CHKSerializerSubtree(BEncodeRevisionSerializer1, xml7.Serializer_v7):
    """A CHKInventory based serializer that supports tree references"""

    supported_kinds = set(['file', 'directory', 'symlink', 'tree-reference'])
    format_num = '9'
    revision_format_num = None
    support_altered_by_hack = False

    def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
        """Build an inventory entry from an element.

        :param elt: The element to unpack. Its ``tag`` names the entry kind
            and its attributes carry the entry fields.
        :param entry_cache: Passed through to the xml7 implementation for
            kinds other than 'tree-reference'.
        :param return_from_cache: Passed through to the xml7 implementation
            for kinds other than 'tree-reference'.
        :return: An ``inventory.TreeReference`` for 'tree-reference'
            elements, otherwise whatever
            ``xml7.Serializer_v7._unpack_entry`` returns.
        :raises AssertionError: If the element's kind is not in
            ``supported_kinds``.
        """
        kind = elt.tag
        if kind not in self.supported_kinds:
            raise AssertionError('unsupported entry kind %s' % kind)
        if kind == 'tree-reference':
            # file_id, name and parent_id are required attributes (a missing
            # one raises from the attrib lookup); the two revision fields
            # are read with get() and so are optional.
            file_id = elt.attrib['file_id']
            name = elt.attrib['name']
            parent_id = elt.attrib['parent_id']
            revision = elt.get('revision')
            reference_revision = elt.get('reference_revision')
            return inventory.TreeReference(file_id, name, parent_id, revision,
                                           reference_revision)
        else:
            return xml7.Serializer_v7._unpack_entry(self, elt,
                entry_cache=entry_cache, return_from_cache=return_from_cache)

    def __init__(self, node_size, search_key_name):
        """Record the CHK parameters for this serializer.

        :param node_size: Stored as ``self.maximum_size``.
        :param search_key_name: Stored as ``self.search_key_name``.
        """
        self.maximum_size = node_size
        self.search_key_name = search_key_name
161
162
4543.2.7 by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5
163
class CHKSerializer(xml6.Serializer_v6):
    """A CHKInventory based serializer with 'plain' behaviour."""

    format_num = '9'
    revision_format_num = None
    support_altered_by_hack = False

    def __init__(self, node_size, search_key_name):
        """Record the CHK parameters for this serializer.

        :param node_size: Stored as ``self.maximum_size``.
        :param search_key_name: Stored as ``self.search_key_name``.
        """
        self.maximum_size = node_size
        self.search_key_name = search_key_name
173
174
175
# Module-level CHKSerializer instance constructed with
# maximum_size=65536 and search_key_name='hash-255-way'.
chk_serializer_255_bigpage = CHKSerializer(65536, 'hash-255-way')
4290.1.7 by Jelmer Vernooij
Add development7-rich-root format that uses the RIO Serializer.
176
177
4290.1.12 by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer.
178
class CHKBEncodeSerializer(BEncodeRevisionSerializer1, CHKSerializer):
    """A CHKInventory and BEncode based serializer with 'plain' behaviour."""

    # Overrides the '9' declared on CHKSerializer.
    format_num = '10'
182
183
4290.1.12 by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer.
184
# Module-level CHKBEncodeSerializer instance constructed with
# maximum_size=65536 and search_key_name='hash-255-way'.
chk_bencode_serializer = CHKBEncodeSerializer(65536, 'hash-255-way')