4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
16 |
||
17 |
"""Serializer object for CHK based inventory storage."""
|
|
18 |
||
19 |
from bzrlib import ( |
|
4398.5.2
by John Arbash Meinel
Merge the chk serializer, and update it for the new bencode locations. |
20 |
bencode, |
4290.1.1
by Jelmer Vernooij
Add simple revision serializer based on RIO. |
21 |
cache_utf8, |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
22 |
inventory, |
4290.1.1
by Jelmer Vernooij
Add simple revision serializer based on RIO. |
23 |
revision as _mod_revision, |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
24 |
xml6, |
4543.2.7
by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5 |
25 |
xml7, |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
26 |
)
|
27 |
||
4398.5.9
by John Arbash Meinel
it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8') |
28 |
|
29 |
def _validate_properties(props, _decode=cache_utf8._utf8_decode):
    """Decode every value of ``props`` and store it back under the same key.

    :param props: A dict. It is modified in place: each value is replaced
        by the first element of whatever ``_decode(value)`` returns.
    :param _decode: The decoding callable, bound as a default argument so
        that it is looked up only once.
    :return: The same ``props`` object that was passed in.
    """
    # TODO: we really want an 'isascii' check for key
    # Only the values of already-present keys are replaced, so the dict never
    # changes size while it is being walked.
    for name in props:
        props[name] = _decode(props[name])[0]
    return props
|
4398.5.5
by John Arbash Meinel
Update the CHK Serializer to do lots more validation. |
35 |
|
36 |
||
37 |
def _is_format_10(value): |
|
38 |
if value != 10: |
|
39 |
raise ValueError('Format number was not recognized, expected 10 got %d' |
|
40 |
% (value,)) |
|
41 |
return 10 |
|
42 |
||
43 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
44 |
class BEncodeRevisionSerializer1(object):
    """Revision serializer built on top of bencode.

    A revision is written out as a bencoded list of (key, value) pairs and is
    read back by checking every pair against ``_schema``.
    """

    squashes_xml_invalid_characters = False

    # Maps {key: (Revision attribute, bencode_type, validator)}
    # For each key that may appear in the serialized text this records which
    # keyword is handed to Revision, which exact type bdecode is expected to
    # have produced, and an optional callable used to validate/transform the
    # decoded value.
    # TODO: add a 'validate_utf8' for things like revision_id and file_id
    #       and a validator for parent-ids
    _schema = {
        'format': (None, int, _is_format_10),
        'committer': ('committer', str, cache_utf8.decode),
        'timezone': ('timezone', int, None),
        'timestamp': ('timestamp', str, float),
        'revision-id': ('revision_id', str, None),
        'parent-ids': ('parent_ids', list, None),
        'inventory-sha1': ('inventory_sha1', str, None),
        'message': ('message', str, cache_utf8.decode),
        'properties': ('properties', dict, _validate_properties),
    }

    def write_revision_to_string(self, rev):
        """Return the bencoded text for ``rev``.

        :param rev: The revision object to serialize. Its committer, message
            and property values are passed through the utf8 encoder; the
            timezone is written only when it is not None.
        :return: The result of ``bencode.bencode`` on the list of pairs.
        """
        to_utf8 = cache_utf8._utf8_encode
        # A list of pairs is used instead of a dict so that the ordering is
        # under our control, which allows smaller deltas to be created.
        pairs = [
            ("format", 10),
            ("committer", to_utf8(rev.committer)[0]),
        ]
        if rev.timezone is not None:
            pairs.append(("timezone", rev.timezone))
        # For bzr revisions, the most common property is just 'branch-nick'
        # which changes infrequently.
        encoded_props = dict((name, to_utf8(text)[0])
                             for name, text in rev.properties.iteritems())
        pairs.append(('properties', encoded_props))
        pairs.append(("timestamp", "%.3f" % rev.timestamp))
        pairs.append(("revision-id", rev.revision_id))
        pairs.append(("parent-ids", rev.parent_ids))
        pairs.append(("inventory-sha1", rev.inventory_sha1))
        pairs.append(("message", to_utf8(rev.message)[0]))
        return bencode.bencode(pairs)

    def write_revision(self, rev, f):
        """Serialize ``rev`` and write the resulting text to ``f``.

        :param rev: The revision object to serialize.
        :param f: An object with a ``write`` method.
        """
        serialized = self.write_revision_to_string(rev)
        f.write(serialized)

    def read_revision_from_string(self, text):
        """Build a Revision from bencoded ``text``.

        :param text: The serialized revision.
        :return: A ``_mod_revision.Revision`` constructed from the decoded
            and validated fields.
        :raises ValueError: If the decoded text is not a list, if a value is
            not of exactly the type the schema expects, or if expected keys
            are missing.
        :raises KeyError: If the text contains a key that is not in the
            schema.
        """
        # TODO: consider writing a Revision decoder, rather than using the
        #       generic bencode decoder
        #       However, to decode all 25k revisions of bzr takes approx 1.3s
        #       If we remove all extra validation that goes down to about
        #       1.2s. Of that time, probably 0.6s is spent in
        #       bencode.bdecode(). Regardless 'time bzr log' of everything is
        #       7+s, so 1.3s to extract revision texts isn't a majority of
        #       time.
        decoded = bencode.bdecode(text)
        if not isinstance(decoded, list):
            raise ValueError("invalid revision text")
        schema = self._schema
        # timezone is allowed to be missing, but should be set
        attrs = {'timezone': None}
        for key, value in decoded:
            # Raises KeyError when the key is not part of the schema. A key
            # that appears more than once is not rejected here; the later
            # value simply replaces the earlier one.
            attr_name, wanted_type, validator = schema[key]
            if value.__class__ is not wanted_type:
                raise ValueError('key %s did not conform to the expected type'
                                 ' %s, but was %s'
                                 % (key, wanted_type, type(value)))
            if validator is not None:
                value = validator(value)
            attrs[attr_name] = value
        if len(attrs) != len(schema):
            missing = [key for key, spec in schema.iteritems()
                       if spec[0] not in attrs]
            raise ValueError('Revision text was missing expected keys %s.'
                             ' text %r' % (missing, text))
        # 'format' is stored under None because it has no Revision attribute.
        del attrs[None]
        return _mod_revision.Revision(**attrs)

    def read_revision(self, f):
        """Read all of ``f`` and build a Revision from its content.

        :param f: An object with a ``read`` method.
        :return: Whatever ``read_revision_from_string`` returns for the text.
        """
        text = f.read()
        return self.read_revision_from_string(text)
132 |
||
133 |
||
4543.2.7
by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5 |
134 |
class CHKSerializerSubtree(BEncodeRevisionSerializer1, xml7.Serializer_v7):
    """CHKInventory based serializer with support for tree references."""

    # Entry kinds that _unpack_entry accepts.
    supported_kinds = set(['file', 'directory', 'symlink', 'tree-reference'])
    format_num = '9'
    revision_format_num = None
    support_altered_by_hack = False

    def __init__(self, node_size, search_key_name):
        """Remember the node size and the search key name.

        :param node_size: Stored as ``self.maximum_size``.
        :param search_key_name: Stored as ``self.search_key_name``.
        """
        self.search_key_name = search_key_name
        self.maximum_size = node_size

    def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
        """Turn the element ``elt`` into an inventory entry.

        'tree-reference' elements are built here directly; every other
        supported kind is handed to ``xml7.Serializer_v7._unpack_entry``.

        :param elt: The element to unpack; its ``tag`` is the entry kind.
        :param entry_cache: Passed through to the xml7 implementation.
        :param return_from_cache: Passed through to the xml7 implementation.
        :raises AssertionError: If the kind is not in ``supported_kinds``.
        """
        kind = elt.tag
        if kind not in self.supported_kinds:
            raise AssertionError('unsupported entry kind %s' % kind)
        if kind != 'tree-reference':
            return xml7.Serializer_v7._unpack_entry(
                self, elt, entry_cache=entry_cache,
                return_from_cache=return_from_cache)
        # file_id, name and parent_id are looked up by index and so must be
        # present; the two revision attributes are fetched with get().
        file_id = elt.attrib['file_id']
        name = elt.attrib['name']
        parent_id = elt.attrib['parent_id']
        revision = elt.get('revision')
        reference_revision = elt.get('reference_revision')
        return inventory.TreeReference(file_id, name, parent_id, revision,
                                       reference_revision)
|
161 |
||
162 |
||
4543.2.7
by John Arbash Meinel
It turns out CHKSerializer was inheriting from xml5 |
163 |
class CHKSerializer(xml6.Serializer_v6):
    """CHKInventory based serializer with 'plain' behaviour."""

    format_num = '9'
    revision_format_num = None
    support_altered_by_hack = False

    def __init__(self, node_size, search_key_name):
        """Remember the node size and the search key name.

        :param node_size: Stored as ``self.maximum_size``.
        :param search_key_name: Stored as ``self.search_key_name``.
        """
        self.search_key_name = search_key_name
        self.maximum_size = node_size
|
173 |
||
174 |
||
175 |
# Shared CHKSerializer instance: node size 65536, 'hash-255-way' search key.
chk_serializer_255_bigpage = CHKSerializer(
    node_size=65536, search_key_name='hash-255-way')
|
4290.1.7
by Jelmer Vernooij
Add development7-rich-root format that uses the RIO Serializer. |
176 |
|
177 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
178 |
class CHKBEncodeSerializer(BEncodeRevisionSerializer1, CHKSerializer):
    """CHKInventory and BEncode based serializer with 'plain' behaviour.

    Combines the bencode revision methods of BEncodeRevisionSerializer1 with
    CHKSerializer.
    """

    # Overrides the format_num inherited from CHKSerializer.
    format_num = "10"
|
182 |
||
183 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
184 |
# Shared CHKBEncodeSerializer instance: node size 65536, 'hash-255-way'
# search key.
chk_bencode_serializer = CHKBEncodeSerializer(
    node_size=65536, search_key_name='hash-255-way')