4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
16 |
||
17 |
"""Serializer object for CHK based inventory storage."""
|
|
18 |
||
19 |
from bzrlib import ( |
|
4398.5.2
by John Arbash Meinel
Merge the chk serializer, and update it for the new bencode locations. |
20 |
bencode, |
4290.1.1
by Jelmer Vernooij
Add simple revision serializer based on RIO. |
21 |
cache_utf8, |
22 |
revision as _mod_revision, |
|
5671.2.2
by Jelmer Vernooij
Remove unused serializer class. |
23 |
xml8, |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
24 |
)
|
25 |
||
4398.5.9
by John Arbash Meinel
it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8') |
26 |
|
27 |
def _validate_properties(props, _decode=cache_utf8._utf8_decode):
    """Decode every value of a properties mapping, modifying it in place.

    :param props: Mapping of property name to encoded value. Each value is
        replaced by the first element of ``_decode(value)``.
    :param _decode: Decoder returning a sequence whose first item is the
        decoded text; bound as a default argument so the lookup happens once.
    :return: The same ``props`` object that was passed in.
    """
    # TODO: we really want an 'isascii' check for the property names
    # Only values of existing keys are reassigned, so the mapping does not
    # change size while it is being iterated.
    for name, raw in props.iteritems():
        decoded = _decode(raw)
        props[name] = decoded[0]
    return props
|
4398.5.5
by John Arbash Meinel
Update the CHK Serializer to do lots more validation. |
33 |
|
34 |
||
35 |
def _is_format_10(value): |
|
36 |
if value != 10: |
|
37 |
raise ValueError('Format number was not recognized, expected 10 got %d' |
|
38 |
% (value,)) |
|
39 |
return 10 |
|
40 |
||
41 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
42 |
class BEncodeRevisionSerializer1(object):
    """Revision serializer that stores revisions as bencoded text.

    A revision is written as a bencoded list of (key, value) pairs and is
    read back by checking every pair against ``_schema``.
    """

    squashes_xml_invalid_characters = False

    # _schema maps each key of the serialized text to a tuple of
    #   (Revision attribute name, type bdecode must produce, validator).
    # The attribute name is None for a key that is not handed to Revision.
    # The validator is None, or a callable whose return value replaces the
    # decoded value.
    # TODO: add a 'validate_utf8' for things like revision_id and file_id
    #       and a validator for parent-ids
    _schema = {
        'format': (None, int, _is_format_10),
        'committer': ('committer', str, cache_utf8.decode),
        'timezone': ('timezone', int, None),
        'timestamp': ('timestamp', str, float),
        'revision-id': ('revision_id', str, None),
        'parent-ids': ('parent_ids', list, None),
        'inventory-sha1': ('inventory_sha1', str, None),
        'message': ('message', str, cache_utf8.decode),
        'properties': ('properties', dict, _validate_properties),
        }

    def write_revision_to_string(self, rev):
        """Return ``rev`` serialized as a bencoded string.

        :param rev: The revision object to serialize.
        :return: The result of bencoding the list of (key, value) pairs.
        """
        to_utf8 = cache_utf8._utf8_encode
        # A list of pairs rather than a dict keeps the field order under our
        # control, so that we are able to create smaller deltas.
        pairs = [
            ("format", 10),
            ("committer", to_utf8(rev.committer)[0]),
            ]
        # timezone is only written when the revision has one.
        if rev.timezone is not None:
            pairs.append(("timezone", rev.timezone))
        # For bzr revisions, the most common property is just 'branch-nick'
        # which changes infrequently.
        encoded_props = dict(
            (name, to_utf8(text)[0])
            for name, text in rev.properties.iteritems())
        pairs.append(('properties', encoded_props))
        pairs.append(("timestamp", "%.3f" % rev.timestamp))
        pairs.append(("revision-id", rev.revision_id))
        pairs.append(("parent-ids", rev.parent_ids))
        pairs.append(("inventory-sha1", rev.inventory_sha1))
        pairs.append(("message", to_utf8(rev.message)[0]))
        return bencode.bencode(pairs)

    def write_revision(self, rev, f):
        """Serialize ``rev`` and write the resulting string to ``f``.

        :param rev: The revision object to serialize.
        :param f: Object with a ``write`` method that receives the text.
        """
        serialized = self.write_revision_to_string(rev)
        f.write(serialized)

    def read_revision_from_string(self, text):
        """Build a Revision from bencoded revision text.

        :param text: Serialized revision text.
        :return: A ``_mod_revision.Revision`` built from the decoded fields.
        :raises ValueError: If the decoded text is not a list, a value does
            not have exactly the type the schema expects, or an expected key
            is absent. Validators may raise as well.
        :raises KeyError: If the text contains a key that is not in the
            schema.
        """
        # TODO: consider writing a Revision decoder, rather than using the
        #       generic bencode decoder
        #       However, to decode all 25k revisions of bzr takes approx 1.3s
        #       If we remove all extra validation that goes down to about
        #       1.2s. Of that time, probably 0.6s is spent in
        #       bencode.bdecode(). Regardless 'time bzr log' of everything is
        #       7+s, so 1.3s to extract revision texts isn't a majority of
        #       time.
        decoded = bencode.bdecode(text)
        if not isinstance(decoded, list):
            raise ValueError("invalid revision text")
        schema = self._schema
        # timezone is allowed to be missing from the text, but must still be
        # set when the Revision is created.
        fields = {'timezone': None}
        for key, value in decoded:
            # A key that is not part of the schema raises KeyError here. A
            # key given twice is not detected: the later value overwrites
            # the earlier one.
            attr_name, wanted_type, check = schema[key]
            # Exact class comparison: subclasses of the wanted type are
            # rejected too.
            if value.__class__ is not wanted_type:
                raise ValueError('key %s did not conform to the expected type'
                                 ' %s, but was %s'
                                 % (key, wanted_type, type(value)))
            if check is not None:
                value = check(value)
            fields[attr_name] = value
        if len(fields) != len(schema):
            missing = [key for key, (attr_name, _, _) in schema.iteritems()
                       if attr_name not in fields]
            raise ValueError('Revision text was missing expected keys %s.'
                             ' text %r' % (missing, text))
        # 'format' is stored under None because it has no Revision attribute;
        # drop it before passing the rest on as keyword arguments.
        fields.pop(None)
        return _mod_revision.Revision(**fields)

    def read_revision(self, f):
        """Read all of ``f`` and parse it as serialized revision text.

        :param f: Object with a ``read`` method returning the text.
        :return: The Revision produced by read_revision_from_string.
        """
        text = f.read()
        return self.read_revision_from_string(text)
130 |
||
131 |
||
5671.2.2
by Jelmer Vernooij
Remove unused serializer class. |
132 |
class CHKSerializer(xml8.Serializer_v8):
    """A CHKInventory based serializer with 'plain' behaviour.

    Builds on xml8.Serializer_v8 and records the two settings supplied at
    construction time.
    """

    # Class-level settings; NOTE(review): these are presumably read by the
    # base class or by repository code outside this file -- confirm.
    support_altered_by_hack = False
    revision_format_num = None
    format_num = '9'

    def __init__(self, node_size, search_key_name):
        """Remember the settings for this serializer.

        :param node_size: Stored as ``self.maximum_size``.
        :param search_key_name: Stored as ``self.search_key_name``.
        """
        self.search_key_name = search_key_name
        self.maximum_size = node_size
|
142 |
||
143 |
||
144 |
# Module-level CHKSerializer instance: node size 65536, 'hash-255-way' search
# key.
chk_serializer_255_bigpage = CHKSerializer(
    node_size=65536, search_key_name='hash-255-way')
|
4290.1.7
by Jelmer Vernooij
Add development7-rich-root format that uses the RIO Serializer. |
145 |
|
146 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
147 |
class CHKBEncodeSerializer(BEncodeRevisionSerializer1, CHKSerializer):
    """A CHKInventory and BEncode based serializer with 'plain' behaviour.

    BEncodeRevisionSerializer1 is listed first, so its revision reading and
    writing methods are found before any inherited through CHKSerializer.
    """

    # Overrides the '9' declared on CHKSerializer.
    format_num = '10'
|
151 |
||
152 |
||
4290.1.12
by Jelmer Vernooij
Use bencode rather than rio in the new revision serialiszer. |
153 |
# Module-level CHKBEncodeSerializer instance, built with the same node size
# and search key name as chk_serializer_255_bigpage.
chk_bencode_serializer = CHKBEncodeSerializer(
    node_size=65536, search_key_name='hash-255-way')