~bzr-pqm/bzr/bzr.dev : contents of bzrlib/chk_serializer.py at revision 5462

~bzr-pqm/bzr/bzr.dev : (revision 5462)

# Copyright (C) 2008, 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Serializer object for CHK based inventory storage."""

from bzrlib import (
    bencode,
    cache_utf8,
    inventory,
    revision as _mod_revision,
    xml6,
    xml7,
    )


def _validate_properties(props, _decode=cache_utf8._utf8_decode):
    """Convert every value of a revision-properties dict to Unicode.

    The dict is updated in place: each value is replaced by the first item
    of the tuple returned by ``_decode(value)``.

    :param props: Dict mapping property name to a utf8 encoded value.
    :param _decode: Decoder to apply to each value; it must return a
        sequence whose first item is the decoded text.
    :return: The same dict object that was passed in.
    """
    # TODO: we really want an 'isascii' check for key
    # Only values of keys that already exist are replaced, so the dict
    # never changes size while it is being walked.
    for name in props:
        props[name] = _decode(props[name])[0]
    return props


def _is_format_10(value):
    if value != 10:
        raise ValueError('Format number was not recognized, expected 10 got %d'
                         % (value,))
    return 10


class BEncodeRevisionSerializer1(object):
    """Revision serializer that reads and writes bencoded revision texts.

    A revision text is a bencoded list of (key, value) pairs.  The keys that
    are understood, and how each decoded value is checked, are described by
    ``_schema``.
    """

    squashes_xml_invalid_characters = False

    # Layout: {bencode key: (Revision attribute, bencode_type, validator)}
    #   Revision attribute -- keyword handed to Revision() for this key, or
    #       None when the value is checked but not passed on ('format').
    #   bencode_type -- the exact class bdecode is expected to produce.
    #   validator -- callable used to validate/transform the decoded value,
    #       or None to take the value unchanged.
    # TODO: add a 'validate_utf8' for things like revision_id and file_id
    #       and a validator for parent-ids
    _schema = {
        'format': (None, int, _is_format_10),
        'committer': ('committer', str, cache_utf8.decode),
        'timezone': ('timezone', int, None),
        'timestamp': ('timestamp', str, float),
        'revision-id': ('revision_id', str, None),
        'parent-ids': ('parent_ids', list, None),
        'inventory-sha1': ('inventory_sha1', str, None),
        'message': ('message', str, cache_utf8.decode),
        'properties': ('properties', dict, _validate_properties),
    }

    def write_revision_to_string(self, rev):
        """Return the bencoded text for a revision.

        :param rev: Object providing committer, timezone, properties,
            timestamp, revision_id, parent_ids, inventory_sha1 and message
            attributes.
        :return: The result of bencoding the list of (key, value) pairs.
        """
        to_utf8 = cache_utf8._utf8_encode
        # The fields are emitted as an ordered list of pairs instead of a
        # dict: controlling the ordering makes smaller deltas possible.
        items = [
            ("format", 10),
            ("committer", to_utf8(rev.committer)[0]),
        ]
        # A timezone of None is simply left out of the text.
        if rev.timezone is not None:
            items.append(("timezone", rev.timezone))
        # For bzr revisions, the most common property is just 'branch-nick'
        # which changes infrequently.
        encoded_props = dict(
            (name, to_utf8(text)[0])
            for name, text in rev.properties.iteritems())
        items.append(('properties', encoded_props))
        # The timestamp is stored as a string with three decimal places.
        items.append(("timestamp", "%.3f" % rev.timestamp))
        items.append(("revision-id", rev.revision_id))
        items.append(("parent-ids", rev.parent_ids))
        items.append(("inventory-sha1", rev.inventory_sha1))
        items.append(("message", to_utf8(rev.message)[0]))
        return bencode.bencode(items)

    def write_revision(self, rev, f):
        """Serialize rev and write the text to f using ``f.write``."""
        text = self.write_revision_to_string(rev)
        f.write(text)

    def read_revision_from_string(self, text):
        """Build a Revision from a bencoded revision text.

        :param text: Text as produced by write_revision_to_string.
        :return: A Revision constructed from the decoded fields.
        :raises ValueError: If the decoded text is not a list, a value is
            not exactly of the type the schema expects, or an expected key
            is missing.
        :raises KeyError: If the text contains a key that is not part of
            the schema.
        """
        # TODO: consider writing a Revision decoder, rather than using the
        #       generic bencode decoder
        #       However, to decode all 25k revisions of bzr takes approx 1.3s
        #       If we remove all extra validation that goes down to about 1.2s.
        #       Of that time, probably 0.6s is spent in bencode.bdecode().
        #       Regardless 'time bzr log' of everything is 7+s, so 1.3s to
        #       extract revision texts isn't a majority of time.
        decoded = bencode.bdecode(text)
        if not isinstance(decoded, list):
            raise ValueError("invalid revision text")
        schema = self._schema
        # timezone may be absent from the text, but must still be passed to
        # Revision, so it starts out as None.
        attrs = {'timezone': None}
        for key, value in decoded:
            # An unknown key raises KeyError from this lookup.
            # NOTE(review): a key that appears twice is not rejected; the
            # later value simply replaces the earlier one.
            attr_name, wanted_type, check = schema[key]
            # Exact class match on purpose: subclasses are not accepted.
            if value.__class__ is not wanted_type:
                raise ValueError('key %s did not conform to the expected type'
                                 ' %s, but was %s'
                                 % (key, wanted_type, type(value)))
            if check is not None:
                value = check(value)
            attrs[attr_name] = value
        # Every schema key maps to a distinct attribute name (including the
        # None used by 'format'), so equal sizes mean nothing is missing.
        if len(attrs) != len(schema):
            missing = []
            for key, entry in schema.iteritems():
                if entry[0] not in attrs:
                    missing.append(key)
            raise ValueError('Revision text was missing expected keys %s.'
                             ' text %r' % (missing, text))
        # 'format' is stored under None because it is not a Revision
        # attribute; drop it before building the Revision.
        attrs.pop(None)
        return _mod_revision.Revision(**attrs)

    def read_revision(self, f):
        """Read all of f with ``f.read`` and decode it into a Revision."""
        text = f.read()
        return self.read_revision_from_string(text)


class CHKSerializerSubtree(BEncodeRevisionSerializer1, xml7.Serializer_v7):
    """CHKInventory based serializer which also handles tree references.

    Revisions are handled by BEncodeRevisionSerializer1; inventory entries
    other than 'tree-reference' are unpacked by xml7.Serializer_v7.
    """

    supported_kinds = set(['file', 'directory', 'symlink', 'tree-reference'])
    format_num = '9'
    revision_format_num = None
    support_altered_by_hack = False

    def __init__(self, node_size, search_key_name):
        """Record the CHK parameters for this serializer.

        :param node_size: Stored as self.maximum_size.
        :param search_key_name: Stored as self.search_key_name.
        """
        self.maximum_size = node_size
        self.search_key_name = search_key_name

    def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
        """Turn an inventory element into an inventory entry.

        :param elt: Element whose tag names the entry kind.
        :param entry_cache: Passed through to xml7.Serializer_v7 for kinds
            other than 'tree-reference'.
        :param return_from_cache: Passed through in the same way.
        :return: An inventory.TreeReference for 'tree-reference' elements,
            otherwise whatever xml7.Serializer_v7._unpack_entry returns.
        :raises AssertionError: If the tag is not in supported_kinds.
        """
        kind = elt.tag
        if kind not in self.supported_kinds:
            raise AssertionError('unsupported entry kind %s' % kind)
        if kind != 'tree-reference':
            # Everything but tree references is handled by the base class.
            return xml7.Serializer_v7._unpack_entry(self, elt,
                entry_cache=entry_cache, return_from_cache=return_from_cache)
        # file_id, name and parent_id must be present (KeyError otherwise);
        # the two revision attributes are read with get() and may be absent.
        attrs = elt.attrib
        file_id, name, parent_id = (
            attrs['file_id'], attrs['name'], attrs['parent_id'])
        return inventory.TreeReference(
            file_id, name, parent_id,
            elt.get('revision'), elt.get('reference_revision'))


class CHKSerializer(xml6.Serializer_v6):
    """'Plain' serializer for CHKInventory based storage.

    Built on xml6.Serializer_v6; each instance remembers the node size and
    search key name it was created with.
    """

    support_altered_by_hack = False
    revision_format_num = None
    format_num = '9'

    def __init__(self, node_size, search_key_name):
        """Record the CHK parameters for this serializer.

        :param node_size: Stored as self.maximum_size.
        :param search_key_name: Stored as self.search_key_name.
        """
        self.search_key_name = search_key_name
        self.maximum_size = node_size


# Module-level instance of the plain CHK serializer, created with a
# node_size of 65536 and the 'hash-255-way' search key name.
chk_serializer_255_bigpage = CHKSerializer(65536, 'hash-255-way')


class CHKBEncodeSerializer(BEncodeRevisionSerializer1, CHKSerializer):
    """A CHKInventory and BEncode based serializer with 'plain' behaviour.

    Combines BEncodeRevisionSerializer1 (listed first in the bases) with
    CHKSerializer; the only thing defined here is format_num.
    """

    # Replaces the '9' declared on CHKSerializer.
    format_num = '10'


# Module-level instance of the bencode-based CHK serializer, created with a
# node_size of 65536 and the 'hash-255-way' search key name.
chk_bencode_serializer = CHKBEncodeSerializer(65536, 'hash-255-way')

4763.2.4 by John Arbash Meinel merge bzr.2.1 in preparation for NEWS entry.	1	# Copyright (C) 2008, 2009, 2010 Canonical Ltd
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	16
	17	"""Serializer object for CHK based inventory storage."""
	18
	19	from bzrlib import (
4398.5.2 by John Arbash Meinel Merge the chk serializer, and update it for the new bencode locations.	20	bencode,
4290.1.1 by Jelmer Vernooij Add simple revision serializer based on RIO.	21	cache_utf8,
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	22	inventory,
4290.1.1 by Jelmer Vernooij Add simple revision serializer based on RIO.	23	revision as _mod_revision,
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	24	xml6,
4543.2.7 by John Arbash Meinel It turns out CHKSerializer was inheriting from xml5	25	xml7,
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	26	)
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	27
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	28
	29	def _validate_properties(props, _decode=cache_utf8._utf8_decode):
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	30	# TODO: we really want an 'isascii' check for key
4398.5.15 by John Arbash Meinel Change how schemas are validated (down to 1.02s)	31	# Cast the utf8 properties into Unicode 'in place'
	32	for key, value in props.iteritems():
	33	props[key] = _decode(value)[0]
	34	return props
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	35
	36
	37	def _is_format_10(value):
	38	if value != 10:
	39	raise ValueError('Format number was not recognized, expected 10 got %d'
	40	% (value,))
	41	return 10
	42
	43
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	44	class BEncodeRevisionSerializer1(object):
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	45	"""Simple revision serializer based around bencode.
4290.1.1 by Jelmer Vernooij Add simple revision serializer based on RIO.	46	"""
	47
4416.5.1 by Jelmer Vernooij Move squashing of XML-invalid characters to XMLSerializer.	48	squashes_xml_invalid_characters = False
	49
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	50	# Maps {key:(Revision attribute, bencode_type, validator)}
	51	# This tells us what kind we expect bdecode to create, what variable on
	52	# Revision we should be using, and a function to call to validate/transform
	53	# the type.
	54	# TODO: add a 'validate_utf8' for things like revision_id and file_id
	55	# and a validator for parent-ids
	56	_schema = {'format': (None, int, _is_format_10),
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	57	'committer': ('committer', str, cache_utf8.decode),
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	58	'timezone': ('timezone', int, None),
	59	'timestamp': ('timestamp', str, float),
	60	'revision-id': ('revision_id', str, None),
4398.5.19 by John Arbash Meinel Change parent_ids back to a list, because there are other tests that expect it.	61	'parent-ids': ('parent_ids', list, None),
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	62	'inventory-sha1': ('inventory_sha1', str, None),
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	63	'message': ('message', str, cache_utf8.decode),
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	64	'properties': ('properties', dict, _validate_properties),
	65	}
	66
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	67	def write_revision_to_string(self, rev):
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	68	encode_utf8 = cache_utf8._utf8_encode
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	69	# Use a list of tuples rather than a dict
	70	# This lets us control the ordering, so that we are able to create
	71	# smaller deltas
	72	ret = [
	73	("format", 10),
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	74	("committer", encode_utf8(rev.committer)[0]),
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	75	]
	76	if rev.timezone is not None:
	77	ret.append(("timezone", rev.timezone))
	78	# For bzr revisions, the most common property is just 'branch-nick'
	79	# which changes infrequently.
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	80	revprops = {}
	81	for key, value in rev.properties.iteritems():
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	82	revprops[key] = encode_utf8(value)[0]
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	83	ret.append(('properties', revprops))
	84	ret.extend([
	85	("timestamp", "%.3f" % rev.timestamp),
	86	("revision-id", rev.revision_id),
	87	("parent-ids", rev.parent_ids),
	88	("inventory-sha1", rev.inventory_sha1),
4398.5.9 by John Arbash Meinel it seems that codecs.utf_8_decode is quite a bit faster than codecs.get_decoder('utf-8')	89	("message", encode_utf8(rev.message)[0]),
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	90	])
4398.5.2 by John Arbash Meinel Merge the chk serializer, and update it for the new bencode locations.	91	return bencode.bencode(ret)
4290.1.8 by Jelmer Vernooij Some performance tweaks.	92
4290.1.8 by Jelmer Vernooij Some performance tweaks.	93	def write_revision(self, rev, f):
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	94	f.write(self.write_revision_to_string(rev))
	95
	96	def read_revision_from_string(self, text):
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	97	# TODO: consider writing a Revision decoder, rather than using the
	98	# generic bencode decoder
4398.5.8 by John Arbash Meinel Update the TODO comment a bit.	99	# However, to decode all 25k revisions of bzr takes approx 1.3s
	100	# If we remove all extra validation that goes down to about 1.2s.
	101	# Of that time, probably 0.6s is spend in bencode.bdecode().
	102	# Regardless 'time bzr log' of everything is 7+s, so 1.3s to
	103	# extract revision texts isn't a majority of time.
4398.5.2 by John Arbash Meinel Merge the chk serializer, and update it for the new bencode locations.	104	ret = bencode.bdecode(text)
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	105	if not isinstance(ret, list):
	106	raise ValueError("invalid revision text")
4398.5.15 by John Arbash Meinel Change how schemas are validated (down to 1.02s)	107	schema = self._schema
4398.5.7 by John Arbash Meinel Spend a little bit more time optimizing the read_revision_from_string loop	108	# timezone is allowed to be missing, but should be set
	109	bits = {'timezone': None}
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	110	for key, value in ret:
4398.5.7 by John Arbash Meinel Spend a little bit more time optimizing the read_revision_from_string loop	111	# Will raise KeyError if not a valid part of the schema, or an
	112	# entry is given 2 times.
4398.5.15 by John Arbash Meinel Change how schemas are validated (down to 1.02s)	113	var_name, expected_type, validator = schema[key]
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	114	if value.__class__ is not expected_type:
	115	raise ValueError('key %s did not conform to the expected type'
	116	' %s, but was %s'
	117	% (key, expected_type, type(value)))
	118	if validator is not None:
	119	value = validator(value)
4398.5.7 by John Arbash Meinel Spend a little bit more time optimizing the read_revision_from_string loop	120	bits[var_name] = value
4398.5.15 by John Arbash Meinel Change how schemas are validated (down to 1.02s)	121	if len(bits) != len(schema):
	122	missing = [key for key, (var_name, _, _) in schema.iteritems()
	123	if var_name not in bits]
	124	raise ValueError('Revision text was missing expected keys %s.'
	125	' text %r' % (missing, text))
	126	del bits[None] # Get rid of 'format' since it doesn't get mapped
4398.5.5 by John Arbash Meinel Update the CHK Serializer to do lots more validation.	127	rev = _mod_revision.Revision(**bits)
4290.1.8 by Jelmer Vernooij Some performance tweaks.	128	return rev
	129
	130	def read_revision(self, f):
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	131	return self.read_revision_from_string(f.read())
	132
	133
4543.2.7 by John Arbash Meinel It turns out CHKSerializer was inheriting from xml5	134	class CHKSerializerSubtree(BEncodeRevisionSerializer1, xml7.Serializer_v7):
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	135	"""A CHKInventory based serializer that supports tree references"""
	136
	137	supported_kinds = set(['file', 'directory', 'symlink', 'tree-reference'])
	138	format_num = '9'
	139	revision_format_num = None
	140	support_altered_by_hack = False
	141
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	142	def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	143	kind = elt.tag
	144	if not kind in self.supported_kinds:
	145	raise AssertionError('unsupported entry kind %s' % kind)
	146	if kind == 'tree-reference':
	147	file_id = elt.attrib['file_id']
	148	name = elt.attrib['name']
	149	parent_id = elt.attrib['parent_id']
	150	revision = elt.get('revision')
	151	reference_revision = elt.get('reference_revision')
	152	return inventory.TreeReference(file_id, name, parent_id, revision,
	153	reference_revision)
	154	else:
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	155	return xml7.Serializer_v7._unpack_entry(self, elt,
	156	entry_cache=entry_cache, return_from_cache=return_from_cache)
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	157
	158	def __init__(self, node_size, search_key_name):
	159	self.maximum_size = node_size
	160	self.search_key_name = search_key_name
	161
	162
4543.2.7 by John Arbash Meinel It turns out CHKSerializer was inheriting from xml5	163	class CHKSerializer(xml6.Serializer_v6):
4241.6.1 by Ian Clatworthy chk_map code from brisbane-core	164	"""A CHKInventory based serializer with 'plain' behaviour."""
	165
	166	format_num = '9'
	167	revision_format_num = None
	168	support_altered_by_hack = False
	169
	170	def __init__(self, node_size, search_key_name):
	171	self.maximum_size = node_size
	172	self.search_key_name = search_key_name
	173
	174
	175	chk_serializer_255_bigpage = CHKSerializer(65536, 'hash-255-way')
4290.1.7 by Jelmer Vernooij Add development7-rich-root format that uses the RIO Serializer.	176
	177
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	178	class CHKBEncodeSerializer(BEncodeRevisionSerializer1, CHKSerializer):
	179	"""A CHKInventory and BEncode based serializer with 'plain' behaviour."""
4290.1.7 by Jelmer Vernooij Add development7-rich-root format that uses the RIO Serializer.	180
	181	format_num = '10'
	182
	183
4290.1.12 by Jelmer Vernooij Use bencode rather than rio in the new revision serialiszer.	184	chk_bencode_serializer = CHKBEncodeSerializer(65536, 'hash-255-way')