# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

"""ChunkWriter: write compressed data out with a fixed upper bound."""

import zlib
from zlib import Z_FINISH, Z_SYNC_FLUSH


class ChunkWriter(object):
    """ChunkWriter allows writing of compressed data with a fixed size.

    If less data is supplied than fills a chunk, the chunk is padded with
    NULL bytes. If more data is supplied, then the writer packs as much
    in as it can, but never splits any item it was given.

    The algorithm for packing is open to improvement! Currently it is:
     - write the bytes given
     - if the total seen bytes so far exceeds the chunk size, flush.

    :cvar _max_repack: To fit the maximum number of entries into a node, we
        will sometimes start over and compress the whole list to get tighter
        packing. We get diminishing returns after a while, so this limits the
        number of times we will try.
        In testing, some values for bzr.dev::

                    w/o copy    w/ copy     w/ copy ins w/ copy & save
            repack  time  MB    time  MB    time  MB    time  MB
             1       8.8  5.1    8.9  5.1    9.6  4.4   12.5  4.1
             2       9.6  4.4   10.1  4.3   10.4  4.2   11.1  4.1
             3      10.6  4.2   11.1  4.1   11.2  4.1   11.3  4.1
             4      12.0  4.1
             5      12.6  4.1
            20      12.9  4.1   12.2  4.1   12.3  4.1

        In testing, some values for mysql-unpacked::

                    w/o copy    w/ copy     w/ copy ins w/ copy & save
            repack  time  MB    time  MB    time  MB    time  MB
             1      56.6  16.9              60.7  14.2
             2      59.3  14.1              62.6  13.5  64.3  13.4
             3      64.4  13.5
            20      73.4  13.4

    :cvar _default_min_compression_size: The expected minimum compression
        ratio. While packing nodes into the page, we won't Z_SYNC_FLUSH
        until we have received this much input data. This saves time,
        because we don't bloat the result with SYNC entries (and then need
        to repack), but if it is set too high we will accept data that will
        never fit and trigger a fault later.
    """

    _max_repack = 2
    _default_min_compression_size = 1.8

    def __init__(self, chunk_size, reserved=0):
        """Create a ChunkWriter to write chunk_size chunks.

        :param chunk_size: The total byte count to emit at the end of the
            chunk.
        :param reserved: How many bytes to allow for reserved data. Reserved
            data space can only be written to by calling write() with
            reserved=True.
        """
        self.chunk_size = chunk_size
        self.compressor = zlib.compressobj()
        self.bytes_in = []
        self.bytes_list = []
        self.bytes_out_len = 0
        self.compressed = None
        self.seen_bytes = 0
        self.num_repack = 0
        self.unused_bytes = None
        self.reserved_size = reserved
        self.min_compress_size = self._default_min_compression_size

    def finish(self):
        """Finish the chunk.

        This returns the list of compressed strings for the chunk, the
        bytes that did not fit (or None if everything fit), and the number
        of NULL padding bytes appended.
        """
        self.bytes_in = None  # Free the data cached so far, we don't need it
        out = self.compressor.flush(Z_FINISH)
        self.bytes_list.append(out)
        self.bytes_out_len += len(out)
        if self.bytes_out_len > self.chunk_size:
            raise AssertionError('Somehow we ended up with too much'
                                 ' compressed data, %d > %d'
                                 % (self.bytes_out_len, self.chunk_size))
        # The assertion above guarantees bytes_out_len <= chunk_size, so pad
        # straight up to the chunk boundary. (Taking bytes_out_len modulo
        # chunk_size here would wrongly pad a full extra chunk when the
        # compressed data fits exactly.)
        nulls_needed = self.chunk_size - self.bytes_out_len
        if nulls_needed:
            self.bytes_list.append("\x00" * nulls_needed)
        return self.bytes_list, self.unused_bytes, nulls_needed

    def _recompress_all_bytes_in(self, extra_bytes=None):
        """Recompress the current bytes_in, and optionally more.

        :param extra_bytes: Optional, if supplied we will try to add it with
            Z_SYNC_FLUSH
        :return: (bytes_out, bytes_out_len, compressor)
            bytes_out      is the list of compressed strings returned from
                           the compressor
            bytes_out_len  the total length of those strings
            compressor     an object with everything packed in so far; if
                           extra_bytes was supplied, Z_SYNC_FLUSH has been
                           called on it
        """
        compressor = zlib.compressobj()
        bytes_out = []
        append = bytes_out.append
        compress = compressor.compress
        for accepted_bytes in self.bytes_in:
            out = compress(accepted_bytes)
            if out:
                append(out)
        if extra_bytes:
            out = compress(extra_bytes)
            out += compressor.flush(Z_SYNC_FLUSH)
            if out:
                append(out)
        bytes_out_len = sum(map(len, bytes_out))
        return bytes_out, bytes_out_len, compressor

    def write(self, bytes, reserved=False):
        """Write some bytes to the chunk.

        If the bytes fit, False is returned. Otherwise True is returned
        and the bytes have not been added to the chunk.
        """
        if reserved:
            capacity = self.chunk_size
        else:
            capacity = self.chunk_size - self.reserved_size
        # Check quickly to see if this is likely to put us outside of our
        # budget:
        next_seen_size = self.seen_bytes + len(bytes)
        comp = self.compressor
        if next_seen_size < self.min_compress_size * capacity:
            # No need, we assume this will "just fit"
            out = comp.compress(bytes)
            if out:
                self.bytes_list.append(out)
                self.bytes_out_len += len(out)
            self.bytes_in.append(bytes)
            self.seen_bytes = next_seen_size
        else:
            if self.num_repack >= self._max_repack and not reserved:
                # We already know we don't want to try to fit more
                return True
            # This may or may not fit, try to add it with Z_SYNC_FLUSH
            out = comp.compress(bytes)
            out += comp.flush(Z_SYNC_FLUSH)
            if out:
                self.bytes_list.append(out)
                self.bytes_out_len += len(out)
            if self.bytes_out_len + 10 > capacity:
                # We are over budget, try to squeeze this in without any
                # Z_SYNC_FLUSH calls
                self.num_repack += 1
                (bytes_out, this_len,
                 compressor) = self._recompress_all_bytes_in(bytes)
                if this_len + 10 > capacity:
                    # No way we can add any more, and we need to re-pack
                    # because our compressor is now out of sync.
                    # This seems to be rarely triggered over
                    #   num_repack > _max_repack
                    (bytes_out, this_len,
                     compressor) = self._recompress_all_bytes_in()
                    self.compressor = compressor
                    self.bytes_list = bytes_out
                    self.bytes_out_len = this_len
                    self.unused_bytes = bytes
                    return True
                else:
                    # This fits when we pack it tighter, so use the new packing
                    # There is one Z_SYNC_FLUSH call in
                    # _recompress_all_bytes_in
                    self.compressor = compressor
                    self.bytes_in.append(bytes)
                    self.bytes_list = bytes_out
                    self.bytes_out_len = this_len
            else:
                # It fit, so mark it added
                self.bytes_in.append(bytes)
                self.seen_bytes = next_seen_size
        return False