        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_action(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            if last_byte_used < factory._end:
                last_byte_used = factory._end
        # If we are using more than half of the bytes from the block, we have
        # nothing else to check
        if total_bytes_used * 2 >= self._block._content_length:
            return None, last_byte_used, total_bytes_used
        # We are using less than 50% of the content. Is the content we are
        # using at the beginning of the block? If so, we can just trim the
        # tail, rather than rebuilding from scratch.
        if total_bytes_used * 2 > last_byte_used:
            return 'trim', last_byte_used, total_bytes_used
        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content. Note that this ends up
        # expanding many deltas into fulltexts, as well.
        # If we could build a cheap enough 'strip', we could try stripping
        # first; if that expands the content, we then rebuild.
        return 'rebuild', last_byte_used, total_bytes_used
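
    # Illustrative walk-through (hypothetical numbers, not from the source):
    # suppose self._block._content_length is 1000 and two factories cover
    # bytes [0, 300) and [300, 350). Then total_bytes_used = 350 and
    # last_byte_used = 350; since 350 * 2 = 700 < 1000 the block is mostly
    # unused, and since 700 > 350 the used bytes sit at the front, so the
    # answer is ('trim', 350, 350). A single factory at [800, 900) instead
    # gives total_bytes_used = 100 and last_byte_used = 900; 200 < 1000 and
    # 200 <= 900, so the answer is ('rebuild', 900, 100).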

    def check_is_well_utilized(self):
        """Is the current block considered 'well utilized'?

        This heuristic asks if the current block considers itself to be a fully
        developed group, rather than just a loose collection of data.
        """
        if len(self._factories) == 1:
            # A block of length 1 could be improved by combining with other
            # groups - don't look deeper. Even larger-than-max-size groups
            # could compress well with adjacent versions of the same thing.
            return False
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        block_size = self._block._content_length
        if total_bytes_used < block_size * self._max_cut_fraction:
            # This block wants to trim itself small enough that we want to
            # consider it under-utilized.
            return False
        # TODO: This code is meant to be the twin of _insert_record_stream's
        #       'start_new_block' logic. It would probably be better to factor
        #       out that logic into a shared location, so that it stays
        #       together.
        # We currently assume a block is properly utilized whenever it is >75%
        # of the size of a 'full' block. In normal operation, a block is
        # considered full when it hits 4MB of same-file content. So any block
        # >3MB is 'full enough'.
        # The only time this isn't true is when a given block has large-object
        # content (a single file >4MB, etc.).
        # Under these circumstances, we allow a block to grow to
        # 2 x largest_content. This means that if a given block had a large
        # object, it may actually be under-utilized. However, given that this
        # is 'pack-on-the-fly', it is probably reasonable not to repack large
        # content blobs on-the-fly. Note that because we return False for all
        # 1-item blobs, we will repack them; we may wish to reevaluate our
        # treatment of large-object blobs in the future.
        if block_size >= self._full_enough_block_size:
            return True
# If a block is <3MB, it still may be considered 'full' if it contains
624
# mixed content. The current rule is 2MB of mixed content is considered
625
# full. So check to see if this block contains mixed content, and
626
# set the threshold appropriately.
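        # For example (an assumption about key layout, typical for bzrlib
        # text indexes): keys are tuples like ('file-id', 'revision-id'), so
        # factory.key[:-1] below strips the revision id and leaves the
        # per-file prefix; a block whose factories disagree on that prefix is
        # holding mixed content.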
        common_prefix = None
        for factory in self._factories:
            prefix = factory.key[:-1]
            if common_prefix is None:
                common_prefix = prefix
            elif prefix != common_prefix:
                # Mixed content, check the size appropriately
                if block_size >= self._full_enough_mixed_block_size:
                    return True
                break
        # The content failed both the mixed check and the single-content
        # check, so obviously it is not fully utilized.
        # TODO: there is one other constraint that isn't being checked:
        #       namely, that the entries in the block are in the appropriate
        #       order. For example, you could insert the entries in exactly
        #       reverse groupcompress order, and we would think that is ok.
        #       (All the right objects are in one group, and it is fully
        #       utilized, etc.) For now, we assume that case is rare,
        #       especially since we should always fetch in 'groupcompress'
        #       order.
        return False
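
    # A rough sketch of the thresholds above (values taken from the comments,
    # not measured): with _full_enough_block_size at ~3MB and
    # _full_enough_mixed_block_size at ~2MB, a 3.5MB single-file block and a
    # 2.5MB mixed-content block both count as well utilized, while a 2.5MB
    # single-file block does not, and so remains a candidate for repacking.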

    def _check_rebuild_block(self):
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        if action is None:
            return
        if action == 'trim':
            self._trim_block(last_byte_used)
        elif action == 'rebuild':
            self._rebuild_block()
        else:
            raise ValueError('unknown rebuild action: %r' % (action,))
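
    # Hedged note: this helper is expected to run before a block is
    # serialized (for example on the _wire_bytes() path), so a sparsely used
    # block gets trimmed or rebuilt before it is transmitted.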

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True, _unadded_refs=None):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        :param _unadded_refs: private parameter, don't use.
        """
        self._index = index
        self._access = access
        self._delta = delta
        if _unadded_refs is None:
            _unadded_refs = {}
        self._unadded_refs = _unadded_refs
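        # The default is None rather than {} so that each instance gets a
        # fresh dict; a mutable {} default in the signature would be shared
        # across instances, a classic Python pitfall.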
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
        self._fallback_vfs = []
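        # Note (an inference from LRUSizeCache semantics): the cache is
        # bounded by total byte size, so up to ~50MB of decompressed groups
        # stay resident, and repeated reads from the same block avoid
        # re-inflating it.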

    def without_fallbacks(self):
        """Return a clone of this object without any fallbacks configured."""
        return GroupCompressVersionedFiles(self._index, self._access,
            self._delta, _unadded_refs=dict(self._unadded_refs))
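
    # dict(self._unadded_refs) takes a shallow copy, so the clone sees the
    # unadded keys recorded so far while later additions to either object do
    # not leak into the other.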

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):

            if reuse_blocks:
                # If the reuse_blocks flag is set, check to see if we can just
                # copy a groupcompress block as-is.
                # We only check on the first record (groupcompress-block), not
                # on all of the (groupcompress-block-ref) entries.
                # The reuse_this_block flag is then kept for as long as we
                # keep seeing records that came from this same block.
                if record.storage_kind == 'groupcompress-block':
                    # Check to see if we really want to re-use this block
                    insert_manager = record._manager
                    reuse_this_block = insert_manager.check_is_well_utilized()
            else:
                reuse_this_block = False
            if reuse_this_block:
                # We still want to reuse this block
                if record.storage_kind == 'groupcompress-block':
                    # Insert the raw block into the target repo
                    insert_manager = record._manager
                    bytes = record._manager._block.to_bytes()
                    _, start, length = self._access.add_raw_records(
                        [(None, len(bytes))], bytes)[0]
                    block_start = start
                    block_length = length
                if record.storage_kind in ('groupcompress-block',
                                           'groupcompress-block-ref'):
                    if insert_manager is None:
                        raise AssertionError('No insert_manager set')
                    if insert_manager is not record._manager:
                        raise AssertionError('insert_manager does not match'
                            ' the current record, we cannot be positive'
                            ' that the appropriate content was inserted.')
                    value = "%d %d %d %d" % (block_start, block_length,
                        record._start, record._end)
                    nodes = [(record.key, value, (record.parents,))]
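                    # Illustrative value (hypothetical numbers): a record
                    # covering bytes [10, 250) of a 4096-byte block stored at
                    # offset 0 yields value = "0 4096 10 250", i.e. the block
                    # start and length, then the record's start/end offsets
                    # within the uncompressed block content.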

                result.append((key, value))
            records = result
        key_dependencies = self._key_dependencies
        if key_dependencies is not None:
            if self._parents:
                for key, value, refs in records:
                    parents = refs[0]
                    key_dependencies.add_references(key, parents)
            else:
                for key, value, refs in records:
                    new_keys.add_key(key)
        self._add_callback(records)
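        # Each node is a (key, value, refs) tuple, so refs[0] above is the
        # parents tuple; e.g. a node might look like
        # (('file-id', 'rev-2'), '0 4096 10 250', ((('file-id', 'rev-1'),),)).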

    def _check_read(self):

    def scan_unvalidated_index(self, graph_index):
        """Inform the _GCGraphIndex that there is an unvalidated index.

        This allows this _GCGraphIndex to keep track of any missing
        compression parents we may want to have filled in to make those
        indices valid. It also allows _GCGraphIndex to track any new keys.

        :param graph_index: A GraphIndex
        """
        key_dependencies = self._key_dependencies
        if key_dependencies is None:
            return
        for node in graph_index.iter_all_entries():
            # Add parent refs from graph_index (and discard parent refs
            # that the graph_index has).
            key_dependencies.add_references(node[1], node[3][0])
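        # GraphIndex.iter_all_entries() yields (index, key, value, refs)
        # tuples when the index stores reference lists, so node[1] is the key
        # and node[3][0] is its first reference list: the parent references
        # recorded above.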


from bzrlib._groupcompress_py import (