~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to groupcompress.py

Committer: Robert Collins
Date: 2008-07-25 00:09:27 UTC
mfrom: (0.18.31 groupcompress)
mto: (0.21.2 dev5) (3735.31.1 groupcompress)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: robertc@robertcollins.net-20080725000927-d88agg6z2bfzcks2

Merge John's pyrex accelerator.

files added:
.bzrignore

_groupcompress_c.pyx

equivalence_table.py

tests/test__groupcompress_c.py

tests/test_equivalence_table.py

files modified:
groupcompress.py

setup.py

tests/__init__.py

tests/test_groupcompress.py

Show diffs side-by-side

added added

removed removed

groupcompress.py

split_lines,

)

from bzrlib.plugins.index2.btree_index import BTreeBuilder

from bzrlib.plugins.groupcompress import equivalence_table

from bzrlib.tsort import topo_sort

from bzrlib.versionedfile import (

adapter_registry,

109

110

left side.

110

111

"""

111

112

113

_equivalence_table_class = equivalence_table.EquivalenceTable

114

112

115

def __init__(self, delta=True):

113

116

"""Create a GroupCompressor.

114

117

115

118

:paeam delta: If False, do not compress records.

116

119

"""

117

120

self._delta = delta

118

self.lines = []

119

121

self.line_offsets = []

120

122

self.endpoint = 0

121

123

self.input_bytes = 0

122

# line: set(locations it appears at), set(N+1 for N in locations)

123

self.line_locations = {}

124

self.line_locations = self._equivalence_table_class([])

125

self.lines = self.line_locations.lines

124

126

self.labels_deltas = {}

125

127

126

128

def get_matching_blocks(self, lines):

132

134

of the list is always (old_len, new_len, 0) to provide a end point

133

135

for generating instructions from the matching blocks list.

134

136

"""

137

import time

135

138

result = []

136

139

pos = 0

137

140

line_locations = self.line_locations

138

range_len = 0

139

range_start = 0

140

copy_ends = None

141

line_locations.set_right_lines(lines)

141

142

# We either copy a range (while there are reusable lines) or we

142

143

# insert new lines. To find reusable lines we traverse

143

while pos < len(lines):

144

line = lines[pos]

145

if line not in line_locations:

146

if copy_ends:

147

result.append((min(copy_ends) - range_len, range_start, range_len))

148

range_start = pos

149

range_len = 1

150

copy_ends = None

151

else:

152

range_len += 1

153

else:

154

locations = line_locations[line]

155

if copy_ends:

156

next_locations = locations.intersection(copy_ends)

157

if len(next_locations):

158

# range continues

159

range_len += 1

160

copy_ends = set(loc + 1 for loc in next_locations)

161

pos += 1

162

continue

163

if copy_ends:

164

result.append((min(copy_ends) - range_len, range_start, range_len))

165

range_len = 1

166

copy_ends = set(loc + 1 for loc in locations)

167

range_start = pos

168

pos += 1

169

if copy_ends:

170

result.append((min(copy_ends) - range_len, range_start, range_len))

144

locations = None

145

max_pos = len(lines)

146

timer = time.clock

147

max_time = 0.0

148

max_info = None

149

while pos < max_pos:

150

tstart = timer()

151

block, next_pos, locations = _get_longest_match(line_locations, pos,

152

max_pos, locations)

153

tdelta = timer() - tstart

154

if tdelta > max_time:

155

max_time = tdelta

156

max_info = tdelta, pos, block, next_pos, locations

157

pos = next_pos

158

159

if block is not None:

160

result.append(block)

171

161

result.append((len(self.lines), len(lines), 0))

162

# if max_time > 0.01:

163

# print max_info[:-1]

164

# import pdb; pdb.set_trace()

172

165

return result

173

166

174

167

def compress(self, key, lines, expected_sha):

267

260

:param index_lines: A boolean flag for each line - when True, index

268

261

that line.

269

262

"""

263

# indexed_newlines = [idx for idx, val in enumerate(index_lines)

264

# if val and new_lines[idx] == '\n']

265

# if indexed_newlines:

266

# import pdb; pdb.set_trace()

270

267

endpoint = self.endpoint

271

offset = len(self.lines)

272

line_append = self.line_offsets.append

273

setdefault = self.line_locations.setdefault

274

for (pos, line), index in izip(enumerate(new_lines), index_lines):

268

self.line_locations.extend_lines(new_lines, index_lines)

269

for line in new_lines:

275

270

endpoint += len(line)

276

line_append(endpoint)

277

if index:

278

indices = setdefault(line, set())

279

indices.add(pos + offset)

280

self.lines.extend(new_lines)

271

self.line_offsets.append(endpoint)

281

272

self.endpoint = endpoint

282

273

283

274

def ratio(self):

837

828

basis_end = int(bits[2])

838

829

delta_end = int(bits[3])

839

830

return node[0], start, stop, basis_end, delta_end

831

832

833

def _get_longest_match(equivalence_table, pos, max_pos, locations):

834

"""Get the longest possible match for the current position."""

835

range_start = pos

836

range_len = 0

837

copy_ends = None

838

while pos < max_pos:

839

if locations is None:

840

locations = equivalence_table.get_idx_matches(pos)

841

if locations is None:

842

# No more matches, just return whatever we have, but we know that

843

# this last position is not going to match anything

844

pos += 1

845

break

846

else:

847

if copy_ends is None:

848

# We are starting a new range

849

copy_ends = [loc + 1 for loc in locations]

850

range_len = 1

851

locations = None # Consumed

852

else:

853

# We are currently in the middle of a match

854

next_locations = set(copy_ends).intersection(locations)

855

if len(next_locations):

856

# range continues

857

copy_ends = [loc + 1 for loc in next_locations]

858

range_len += 1

859

locations = None # Consumed

860

else:

861

# But we are done with this match, we should be

862

# starting a new one, though. We will pass back 'locations'

863

# so that we don't have to do another lookup.

864

break

865

pos += 1

866

if copy_ends is None:

867

return None, pos, locations

868

return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations

869

870

871

try:

872

from bzrlib.plugins.groupcompress import _groupcompress_c

873

except ImportError:

874

pass

875

else:

876

GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable

877

_get_longest_match = _groupcompress_c._get_longest_match

Older »