~bzr-pqm/bzr/bzr.dev

Viewing changes to tests/test__groupcompress_pyx.py

Committer: John Arbash Meinel
Date: 2009-03-04 16:01:55 UTC
mfrom: (0.23.59 rabin)
mto: (0.17.34 trunk) (3735.31.1 groupcompress)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: john@arbash-meinel.com-20090304160155-66iy2jorb5h39n6d

Bring in the 'rabin' experiment.
Change the names and disk-strings for the various repository formats.
Make the CHK format repositories all 'rich-root' we can introduce non-rich-root later.
Make a couple other small tweaks, like copyright statements, etc.
Remove patch-delta.c, at this point, it was only a reference implementation,
as we have fully integrated the patching into pyrex, to allow nicer exception
handling.

files added:
delta.h

diff-delta.c

files renamed:
_groupcompress_c.pyx => _groupcompress_pyx.pyx

tests/test__groupcompress_c.py => tests/test__groupcompress_pyx.py

files modified:
.bzrignore

TODO

__init__.py

groupcompress.py

repofmt.py

setup.py

tests/__init__.py

tests/test_groupcompress.py

Show diffs side-by-side

added added

removed removed

tests/test__groupcompress_pyx.py

def _probe(self):

try:

import bzrlib.plugins.groupcompress._groupcompress_c

import bzrlib.plugins.groupcompress._groupcompress_pyx

except ImportError:

return False

else:

return True

def feature_name(self):

return 'bzrlib.plugins.groupcompress._groupcompress_c'

return 'bzrlib.plugins.groupcompress._groupcompress_pyx'

CompiledGroupCompress = _CompiledGroupCompress()

class TestCompiledEquivalenceTable(tests.TestCase):

"""Direct tests for the compiled Equivalence Table."""

_tests_need_features = [CompiledGroupCompress]

# These tests assume that hash(int) == int

# If that ever changes, we can simply change this code to use a custom

# class that has precomputed values returned from __hash__.

# TODO: We need a test that forces a rebuild of the hash array, and makes

# sure that the lines that should not be indexed *stay* not indexed

def setUp(self):

super(TestCompiledEquivalenceTable, self).setUp()

from bzrlib.plugins.groupcompress import _groupcompress_c

self._gc_module = _groupcompress_c

def test_minimum_hash_size(self):

eq = self._gc_module.EquivalenceTable([])

# We request at least 50% free space in the hash (to make collisions

# more bearable)

self.assertEqual(1024, eq._py_compute_minimum_hash_size(512))

self.assertEqual(2048, eq._py_compute_minimum_hash_size(513))

self.assertEqual(2048, eq._py_compute_minimum_hash_size(1000))

self.assertEqual(2048, eq._py_compute_minimum_hash_size(1024))

def test_recommended_hash_size(self):

eq = self._gc_module.EquivalenceTable([])

# We always recommend a minimum of 8k

self.assertEqual(8192, eq._py_compute_recommended_hash_size(10))

self.assertEqual(8192, eq._py_compute_recommended_hash_size(1000))

self.assertEqual(8192, eq._py_compute_recommended_hash_size(2000))

self.assertEqual(16384, eq._py_compute_recommended_hash_size(4000))

# And we recommend at least 75% free slots

self.assertEqual(16384, eq._py_compute_recommended_hash_size(4096))

self.assertEqual(32768, eq._py_compute_recommended_hash_size(4097))

def test__raw_lines(self):

eq = self._gc_module.EquivalenceTable([1, 2, 3])

self.assertEqual([(1, 1, 1, -1), (2, 2, 2, -1), (3, 3, 3, -1)],

eq._inspect_left_lines())

def test_build_hash(self):

eq = self._gc_module.EquivalenceTable([1, 2, 3])

# (size, [(offset, head_offset_in_lines, count)])

self.assertEqual((8192, [(1, 0, 1), (2, 1, 1), (3, 2, 1)]),

eq._inspect_hash_table())

def test_build_hash_with_duplicates(self):

eq = self._gc_module.EquivalenceTable([1, 2, 4, 0, 1, 4, 2, 4])

self.assertEqual([

(1, 1, 1, 4),

(2, 2, 2, 6),

(4, 4, 4, 5),

(0, 0, 0, -1),

(1, 1, 1, -1),

(4, 4, 4, 7),

(2, 2, 2, -1),

100

(4, 4, 4, -1),

101

], eq._inspect_left_lines())

102

# (hash_offset, head_offset_in_lines, count)

103

self.assertEqual((8192, [

104

(0, 3, 1),

105

(1, 0, 2),

106

(2, 1, 2),

107

(4, 2, 3),

108

]), eq._inspect_hash_table())

109

110

def test_build_hash_table_with_wraparound(self):

111

eq = self._gc_module.EquivalenceTable([1, 2+8192])

112

self.assertEqual([

113

(1, 1, 1, -1),

114

(8194, 8194, 2, -1),

115

], eq._inspect_left_lines())

116

self.assertEqual((8192, [

117

(1, 0, 1),

118

(2, 1, 1),

119

]), eq._inspect_hash_table())

120

121

def test_build_hash_table_with_collisions(self):

122

# We build up backwards, so # 2+8192 will wrap around to 2, and take

123

# its spot because the 2 offset is taken, then the real '2' will get

124

# bumped to 3, which will bump 3 into 4. then when we have 5, it will

125

# be fine, but the 8192+5 will get bumped to 6

126

eq = self._gc_module.EquivalenceTable([1, 5+8192, 5, 3, 2, 2+8192])

127

self.assertEqual([

128

(1, 1, 1, -1),

129

(8197, 8197, 6, -1),

130

(5, 5, 5, -1),

131

(3, 3, 4, -1),

132

(2, 2, 3, -1),

133

(8194, 8194, 2, -1),

134

], eq._inspect_left_lines())

135

self.assertEqual((8192, [

136

(1, 0, 1),

137

(2, 5, 1),

138

(3, 4, 1),

139

(4, 3, 1),

140

(5, 2, 1),

141

(6, 1, 1),

142

]), eq._inspect_hash_table())

143

144

def test_build_hash_table_with_wrapped_collisions(self):

145

eq = self._gc_module.EquivalenceTable([0, 8191+8192, 8191])

146

self.assertEqual([

147

(0, 0, 1, -1),

148

(16383, 16383, 0, -1),

149

(8191, 8191, 8191, -1),

150

], eq._inspect_left_lines())

151

self.assertEqual((8192, [

152

(0, 1, 1),

153

(1, 0, 1),

154

(8191, 2, 1),

155

]), eq._inspect_hash_table())

156

157

def test_build_hash_table_with_strings(self):

158

eq = self._gc_module.EquivalenceTable(['a', 'b', 'c', 'b'])

159

self.assertEqual([

160

('a', hash('a'), hash('a') & 8191, -1),

161

('b', hash('b'), hash('b') & 8191, 3),

162

('c', hash('c'), hash('c') & 8191, -1),

163

('b', hash('b'), hash('b') & 8191, -1),

164

], eq._inspect_left_lines())

165

self.assertEqual((8192, sorted([

166

(hash('a') & 8191, 0, 1),

167

(hash('b') & 8191, 1, 2),

168

(hash('c') & 8191, 2, 1),

169

])), eq._inspect_hash_table())

170

171

def test_with_unhashable(self):

172

self.assertRaises(TypeError, self._gc_module.EquivalenceTable,

173

[['unhashable', 'list'], 'foo'])

174

self.assertRaises(TypeError, self._gc_module.EquivalenceTable,

175

('foo', ['unhashable', 'list']))

176

177

def test_extend_lines(self):

178

eq = self._gc_module.EquivalenceTable([10, 20, 30, 20])

179

eq.extend_lines([30, 20, 40], [True, True, True])

180

self.assertEqual([

181

(10, 10, 10, -1),

182

(20, 20, 20, 3),

183

(30, 30, 30, 4),

184

(20, 20, 20, 5),

185

(30, 30, 30, -1),

186

(20, 20, 20, -1),

187

(40, 40, 40, -1),

188

], eq._inspect_left_lines())

189

# (hash_offset, head_offset_in_lines, count)

190

self.assertEqual((8192, [

191

(10, 0, 1),

192

(20, 1, 3),

193

(30, 2, 2),

194

(40, 6, 1),

195

]), eq._inspect_hash_table())

196

197

def test_extend_lines_ignored(self):

198

eq = self._gc_module.EquivalenceTable([10, 20, 30, 20])

199

eq.extend_lines([30, 20, 40], [True, False, False])

200

self.assertEqual([

201

(10, 10, 10, -1),

202

(20, 20, 20, 3),

203

(30, 30, 30, 4),

204

(20, 20, 20, -1),

205

(30, 30, 30, -1),

206

(20, 20, -1, -1),

207

(40, 40, -1, -1),

208

], eq._inspect_left_lines())

209

# (hash_offset, head_offset_in_lines, count)

210

self.assertEqual((8192, [

211

(10, 0, 1),

212

(20, 1, 2),

213

(30, 2, 2),

214

]), eq._inspect_hash_table())

215

216

217

class TestGetLongestMatch(tests.TestCase):

218

219

def setUp(self):

220

super(TestGetLongestMatch, self).setUp()

221

self._longest_match = groupcompress._get_longest_match

222

self._eq_class = groupcompress.GroupCompressor._equivalence_table_class

223

224

def assertLongestMatch(self, block, end_pos, matching_locs,

225

eq, start_pos, max_len, known_locs):

226

"""Check that _get_longest_match gives correct results."""

227

the_block, the_end_pos, next_locs = self._longest_match(eq,

228

start_pos, max_len, known_locs)

229

self.assertEqual(block, the_block)

230

self.assertEqual(end_pos, the_end_pos)

231

if next_locs is not None:

232

self.assertEqual(matching_locs, next_locs)

233

234

def test_all_match(self):

235

eq = self._eq_class(['a', 'b', 'c'])

236

eq.set_right_lines(['a', 'b', 'c'])

237

self.assertLongestMatch((0, 0, 3), 3, None,

238

eq, 0, 3, None)

239

self.assertLongestMatch((0, 0, 3), 3, None,

240

eq, 0, 3, [0])

241

self.assertLongestMatch((1, 1, 2), 3, None,

242

eq, 1, 3, None)

243

self.assertLongestMatch((1, 1, 2), 3, None,

244

eq, 1, 3, [1])

245

246

def test_no_match(self):

247

eq = self._eq_class(['a', 'b', 'c'])

248

eq.set_right_lines(['d', 'e', 'f'])

249

self.assertLongestMatch(None, 1, None,

250

eq, 0, 3, None)

251

self.assertLongestMatch(None, 2, None,

252

eq, 1, 3, None)

253

self.assertLongestMatch(None, 3, None,

254

eq, 2, 3, None)

255

256

def test_next_match(self):

257

eq = self._eq_class(['a', 'b', 'c'])

258

eq.set_right_lines(['a', 'c'])

259

self.assertLongestMatch((0, 0, 1), 1, [2],

260

eq, 0, 2, None)

261

self.assertLongestMatch((2, 1, 1), 2, None,

262

eq, 1, 2, None)

263

264

def test_lots_of_matches(self):

265

eq = self._eq_class(['a']*1000)

266

eq.set_right_lines(['a']*1000)

267

self.assertLongestMatch((0, 0, 1000), 1000, None,

268

eq, 0, 1000, None)

269

eq = self._eq_class(['a']*1000)

270

eq.set_right_lines(['a']*2000)

271

self.assertLongestMatch((0, 0, 1000), 1000, range(1000),

272

eq, 0, 2000, None)

273

eq = self._eq_class(['a']*2000)

274

eq.set_right_lines(['a']*1000)

275

self.assertLongestMatch((0, 0, 1000), 1000, None,

276

eq, 0, 1000, None)

277

278

279

class TestCompiledGetLongestMatch(TestGetLongestMatch):

280

281

_tests_need_features = [CompiledGroupCompress]

282

283

def setUp(self):

284

super(TestGetLongestMatch, self).setUp()

285

from bzrlib.plugins.groupcompress import _groupcompress_c

286

self._longest_match = _groupcompress_c._get_longest_match

287

self._eq_class = _groupcompress_c.EquivalenceTable

288

_text1 = """\

This is a bit

of source text

which is meant to be matched

against other text

"""

_text2 = """\

This is a bit

of source text

which is meant to differ from

against other text

"""

_text3 = """\

This is a bit

of source text

which is meant to be matched

against other text

except it also

has a lot more data

at the end of the file

"""

_first_text = """\

a bit of text, that

does not have much in

common with the next text

"""

_second_text = """\

some more bit of text, that

does not have much in

common with the previous text

and has some extra text

"""

_third_text = """\

a bit of text, that

has some in common with the previous text

and has some extra text

and not have much in

common with the next text

"""

class Test_GroupCompress(tests.TestCase):

"""Direct tests for the compiled extension."""

def setUp(self):

super(Test_GroupCompress, self).setUp()

self.requireFeature(CompiledGroupCompress)

from bzrlib.plugins.groupcompress import _groupcompress_pyx

self._gc_module = _groupcompress_pyx

class TestMakeAndApplyDelta(Test_GroupCompress):

def setUp(self):

super(TestMakeAndApplyDelta, self).setUp()

100

self.make_delta = self._gc_module.make_delta

101

self.apply_delta = self._gc_module.apply_delta

102

103

def test_make_delta_is_typesafe(self):

104

self.make_delta('a string', 'another string')

105

self.assertRaises(TypeError,

106

self.make_delta, 'a string', object())

107

self.assertRaises(TypeError,

108

self.make_delta, 'a string', u'not a string')

109

self.assertRaises(TypeError,

110

self.make_delta, object(), 'a string')

111

self.assertRaises(TypeError,

112

self.make_delta, u'not a string', 'a string')

113

114

def test_make_noop_delta(self):

115

ident_delta = self.make_delta(_text1, _text1)

116

self.assertEqual('MM\x90M', ident_delta)

117

ident_delta = self.make_delta(_text2, _text2)

118

self.assertEqual('NN\x90N', ident_delta)

119

ident_delta = self.make_delta(_text3, _text3)

120

self.assertEqual('\x87\x01\x87\x01\x90\x87', ident_delta)

121

122

def test_make_delta(self):

123

delta = self.make_delta(_text1, _text2)

124

self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)

125

delta = self.make_delta(_text2, _text1)

126

self.assertEqual('NM\x90/\x1ebe matched\nagainst other text\n', delta)

127

delta = self.make_delta(_text3, _text1)

128

self.assertEqual('\x87\x01M\x90M', delta)

129

delta = self.make_delta(_text3, _text2)

130

self.assertEqual('\x87\x01N\x90/\x1fdiffer from\nagainst other text\n',

131

delta)

132

133

def test_apply_delta_is_typesafe(self):

134

self.apply_delta(_text1, 'MM\x90M')

135

self.assertRaises(TypeError,

136

self.apply_delta, object(), 'MM\x90M')

137

self.assertRaises(TypeError,

138

self.apply_delta, unicode(_text1), 'MM\x90M')

139

self.assertRaises(TypeError,

140

self.apply_delta, _text1, u'MM\x90M')

141

self.assertRaises(TypeError,

142

self.apply_delta, _text1, object())

143

144

def test_apply_delta(self):

145

target = self.apply_delta(_text1,

146

'MN\x90/\x1fdiffer from\nagainst other text\n')

147

self.assertEqual(_text2, target)

148

target = self.apply_delta(_text2,

149

'NM\x90/\x1ebe matched\nagainst other text\n')

150

self.assertEqual(_text1, target)

151

152

153

class TestDeltaIndex(Test_GroupCompress):

154

155

def test_repr(self):

156

di = self._gc_module.DeltaIndex('test text\n')

157

self.assertEqual('DeltaIndex(1, 10)', repr(di))

158

159

def test_make_delta(self):

160

di = self._gc_module.DeltaIndex(_text1)

161

delta = di.make_delta(_text2)

162

self.assertEqual('MN\x90/\x1fdiffer from\nagainst other text\n', delta)

163

164

def test_delta_against_multiple_sources(self):

165

di = self._gc_module.DeltaIndex()

166

di.add_source(_first_text, 0)

167

self.assertEqual(len(_first_text), di._source_offset)

168

di.add_source(_second_text, 0)

169

self.assertEqual(len(_first_text) + len(_second_text), di._source_offset)

170

delta = di.make_delta(_third_text)

171

result = self._gc_module.apply_delta(_first_text + _second_text, delta)

172

self.assertEqualDiff(_third_text, result)

173

self.assertEqual('\xac\x01\x85\x01\x90\x14\x0chas some in '

174

'\x91v6\x03and\x91d"\x91:\n', delta)

175

176

def test_delta_with_offsets(self):

177

di = self._gc_module.DeltaIndex()

178

di.add_source(_first_text, 5)

179

self.assertEqual(len(_first_text) + 5, di._source_offset)

180

di.add_source(_second_text, 10)

181

self.assertEqual(len(_first_text) + len(_second_text) + 15,

182

di._source_offset)

183

delta = di.make_delta(_third_text)

184

self.assertIsNot(None, delta)

185

result = self._gc_module.apply_delta(

186

'12345' + _first_text + '1234567890' + _second_text, delta)

187

self.assertIsNot(None, result)

188

self.assertEqualDiff(_third_text, result)

189

self.assertEqual('\xbb\x01\x85\x01\x91\x05\x14\x0chas some in '

190

'\x91\x856\x03and\x91s"\x91?\n', delta)

191

192

def test_delta_with_delta_bytes(self):

193

di = self._gc_module.DeltaIndex()

194

di.add_source(_first_text, 0)

195

self.assertEqual(len(_first_text), di._source_offset)

196

delta = di.make_delta(_second_text)

197

self.assertEqual('Dh\tsome more\x91\x019'

198

'&previous text\nand has some extra text\n', delta)

199

di.add_delta_source(delta, 0)

200

self.assertEqual(len(_first_text) + len(delta), di._source_offset)

201

third_delta = di.make_delta(_third_text)

202

result = self._gc_module.apply_delta(_first_text + delta, third_delta)

203

self.assertEqualDiff(_third_text, result)

204

# We should be able to match against the 'previous text\nand has some...'

205

# that was part of the delta bytes

206

# Note that we don't match the 'common with the', because it isn't long

207

# enough to match in the original text, and those bytes are not present

208

# in the delta for the second text.

209

self.assertEqual('z\x85\x01\x90\x14\x1chas some in common with the '

210

'\x91T&\x03and\x91\x18,', third_delta)

Older »