~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to tests/test_groupcompress.py

Another disk-format bump.

Move the labels/sha1 information into a pre-header. This also makes it
easier to decide to enable/disable the headers, as we can support
both with the same deserialising code (at least until we remove
the extra info from the indexes.)

This also makes a fulltext record stream start with 'f' and a delta
record stream start with 'd', which makes them more self describing.
The next step would probably be to write the base128 length of the
encoded bytes, which would make them fully independent, though
you wouldn't know what content they refer to.

This also brings in an update to .compress() which allows us to
see that we overflowed our group, roll back and start a new one.
This seems to give better compression in a 'more stable' manner.
Still open to tweaking, though.

Also introduce the 'gcc-chk255-big' format, which uses 64k leaf pages
rather than 4k leaf pages. Initial results show smaller compressed
size at a small (10%) increase in uncompressed size. Also shows
a full level decrease in the tree depth.

No-labels decreases the inv size by approximately 300kB, and big-page decreases
the inv size another 300kB, not to mention the 116k decrease in the
.cix index, just from not having the extra pages.

Having both no-labels and big inv pages brings a total drop of
11023k down to 9847k for the repo (1176kB savings, or 10% overall).

For now, leave the default with labels, but consider changing it.

Show diffs side-by-side

added added

removed removed

Lines of Context:
49
49
    return standard_tests
50
50
 
51
51
 
52
 
class TestGroupCompressor(TestCaseWithTransport):
 
52
class TestGroupCompressor(tests.TestCase):
53
53
    """Tests for GroupCompressor"""
54
54
 
55
55
    def test_empty_delta(self):
59
59
    def test_one_nosha_delta(self):
60
60
        # diff against NUKK
61
61
        compressor = groupcompress.GroupCompressor(True)
62
 
        sha1, end_point = compressor.compress(('label',),
 
62
        sha1, end_point, _, _ = compressor.compress(('label',),
63
63
            'strange\ncommon\n', None)
64
64
        self.assertEqual(sha_string('strange\ncommon\n'), sha1)
65
65
        expected_lines = [
66
 
            'fulltext\n',
67
 
            'label:label\nsha1:%s\n' % sha1,
68
 
            'len:15\n',
69
66
            'strange\ncommon\n',
70
67
            ]
71
68
        self.assertEqual(expected_lines, compressor.lines)
88
85
 
89
86
    def test_two_nosha_delta(self):
90
87
        compressor = groupcompress.GroupCompressor(True)
91
 
        sha1_1, _ = compressor.compress(('label',),
 
88
        sha1_1, _, _, _ = compressor.compress(('label',),
92
89
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
93
90
        expected_lines = list(compressor.lines)
94
 
        sha1_2, end_point = compressor.compress(('newlabel',),
 
91
        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
95
92
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
96
93
        self.assertEqual(sha_string('common long line\n'
97
94
                                    'that needs a 16 byte match\n'
98
95
                                    'different\n'), sha1_2)
99
96
        expected_lines.extend([
100
 
            'delta\n'
101
 
            'label:newlabel\n',
102
 
            'sha1:%s\n' % sha1_2,
103
 
            'len:16\n',
104
97
            # source and target length
105
 
            '\x7e\x36',
 
98
            '\x34\x36',
106
99
            # copy the line common
107
 
            '\x91\x52\x2c', #copy, offset 0x52, len 0x2c
 
100
            '\x91\x08\x2c', #copy, offset 0x08, len 0x2c
108
101
            # add the line different, and the trailing newline
109
102
            '\x0adifferent\n', # insert 10 bytes
110
103
            ])
115
108
        # The first interesting test: make a change that should use lines from
116
109
        # both parents.
117
110
        compressor = groupcompress.GroupCompressor(True)
118
 
        sha1_1, end_point = compressor.compress(('label',),
 
111
        sha1_1, end_point, _, _ = compressor.compress(('label',),
119
112
            'strange\ncommon very very long line\nwith some extra text\n', None)
120
 
        sha1_2, _ = compressor.compress(('newlabel',),
 
113
        sha1_2, _, _, _ = compressor.compress(('newlabel',),
121
114
            'different\nmoredifferent\nand then some more\n', None)
122
115
        expected_lines = list(compressor.lines)
123
 
        sha1_3, end_point = compressor.compress(('label3',),
 
116
        sha1_3, end_point, _, _ = compressor.compress(('label3',),
124
117
            'new\ncommon very very long line\nwith some extra text\n'
125
118
            'different\nmoredifferent\nand then some more\n',
126
119
            None)
129
122
                       'different\nmoredifferent\nand then some more\n'),
130
123
            sha1_3)
131
124
        expected_lines.extend([
132
 
            'delta\n',
133
 
            'label:label3\n',
134
 
            'sha1:%s\n' % sha1_3,
135
 
            'len:13\n',
136
 
            '\xfa\x01\x5f' # source and target length
 
125
            '\x63\x5f' # source and target length
137
126
            # insert new
138
127
            '\x03new',
139
128
            # Copy of first parent 'common' range
140
 
            '\x91\x51\x31' # copy, offset 0x51, 0x31 bytes
 
129
            '\x91\x07\x31' # copy, offset 0x07, 0x31 bytes
141
130
            # Copy of second parent 'different' range
142
 
            '\x91\xcf\x2b' # copy, offset 0xcf, 0x2b bytes
 
131
            '\x91\x38\x2b' # copy, offset 0x38, 0x2b bytes
143
132
            ])
144
133
        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
145
134
        self.assertEqual(sum(map(len, expected_lines)), end_point)
146
135
 
147
136
    def test_stats(self):
148
137
        compressor = groupcompress.GroupCompressor(True)
149
 
        compressor.compress(('label',), 'strange\ncommon\n', None)
 
138
        compressor.compress(('label',), 'strange\ncommon long line\n'
 
139
                                        'plus more text\n', None)
150
140
        compressor.compress(('newlabel',),
151
 
                            'common\ndifferent\nmoredifferent\n', None)
 
141
                            'common long line\nplus more text\n'
 
142
                            'different\nmoredifferent\n', None)
152
143
        compressor.compress(('label3',),
153
 
                            'new\ncommon\ndifferent\nmoredifferent\n', None)
154
 
        self.assertAlmostEqual(0.3, compressor.ratio(), 1)
 
144
                            'new\ncommon long line\nplus more text\n'
 
145
                            '\ndifferent\nmoredifferent\n', None)
 
146
        self.assertAlmostEqual(1.4, compressor.ratio(), 1)
155
147
 
156
148
    def test_extract_from_compressor(self):
157
149
        # Knit fetching will try to reconstruct texts locally which results in
158
150
        # reading something that is in the compressor stream already.
159
151
        compressor = groupcompress.GroupCompressor(True)
160
 
        sha_1,  _ = compressor.compress(('label',), 'strange\ncommon\n', None)
161
 
        sha_2, _ = compressor.compress(('newlabel',),
162
 
            'common\ndifferent\nmoredifferent\n', None)
 
152
        sha1_1, _, _, _ = compressor.compress(('label',),
 
153
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
 
154
        expected_lines = list(compressor.lines)
 
155
        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
 
156
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
163
157
        # get the first out
164
 
        self.assertEqual((['strange\ncommon\n'], sha_1),
 
158
        self.assertEqual(('strange\ncommon long line\n'
 
159
                          'that needs a 16 byte match\n', sha1_1),
165
160
            compressor.extract(('label',)))
166
161
        # and the second
167
 
        self.assertEqual((['common\ndifferent\nmoredifferent\n'],
168
 
            sha_2), compressor.extract(('newlabel',)))
 
162
        self.assertEqual(('common long line\nthat needs a 16 byte match\n'
 
163
                          'different\n', sha1_2),
 
164
                         compressor.extract(('newlabel',)))
 
165
 
 
166
 
 
167
class TestBase128Int(tests.TestCase):
 
168
 
 
169
    def assertEqualEncode(self, bytes, val):
 
170
        self.assertEqual(bytes, groupcompress.encode_base128_int(val))
 
171
 
 
172
    def assertEqualDecode(self, val, num_decode, bytes):
 
173
        self.assertEqual((val, num_decode),
 
174
                         groupcompress.decode_base128_int(bytes))
 
175
 
 
176
    def test_encode(self):
 
177
        self.assertEqualEncode('\x01', 1)
 
178
        self.assertEqualEncode('\x02', 2)
 
179
        self.assertEqualEncode('\x7f', 127)
 
180
        self.assertEqualEncode('\x80\x01', 128)
 
181
        self.assertEqualEncode('\xff\x01', 255)
 
182
        self.assertEqualEncode('\x80\x02', 256)
 
183
        self.assertEqualEncode('\xff\xff\xff\xff\x0f', 0xFFFFFFFF)
 
184
 
 
185
    def test_decode(self):
 
186
        self.assertEqualDecode(1, 1, '\x01')
 
187
        self.assertEqualDecode(2, 1, '\x02')
 
188
        self.assertEqualDecode(127, 1, '\x7f')
 
189
        self.assertEqualDecode(128, 2, '\x80\x01')
 
190
        self.assertEqualDecode(255, 2, '\xff\x01')
 
191
        self.assertEqualDecode(256, 2, '\x80\x02')
 
192
        self.assertEqualDecode(0xFFFFFFFF, 5, '\xff\xff\xff\xff\x0f')
 
193
 
 
194
    def test_decode_with_trailing_bytes(self):
 
195
        self.assertEqualDecode(1, 1, '\x01abcdef')
 
196
        self.assertEqualDecode(127, 1, '\x7f\x01')
 
197
        self.assertEqualDecode(128, 2, '\x80\x01abcdef')
 
198
        self.assertEqualDecode(255, 2, '\xff\x01\xff')
 
199
 
 
200
 
 
201
class TestGroupCompressBlock(tests.TestCase):
 
202
 
 
203
    def test_from_empty_bytes(self):
 
204
        self.assertRaises(errors.InvalidGroupCompressBlock,
 
205
                          groupcompress.GroupCompressBlock.from_bytes, '')
 
206
 
 
207
    def test_from_minimal_bytes(self):
 
208
        block = groupcompress.GroupCompressBlock.from_bytes('gcb1z\n0\n0\n')
 
209
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
 
210
        self.assertEqual({}, block._entries)
 
211
 
 
212
    def test_from_bytes(self):
 
213
        z_header_bytes = (
 
214
            'gcb1z\n' # group compress block v1 plain
 
215
            '76\n' # Length of zlib bytes
 
216
            '183\n' # Length of all meta-info
 
217
            + zlib.compress(
 
218
            'key:bing\n'
 
219
            'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
220
            'type:fulltext\n'
 
221
            'start:100\n'
 
222
            'length:100\n'
 
223
            '\n'
 
224
            'key:foo\x00bar\n'
 
225
            'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
226
            'type:fulltext\n'
 
227
            'start:0\n'
 
228
            'length:100\n'
 
229
            '\n'))
 
230
        block = groupcompress.GroupCompressBlock.from_bytes(
 
231
            z_header_bytes)
 
232
        self.assertIs(None, block._content)
 
233
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
 
234
        self.assertEqual([('bing',), ('foo', 'bar')], sorted(block._entries))
 
235
        bing = block._entries[('bing',)]
 
236
        self.assertEqual(('bing',), bing.key)
 
237
        self.assertEqual('fulltext', bing.type)
 
238
        self.assertEqual('abcd'*10, bing.sha1)
 
239
        self.assertEqual(100, bing.start)
 
240
        self.assertEqual(100, bing.length)
 
241
        foobar = block._entries[('foo', 'bar')]
 
242
        self.assertEqual(('foo', 'bar'), foobar.key)
 
243
        self.assertEqual('fulltext', foobar.type)
 
244
        self.assertEqual('abcd'*10, foobar.sha1)
 
245
        self.assertEqual(0, foobar.start)
 
246
        self.assertEqual(100, foobar.length)
 
247
 
 
248
    def test_add_entry(self):
 
249
        gcb = groupcompress.GroupCompressBlock()
 
250
        e = gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
 
251
        self.assertIsInstance(e, groupcompress.GroupCompressBlockEntry)
 
252
        self.assertEqual(('foo', 'bar'), e.key)
 
253
        self.assertEqual('fulltext', e.type)
 
254
        self.assertEqual('abcd'*10, e.sha1)
 
255
        self.assertEqual(0, e.start)
 
256
        self.assertEqual(100, e.length)
 
257
 
 
258
    def test_to_bytes(self):
 
259
        gcb = groupcompress.GroupCompressBlock()
 
260
        gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
 
261
        gcb.add_entry(('bing',), 'fulltext', 'abcd'*10, 100, 100)
 
262
        bytes = gcb.to_bytes()
 
263
        self.assertStartsWith(bytes,
 
264
                              'gcb1z\n' # group compress block v1 zlib
 
265
                              '77\n' # Length of compressed bytes
 
266
                              '183\n' # Length of all meta-info
 
267
                             )
 
268
        remaining_bytes = bytes[13:]
 
269
        raw_bytes = zlib.decompress(remaining_bytes)
 
270
        self.assertEqualDiff('key:bing\n'
 
271
                             'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
272
                             'type:fulltext\n'
 
273
                             'start:100\n'
 
274
                             'length:100\n'
 
275
                             '\n'
 
276
                             'key:foo\x00bar\n'
 
277
                             'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
278
                             'type:fulltext\n'
 
279
                             'start:0\n'
 
280
                             'length:100\n'
 
281
                             '\n', raw_bytes)