~bzr-pqm/bzr/bzr.dev

Viewing changes to bzrlib/_known_graph_py.py

Committer: Canonical.com Patch Queue Manager
Date: 2009-06-21 04:30:25 UTC
mfrom: (4371.4.32 jam-gdfo-heads)
Revision ID: pqm@pqm.ubuntu.com-20090621043025-one1vuhpnsgdv5tv

(jam, vila) Improve the KnownGraph.heads() code,
using a min_gdfo value (can be another 10-100x faster on complex
graphs).

files modified:
NEWS

bzrlib/_known_graph_py.py

bzrlib/_known_graph_pyx.pyx

bzrlib/tests/test__known_graph.py

tools/time_graph.py

Show diffs side-by-side

added added

removed removed

bzrlib/_known_graph_py.py

"""Implementation of Graph algorithms when we have already loaded everything.

"""

import heapq

from bzrlib import (

revision,

)

class _KnownGraphNode(object):

"""Represents a single object in the known graph."""

__slots__ = ('key', 'parent_keys', 'child_keys', 'linear_dominator',

'gdfo', 'ancestor_of')

__slots__ = ('key', 'parent_keys', 'child_keys', 'gdfo')

def __init__(self, key, parent_keys):

self.key = key

self.parent_keys = parent_keys

self.child_keys = []

# oldest ancestor, such that no parents between here and there have >1

# child or >1 parent.

self.linear_dominator = None

# Greatest distance from origin

self.gdfo = None

# This will become a tuple of known heads that have this node as an

# ancestor

self.ancestor_of = None

def __repr__(self):

return '%s(%s gdfo:%s par:%s child:%s %s)' % (

return '%s(%s gdfo:%s par:%s child:%s)' % (

self.__class__.__name__, self.key, self.gdfo,

self.parent_keys, self.child_keys,

self.linear_dominator)

self.parent_keys, self.child_keys)

class KnownGraph(object):

self._known_heads = {}

self.do_cache = do_cache

self._initialize_nodes(parent_map)

self._find_linear_dominators()

self._find_gdfo()

def _initialize_nodes(self, parent_map):

"""Populate self._nodes.

After this has finished, self._nodes will have an entry for every entry

in parent_map. Ghosts will have a parent_keys = None, all nodes found

will also have .child_keys populated with all known child_keys.

After this has finished:

- self._nodes will have an entry for every entry in parent_map.

- ghosts will have a parent_keys = None,

- all nodes found will also have .child_keys populated with all known

child_keys,

- self._tails will list all the nodes without parents.

"""

tails = self._tails = set()

nodes = self._nodes

for key, parent_keys in parent_map.iteritems():

if key in nodes:

node = nodes[key]

node.parent_keys = parent_keys

if parent_keys:

# This node has been added before being seen in parent_map

# (see below)

tails.remove(node)

else:

node = _KnownGraphNode(key, parent_keys)

nodes[key] = node

except KeyError:

parent_node = _KnownGraphNode(parent_key, None)

nodes[parent_key] = parent_node

# Potentially a tail, if we're wrong we'll remove it later

# (see above)

tails.add(parent_node)

parent_node.child_keys.append(key)

def _find_linear_dominators(self):

"""For each node in the set, find any linear dominators.

For any given node, the 'linear dominator' is an ancestor, such that

all parents between this node and that one have a single parent, and a

single child. So if A->B->C->D then B,C,D all have a linear dominator

of A.

100

There are two main benefits:

101

1) When walking the graph, we can jump to the nearest linear dominator,

102

rather than walking all of the nodes inbetween.

103

2) When caching heads() results, dominators give the "same" results as

104

their children. (If the dominator is a head, then the descendant is

105

a head, if the dominator is not a head, then the child isn't

106

either.)

107

"""

108

def check_node(node):

109

if node.parent_keys is None or len(node.parent_keys) != 1:

110

# This node is either a ghost, a tail, or has multiple parents

111

# It its own dominator

112

node.linear_dominator = node.key

113

return None

114

parent_node = self._nodes[node.parent_keys[0]]

115

if len(parent_node.child_keys) > 1:

116

# The parent has multiple children, so *this* node is the

117

# dominator

118

node.linear_dominator = node.key

119

return None

120

# The parent is already filled in, so add and continue

121

if parent_node.linear_dominator is not None:

122

node.linear_dominator = parent_node.linear_dominator

123

return None

124

# We don't know this node, or its parent node, so start walking to

125

# next

126

return parent_node

127

128

for node in self._nodes.itervalues():

129

# The parent is not filled in, so walk until we get somewhere

130

if node.linear_dominator is not None: #already done

131

continue

132

next_node = check_node(node)

133

if next_node is None:

134

# Nothing more needs to be done

135

continue

136

stack = []

137

while next_node is not None:

138

stack.append(node)

139

node = next_node

140

next_node = check_node(node)

141

# The stack now contains the linear chain, and 'node' should have

142

# been labeled

143

dominator = node.linear_dominator

144

while stack:

145

next_node = stack.pop()

146

next_node.linear_dominator = dominator

147

node = next_node

148

149

def _find_gdfo(self):

150

def find_tails():

151

return [node for node in self._nodes.itervalues()

152

if not node.parent_keys]

153

tails = find_tails()

154

todo = []

155

heappush = heapq.heappush

156

heappop = heapq.heappop

157

nodes = self._nodes

158

for node in tails:

known_parent_gdfos = {}

pending = []

for node in self._tails:

159

node.gdfo = 1

160

heappush(todo, (1, node))

161

processed = 0

162

while todo:

163

gdfo, next = heappop(todo)

164

processed += 1

165

if next.gdfo is not None and gdfo < next.gdfo:

166

# This node was reached from a longer path, we assume it was

167

# enqued correctly with the longer gdfo, so don't continue

168

# processing now

169

continue

170

next_gdfo = gdfo + 1

171

for child_key in next.child_keys:

172

child_node = nodes[child_key]

173

if child_node.gdfo is None or child_node.gdfo < next_gdfo:

174

# Only enque children when all of their parents have been

175

# resolved

176

for parent_key in child_node.parent_keys:

177

# We know that 'this' parent is counted

178

if parent_key != next.key:

179

parent_node = nodes[parent_key]

180

if parent_node.gdfo is None:

181

break

182

else:

183

child_node.gdfo = next_gdfo

184

heappush(todo, (next_gdfo, child_node))

185

186

def _get_dominators_to_nodes(self, candidate_nodes):

187

"""Get the reverse mapping from dominator_key => candidate_nodes.

188

189

As a side effect, this can also remove potential candidate nodes if we

190

determine that they share a dominator.

191

"""

192

dom_to_node = {}

193

keys_to_remove = []

194

for node in candidate_nodes.values():

195

if node.linear_dominator in dom_to_node:

196

# This node already exists, resolve which node supersedes the

197

# other

198

other_node = dom_to_node[node.linear_dominator]

199

# There should be no way that nodes sharing a dominator could

200

# 'tie' for gdfo

201

if other_node.gdfo > node.gdfo:

202

# The other node has this node as an ancestor

203

keys_to_remove.append(node.key)

204

else:

205

# Replace the other node, and set this as the new key

206

keys_to_remove.append(other_node.key)

207

dom_to_node[node.linear_dominator] = node

208

else:

209

dom_to_node[node.linear_dominator] = node

210

for key in keys_to_remove:

211

candidate_nodes.pop(key)

212

return dom_to_node

pending.append(node)

100

101

while pending:

102

node = pending.pop()

103

for child_key in node.child_keys:

104

child = nodes[child_key]

105

if child_key in known_parent_gdfos:

106

known_gdfo = known_parent_gdfos[child_key] + 1

107

present = True

108

else:

109

known_gdfo = 1

110

present = False

111

if child.gdfo is None or node.gdfo + 1 > child.gdfo:

112

child.gdfo = node.gdfo + 1

113

if known_gdfo == len(child.parent_keys):

114

# We are the last parent updating that node, we can

115

# continue from there

116

pending.append(child)

117

if present:

118

del known_parent_gdfos[child_key]

119

else:

120

# Update known_parent_gdfos for a key we couldn't process

121

known_parent_gdfos[child_key] = known_gdfo

213

122

214

123

def heads(self, keys):

215

124

"""Return the heads from amongst keys.

217

126

This is done by searching the ancestries of each key. Any key that is

218

127

reachable from another key is not returned; all the others are.

219

128

220

This operation scales with the relative depth between any two keys. If

221

any two keys are completely disconnected all ancestry of both sides

222

will be retrieved.

129

This operation scales with the relative depth between any two keys. It

130

uses gdfo to avoid walking all ancestry.

223

131

224

132

:param keys: An iterable of keys.

225

133

:return: A set of the heads. Note that as a set there is no ordering

231

139

# NULL_REVISION is only a head if it is the only entry

232

140

candidate_nodes.pop(revision.NULL_REVISION)

233

141

if not candidate_nodes:

234

return set([revision.NULL_REVISION])

142

return frozenset([revision.NULL_REVISION])

235

143

if len(candidate_nodes) < 2:

144

# No or only one candidate

236

145

return frozenset(candidate_nodes)

237

146

heads_key = frozenset(candidate_nodes)

238

147

if heads_key != frozenset(keys):

148

# Mention duplicates

239

149

note('%s != %s', heads_key, frozenset(keys))

150

# Do we have a cached result ?

240

151

try:

241

152

heads = self._known_heads[heads_key]

242

153

return heads

243

154

except KeyError:

244

pass # compute it ourselves

245

dom_to_node = self._get_dominators_to_nodes(candidate_nodes)

246

if len(candidate_nodes) < 2:

247

# We shrunk candidate_nodes and determined a new head

248

return frozenset(candidate_nodes)

249

dom_heads_key = None

250

# Check the linear dominators of these keys, to see if we already

251

# know the heads answer

252

dom_heads_key = frozenset([node.linear_dominator

253

for node in candidate_nodes.itervalues()])

254

if dom_heads_key in self._known_heads:

255

# map back into the original keys

256

heads = self._known_heads[dom_heads_key]

257

heads = frozenset([dom_to_node[key].key for key in heads])

258

return heads

259

heads = self._heads_from_candidate_nodes(candidate_nodes, dom_to_node)

155

pass

156

# Let's compute the heads

157

seen = set()

158

pending = []

159

min_gdfo = None

160

for node in candidate_nodes.values():

161

if node.parent_keys:

162

pending.extend(node.parent_keys)

163

if min_gdfo is None or node.gdfo < min_gdfo:

164

min_gdfo = node.gdfo

165

nodes = self._nodes

166

while pending:

167

node_key = pending.pop()

168

if node_key in seen:

169

# node already appears in some ancestry

170

continue

171

seen.add(node_key)

172

node = nodes[node_key]

173

if node.gdfo <= min_gdfo:

174

continue

175

if node.parent_keys:

176

pending.extend(node.parent_keys)

177

heads = heads_key.difference(seen)

260

178

if self.do_cache:

261

179

self._known_heads[heads_key] = heads

262

# Cache the dominator heads

263

if dom_heads_key is not None:

264

dom_heads = frozenset([candidate_nodes[key].linear_dominator

265

for key in heads])

266

self._known_heads[dom_heads_key] = dom_heads

267

180

return heads

268

181

269

def _heads_from_candidate_nodes(self, candidate_nodes, dom_to_node):

270

queue = []

271

to_cleanup = []

272

to_cleanup_append = to_cleanup.append

273

for node in candidate_nodes.itervalues():

274

node.ancestor_of = (node.key,)

275

queue.append((-node.gdfo, node))

276

to_cleanup_append(node)

277

heapq.heapify(queue)

278

# These are nodes that we determined are 'common' that we are no longer

279

# walking

280

# Now we walk nodes until all nodes that are being walked are 'common'

281

num_candidates = len(candidate_nodes)

282

nodes = self._nodes

283

heappop = heapq.heappop

284

heappush = heapq.heappush

285

while queue and len(candidate_nodes) > 1:

286

_, node = heappop(queue)

287

next_ancestor_of = node.ancestor_of

288

if len(next_ancestor_of) == num_candidates:

289

# This node is now considered 'common'

290

# Make sure all parent nodes are marked as such

291

for parent_key in node.parent_keys:

292

parent_node = nodes[parent_key]

293

if parent_node.ancestor_of is not None:

294

parent_node.ancestor_of = next_ancestor_of

295

if node.linear_dominator != node.key:

296

parent_node = nodes[node.linear_dominator]

297

if parent_node.ancestor_of is not None:

298

parent_node.ancestor_of = next_ancestor_of

299

continue

300

if node.parent_keys is None:

301

# This is a ghost

302

continue

303

# Now project the current nodes ancestor list to the parent nodes,

304

# and queue them up to be walked

305

# Note: using linear_dominator speeds things up quite a bit

306

# enough that we actually start to be slightly faster

307

# than the default heads() implementation

308

if node.linear_dominator != node.key:

309

# We are at the tip of a long linear region

310

# We know that there is nothing between here and the tail

311

# that is interesting, so skip to the end

312

parent_keys = [node.linear_dominator]

313

else:

314

parent_keys = node.parent_keys

315

for parent_key in parent_keys:

316

if parent_key in candidate_nodes:

317

candidate_nodes.pop(parent_key)

318

if len(candidate_nodes) <= 1:

319

break

320

elif parent_key in dom_to_node:

321

orig_node = dom_to_node[parent_key]

322

if orig_node is not node:

323

if orig_node.key in candidate_nodes:

324

candidate_nodes.pop(orig_node.key)

325

if len(candidate_nodes) <= 1:

326

break

327

parent_node = nodes[parent_key]

328

ancestor_of = parent_node.ancestor_of

329

if ancestor_of is None:

330

# This node hasn't been walked yet

331

parent_node.ancestor_of = next_ancestor_of

332

# Enqueue this node

333

heappush(queue, (-parent_node.gdfo, parent_node))

334

to_cleanup_append(parent_node)

335

elif ancestor_of != next_ancestor_of:

336

# Combine to get the full set of parents

337

all_ancestors = set(ancestor_of)

338

all_ancestors.update(next_ancestor_of)

339

parent_node.ancestor_of = tuple(sorted(all_ancestors))

340

def cleanup():

341

for node in to_cleanup:

342

node.ancestor_of = None

343

cleanup()

344

return frozenset(candidate_nodes)

Older »