1
# Copyright (C) 2007, 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Helper functions for DirState.
19
This is the python implementation for DirState functions.
22
from bzrlib import errors
23
from bzrlib.dirstate import DirState
26
# Give Pyrex some function definitions for it to understand.
27
# All of these are just hints to Pyrex, so that it can try to convert python
28
# objects into similar C objects. (such as PyInt => int).
29
# In anything defined 'cdef extern from XXX' the real C header will be
30
# imported, and the real definition will be used from there. So these are just
31
# hints, and do not need to match exactly to the C definitions.
34
ctypedef unsigned long size_t
36
cdef extern from "_dirstate_helpers_c.h":
40
cdef extern from "stdlib.h":
41
unsigned long int strtoul(char *nptr, char **endptr, int base)
44
# These functions allow us access to a bit of the 'bare metal' of python
45
# objects, rather than going through the object abstraction. (For example,
46
# PyList_Append, rather than getting the 'append' attribute of the object, and
47
# creating a tuple, and then using PyCallObject).
48
# Functions that return (or take) a void* are meant to grab a C PyObject*. This
49
# differs from the Pyrex 'object'. If you declare a variable as 'object' Pyrex
50
# will automatically Py_INCREF and Py_DECREF when appropriate. But for some
51
# inner loops, we don't need to do that at all, as the reference only lasts for
53
cdef extern from "Python.h":
54
ctypedef int Py_ssize_t
55
int PyList_Append(object lst, object item) except -1
56
void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
57
int PyList_CheckExact(object)
59
void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
61
char *PyString_AsString(object p)
62
char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
63
object PyString_FromString(char *)
64
object PyString_FromStringAndSize(char *, Py_ssize_t)
65
int PyString_Size(object p)
66
int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
67
int PyString_CheckExact(object p)
70
cdef extern from "string.h":
71
int strncmp(char *s1, char *s2, int len)
72
void *memchr(void *s, int c, size_t len)
73
int memcmp(void *b1, void *b2, size_t len)
74
# ??? memrchr is a GNU extension :(
75
# void *memrchr(void *s, int c, size_t len)
78
cdef void* _my_memrchr(void *s, int c, size_t n):
79
# memrchr seems to be a GNU extension, so we have to implement it ourselves
92
def _py_memrchr(s, c):
93
"""Just to expose _my_memrchr for testing.
95
:param s: The Python string to search
96
:param c: The character to search for
97
:return: The offset to the last instance of 'c' in s
104
_s = PyString_AsString(s)
105
length = PyString_Size(s)
107
_c = PyString_AsString(c)
108
assert PyString_Size(c) == 1,\
109
'Must be a single character string, not %s' % (c,)
110
found = _my_memrchr(_s, _c[0], length)
113
return <char*>found - <char*>_s
115
cdef object safe_string_from_size(char *s, Py_ssize_t size):
117
raise AssertionError(
118
'tried to create a string with an invalid size: %d @0x%x'
120
return PyString_FromStringAndSize(s, size)
123
cdef int _is_aligned(void *ptr):
124
"""Is this pointer aligned to an integer size offset?
126
:return: 1 if this pointer is aligned, 0 otherwise.
128
return ((<intptr_t>ptr) & ((sizeof(int))-1)) == 0
131
cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
132
cdef unsigned char *cur1
133
cdef unsigned char *cur2
134
cdef unsigned char *end1
135
cdef unsigned char *end2
141
if path1 == path2 and size1 == size2:
144
end1 = <unsigned char*>path1+size1
145
end2 = <unsigned char*>path2+size2
147
# Use 32-bit comparisons for the matching portion of the string.
148
# Almost all CPU's are faster at loading and comparing 32-bit integers,
149
# than they are at 8-bit integers.
150
# 99% of the time, these will be aligned, but in case they aren't just skip
152
if _is_aligned(path1) and _is_aligned(path2):
153
cur_int1 = <int*>path1
154
cur_int2 = <int*>path2
155
end_int1 = <int*>(path1 + size1 - (size1 % sizeof(int)))
156
end_int2 = <int*>(path2 + size2 - (size2 % sizeof(int)))
158
while cur_int1 < end_int1 and cur_int2 < end_int2:
159
if cur_int1[0] != cur_int2[0]:
161
cur_int1 = cur_int1 + 1
162
cur_int2 = cur_int2 + 1
164
cur1 = <unsigned char*>cur_int1
165
cur2 = <unsigned char*>cur_int2
167
cur1 = <unsigned char*>path1
168
cur2 = <unsigned char*>path2
170
while cur1 < end1 and cur2 < end2:
171
if cur1[0] == cur2[0]:
172
# This character matches, just go to the next one
176
# The current characters do not match
178
return -1 # Reached the end of path1 segment first
179
elif cur2[0] == c'/':
180
return 1 # Reached the end of path2 segment first
181
elif cur1[0] < cur2[0]:
186
# We reached the end of at least one of the strings
188
return 1 # Not at the end of cur1, must be at the end of cur2
190
return -1 # At the end of cur1, but not at cur2
191
# We reached the end of both strings
195
def cmp_by_dirs_c(path1, path2):
196
"""Compare two paths directory by directory.
198
This is equivalent to doing::
200
cmp(path1.split('/'), path2.split('/'))
202
The idea is that you should compare path components separately. This
203
differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
204
``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
206
:param path1: first path
207
:param path2: second path
208
:return: negative number if ``path1`` comes first,
209
0 if paths are equal,
210
and positive number if ``path2`` sorts first
212
if not PyString_CheckExact(path1):
213
raise TypeError("'path1' must be a plain string, not %s: %r"
214
% (type(path1), path1))
215
if not PyString_CheckExact(path2):
216
raise TypeError("'path2' must be a plain string, not %s: %r"
217
% (type(path2), path2))
218
return _cmp_by_dirs(PyString_AsString(path1),
219
PyString_Size(path1),
220
PyString_AsString(path2),
221
PyString_Size(path2))
224
def _cmp_path_by_dirblock_c(path1, path2):
225
"""Compare two paths based on what directory they are in.
227
This generates a sort order, such that all children of a directory are
228
sorted together, and grandchildren are in the same order as the
229
children appear. But all grandchildren come after all children.
231
In other words, all entries in a directory are sorted together, and
232
directorys are sorted in cmp_by_dirs order.
234
:param path1: first path
235
:param path2: the second path
236
:return: negative number if ``path1`` comes first,
238
and a positive number if ``path2`` sorts first
240
if not PyString_CheckExact(path1):
241
raise TypeError("'path1' must be a plain string, not %s: %r"
242
% (type(path1), path1))
243
if not PyString_CheckExact(path2):
244
raise TypeError("'path2' must be a plain string, not %s: %r"
245
% (type(path2), path2))
246
return _cmp_path_by_dirblock(PyString_AsString(path1),
247
PyString_Size(path1),
248
PyString_AsString(path2),
249
PyString_Size(path2))
252
cdef int _cmp_path_by_dirblock(char *path1, int path1_len,
253
char *path2, int path2_len):
254
"""Compare two paths by what directory they are in.
256
see ``_cmp_path_by_dirblock_c`` for details.
259
cdef int dirname1_len
261
cdef int dirname2_len
263
cdef int basename1_len
265
cdef int basename2_len
269
if path1_len == 0 and path2_len == 0:
272
if path1 == path2 and path1_len == path2_len:
281
basename1 = <char*>_my_memrchr(path1, c'/', path1_len)
283
if basename1 == NULL:
285
basename1_len = path1_len
290
dirname1_len = basename1 - path1
291
basename1 = basename1 + 1
292
basename1_len = path1_len - dirname1_len - 1
294
basename2 = <char*>_my_memrchr(path2, c'/', path2_len)
296
if basename2 == NULL:
298
basename2_len = path2_len
303
dirname2_len = basename2 - path2
304
basename2 = basename2 + 1
305
basename2_len = path2_len - dirname2_len - 1
307
cmp_val = _cmp_by_dirs(dirname1, dirname1_len,
308
dirname2, dirname2_len)
312
cur_len = basename1_len
313
if basename2_len < basename1_len:
314
cur_len = basename2_len
316
cmp_val = memcmp(basename1, basename2, cur_len)
319
if basename1_len == basename2_len:
321
if basename1_len < basename2_len:
326
def _bisect_path_left_c(paths, path):
327
"""Return the index where to insert path into paths.
329
This uses a path-wise comparison so we get::
339
:param paths: A list of paths to search through
340
:param path: A single path to insert
341
:return: An offset where 'path' can be inserted.
342
:seealso: bisect.bisect_left
353
if not PyList_CheckExact(paths):
354
raise TypeError("you must pass a python list for 'paths' not: %s %r"
355
% (type(paths), paths))
356
if not PyString_CheckExact(path):
357
raise TypeError("you must pass a string for 'path' not: %s %r"
358
% (type(path), path))
363
path_cstr = PyString_AsString(path)
364
path_size = PyString_Size(path)
367
_mid = (_lo + _hi) / 2
368
cur = PyList_GetItem_object_void(paths, _mid)
369
cur_cstr = PyString_AS_STRING_void(cur)
370
cur_size = PyString_GET_SIZE_void(cur)
371
if _cmp_path_by_dirblock(cur_cstr, cur_size, path_cstr, path_size) < 0:
378
def _bisect_path_right_c(paths, path):
379
"""Return the index where to insert path into paths.
381
This uses a path-wise comparison so we get::
391
:param paths: A list of paths to search through
392
:param path: A single path to insert
393
:return: An offset where 'path' can be inserted.
394
:seealso: bisect.bisect_right
405
if not PyList_CheckExact(paths):
406
raise TypeError("you must pass a python list for 'paths' not: %s %r"
407
% (type(paths), paths))
408
if not PyString_CheckExact(path):
409
raise TypeError("you must pass a string for 'path' not: %s %r"
410
% (type(path), path))
415
path_cstr = PyString_AsString(path)
416
path_size = PyString_Size(path)
419
_mid = (_lo + _hi) / 2
420
cur = PyList_GetItem_object_void(paths, _mid)
421
cur_cstr = PyString_AS_STRING_void(cur)
422
cur_size = PyString_GET_SIZE_void(cur)
423
if _cmp_path_by_dirblock(path_cstr, path_size, cur_cstr, cur_size) < 0:
430
def bisect_dirblock_c(dirblocks, dirname, lo=0, hi=None, cache=None):
431
"""Return the index where to insert dirname into the dirblocks.
433
The return value idx is such that all directories blocks in dirblock[:idx]
434
have names < dirname, and all blocks in dirblock[idx:] have names >=
437
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
438
slice of a to be searched.
443
cdef char *dirname_cstr
444
cdef int dirname_size
449
if not PyList_CheckExact(dirblocks):
450
raise TypeError("you must pass a python list for 'dirblocks' not: %s %r"
451
% (type(dirblocks), dirblocks))
452
if not PyString_CheckExact(dirname):
453
raise TypeError("you must pass a string for dirname not: %s %r"
454
% (type(dirname), dirname))
461
dirname_cstr = PyString_AsString(dirname)
462
dirname_size = PyString_Size(dirname)
465
_mid = (_lo + _hi) / 2
466
# Grab the dirname for the current dirblock
467
# cur = dirblocks[_mid][0]
468
cur = PyTuple_GetItem_void_void(
469
PyList_GetItem_object_void(dirblocks, _mid), 0)
470
cur_cstr = PyString_AS_STRING_void(cur)
471
cur_size = PyString_GET_SIZE_void(cur)
472
if _cmp_by_dirs(cur_cstr, cur_size, dirname_cstr, dirname_size) < 0:
480
"""Maintain the current location, and return fields as you parse them."""
482
cdef object state # The DirState object
483
cdef object text # The overall string object
484
cdef char *text_cstr # Pointer to the beginning of text
485
cdef int text_size # Length of text
487
cdef char *end_cstr # End of text
488
cdef char *cur_cstr # Pointer to the current record
489
cdef char *next # Pointer to the end of this record
491
def __init__(self, text, state):
494
self.text_cstr = PyString_AsString(text)
495
self.text_size = PyString_Size(text)
496
self.end_cstr = self.text_cstr + self.text_size
497
self.cur_cstr = self.text_cstr
499
cdef char *get_next(self, int *size) except NULL:
500
"""Return a pointer to the start of the next field."""
502
cdef Py_ssize_t extra_len
504
if self.cur_cstr == NULL:
505
raise AssertionError('get_next() called when cur_str is NULL')
506
elif self.cur_cstr >= self.end_cstr:
507
raise AssertionError('get_next() called when there are no chars'
510
self.cur_cstr = <char*>memchr(next, c'\0', self.end_cstr - next)
511
if self.cur_cstr == NULL:
512
extra_len = self.end_cstr - next
513
raise errors.DirstateCorrupt(self.state,
514
'failed to find trailing NULL (\\0).'
515
' Trailing garbage: %r'
516
% safe_string_from_size(next, extra_len))
517
size[0] = self.cur_cstr - next
518
self.cur_cstr = self.cur_cstr + 1
521
cdef object get_next_str(self):
522
"""Get the next field as a Python string."""
525
next = self.get_next(&size)
526
return safe_string_from_size(next, size)
528
cdef int _init(self) except -1:
529
"""Get the pointer ready.
531
This assumes that the dirstate header has already been read, and we
532
already have the dirblock string loaded into memory.
533
This just initializes our memory pointers, etc for parsing of the
538
# The first field should be an empty string left over from the Header
539
first = self.get_next(&size)
540
if first[0] != c'\0' and size == 0:
541
raise AssertionError('First character should be null not: %s'
545
cdef object _get_entry(self, int num_trees, void **p_current_dirname,
547
"""Extract the next entry.
549
This parses the next entry based on the current location in
551
Each entry can be considered a "row" in the total table. And each row
552
has a fixed number of columns. It is generally broken up into "key"
553
columns, then "current" columns, and then "parent" columns.
555
:param num_trees: How many parent trees need to be parsed
556
:param p_current_dirname: A pointer to the current PyString
557
representing the directory name.
558
We pass this in as a void * so that pyrex doesn't have to
559
increment/decrement the PyObject reference counter for each
561
We use a pointer so that _get_entry can update it with the new
563
:param new_block: This is to let the caller know that it needs to
564
create a new directory block to store the next entry.
566
cdef object path_name_file_id_key
567
cdef char *entry_size_cstr
568
cdef unsigned long int entry_size
569
cdef char* executable_cstr
570
cdef int is_executable
571
cdef char* dirname_cstr
576
cdef object fingerprint
579
# Read the 'key' information (dirname, name, file_id)
580
dirname_cstr = self.get_next(&cur_size)
581
# Check to see if we have started a new directory block.
582
# If so, then we need to create a new dirname PyString, so that it can
583
# be used in all of the tuples. This saves time and memory, by re-using
584
# the same object repeatedly.
586
# Do the cheap 'length of string' check first. If the string is a
587
# different length, then we *have* to be a different directory.
588
if (cur_size != PyString_GET_SIZE_void(p_current_dirname[0])
589
or strncmp(dirname_cstr,
590
# Extract the char* from our current dirname string. We
591
# know it is a PyString, so we can use
592
# PyString_AS_STRING, we use the _void version because
593
# we are tricking Pyrex by using a void* rather than an
595
PyString_AS_STRING_void(p_current_dirname[0]),
597
dirname = safe_string_from_size(dirname_cstr, cur_size)
598
p_current_dirname[0] = <void*>dirname
603
# Build up the key that will be used.
604
# By using <object>(void *) Pyrex will automatically handle the
605
# Py_INCREF that we need.
606
path_name_file_id_key = (<object>p_current_dirname[0],
611
# Parse all of the per-tree information. current has the information in
612
# the same location as parent trees. The only difference is that 'info'
613
# is a 'packed_stat' for current, while it is a 'revision_id' for
615
# minikind, fingerprint, and info will be returned as regular python
617
# entry_size and is_executable will be parsed into a python Long and
618
# python Boolean, respectively.
619
# TODO: jam 20070718 Consider changin the entry_size conversion to
620
# prefer python Int when possible. They are generally faster to
621
# work with, and it will be rare that we have a file >2GB.
622
# Especially since this code is pretty much fixed at a max of
625
for i from 0 <= i < num_trees:
626
minikind = self.get_next_str()
627
fingerprint = self.get_next_str()
628
entry_size_cstr = self.get_next(&cur_size)
629
entry_size = strtoul(entry_size_cstr, NULL, 10)
630
executable_cstr = self.get_next(&cur_size)
631
is_executable = (executable_cstr[0] == c'y')
632
info = self.get_next_str()
633
PyList_Append(trees, (
635
fingerprint, # fingerprint
637
is_executable,# executable
638
info, # packed_stat or revision_id
641
# The returned tuple is (key, [trees])
642
ret = (path_name_file_id_key, trees)
643
# Ignore the trailing newline, but assert that it does exist, this
644
# ensures that we always finish parsing a line on an end-of-entry
646
trailing = self.get_next(&cur_size)
647
if cur_size != 1 or trailing[0] != c'\n':
648
raise errors.DirstateCorrupt(self.state,
649
'Bad parse, we expected to end on \\n, not: %d %s: %s'
650
% (cur_size, safe_string_from_size(trailing, cur_size),
654
def _parse_dirblocks(self):
655
"""Parse all dirblocks in the state file."""
657
cdef object current_block
659
cdef void * current_dirname
661
cdef int expected_entry_count
664
num_trees = self.state._num_present_parents() + 1
665
expected_entry_count = self.state._num_entries
667
# Ignore the first record
671
dirblocks = [('', current_block), ('', [])]
672
self.state._dirblocks = dirblocks
674
current_dirname = <void*>obj
678
# TODO: jam 2007-05-07 Consider pre-allocating some space for the
679
# members, and then growing and shrinking from there. If most
680
# directories have close to 10 entries in them, it would save a
681
# few mallocs if we default our list size to something
682
# reasonable. Or we could malloc it to something large (100 or
683
# so), and then truncate. That would give us a malloc + realloc,
684
# rather than lots of reallocs.
685
while self.cur_cstr < self.end_cstr:
686
entry = self._get_entry(num_trees, ¤t_dirname, &new_block)
688
# new block - different dirname
690
PyList_Append(dirblocks,
691
(<object>current_dirname, current_block))
692
PyList_Append(current_block, entry)
693
entry_count = entry_count + 1
694
if entry_count != expected_entry_count:
695
raise errors.DirstateCorrupt(self.state,
696
'We read the wrong number of entries.'
697
' We expected to read %s, but read %s'
698
% (expected_entry_count, entry_count))
699
self.state._split_root_dirblock_into_contents()
702
def _read_dirblocks_c(state):
703
"""Read in the dirblocks for the given DirState object.
705
This is tightly bound to the DirState internal representation. It should be
706
thought of as a member function, which is only separated out so that we can
707
re-write it in pyrex.
709
:param state: A DirState object.
711
:postcondition: The dirblocks will be loaded into the appropriate fields in
714
state._state_file.seek(state._end_of_header)
715
text = state._state_file.read()
716
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
718
reader = Reader(text, state)
720
reader._parse_dirblocks()
721
state._dirblock_state = DirState.IN_MEMORY_UNMODIFIED