1
# Copyright (C) 2007 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Helper functions for DirState.
19
This is the python implementation for DirState functions.
22
from bzrlib.dirstate import DirState
25
# Give Pyrex some function definitions for it to understand.
26
# All of these are just hints to Pyrex, so that it can try to convert python
27
# objects into similar C objects. (such as PyInt => int).
28
# In anything defined 'cdef extern from XXX' the real C header will be
29
# imported, and the real definition will be used from there. So these are just
30
# hints, and do not need to match exactly to the C definitions.
33
ctypedef unsigned long size_t
35
cdef extern from "stdint.h":
39
cdef extern from "stdlib.h":
40
unsigned long int strtoul(char *nptr, char **endptr, int base)
43
# These functions allow us access to a bit of the 'bare metal' of python
44
# objects, rather than going through the object abstraction. (For example,
45
# PyList_Append, rather than getting the 'append' attribute of the object, and
46
# creating a tuple, and then using PyCallObject).
47
# Functions that return (or take) a void* are meant to grab a C PyObject*. This
48
# differs from the Pyrex 'object'. If you declare a variable as 'object' Pyrex
49
# will automatically Py_INCREF and Py_DECREF when appropriate. But for some
50
# inner loops, we don't need to do that at all, as the reference only lasts for
52
cdef extern from "Python.h":
53
int PyList_Append(object lst, object item) except -1
54
void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
55
int PyList_CheckExact(object)
57
void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
59
char *PyString_AsString(object p)
60
char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
61
object PyString_FromString(char *)
62
object PyString_FromStringAndSize(char *, int)
63
int PyString_Size(object p)
64
int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
65
int PyString_CheckExact(object p)
68
cdef extern from "string.h":
69
int strncmp(char *s1, char *s2, int len)
70
void *memchr(void *s, int c, size_t len)
71
int memcmp(void *b1, void *b2, size_t len)
72
# ??? memrchr is a GNU extension :(
73
# void *memrchr(void *s, int c, size_t len)
76
cdef void* _my_memrchr(void *s, int c, size_t n):
77
# memrchr seems to be a GNU extension, so we have to implement it ourselves
90
def _py_memrchr(s, c):
91
"""Just to expose _my_memrchr for testing.
93
:param s: The Python string to search
94
:param c: The character to search for
95
:return: The offset to the last instance of 'c' in s
102
_s = PyString_AsString(s)
103
length = PyString_Size(s)
105
_c = PyString_AsString(c)
106
assert PyString_Size(c) == 1,\
107
'Must be a single character string, not %s' % (c,)
108
found = _my_memrchr(_s, _c[0], length)
114
cdef int _is_aligned(void *ptr):
115
"""Is this pointer aligned to an integer size offset?
117
:return: 1 if this pointer is aligned, 0 otherwise.
119
return ((<intptr_t>ptr) & ((sizeof(int))-1)) == 0
122
cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
123
cdef unsigned char *cur1
124
cdef unsigned char *cur2
125
cdef unsigned char *end1
126
cdef unsigned char *end2
132
if path1 == path2 and size1 == size2:
135
end1 = <unsigned char*>path1+size1
136
end2 = <unsigned char*>path2+size2
138
# Use 32-bit comparisons for the matching portion of the string.
139
# Almost all CPU's are faster at loading and comparing 32-bit integers,
140
# than they are at 8-bit integers.
141
# 99% of the time, these will be aligned, but in case they aren't just skip
143
if _is_aligned(path1) and _is_aligned(path2):
144
cur_int1 = <int*>path1
145
cur_int2 = <int*>path2
146
end_int1 = <int*>(path1 + size1 - (size1 % sizeof(int)))
147
end_int2 = <int*>(path2 + size2 - (size2 % sizeof(int)))
149
while cur_int1 < end_int1 and cur_int2 < end_int2:
150
if cur_int1[0] != cur_int2[0]:
152
cur_int1 = cur_int1 + 1
153
cur_int2 = cur_int2 + 1
155
cur1 = <unsigned char*>cur_int1
156
cur2 = <unsigned char*>cur_int2
158
cur1 = <unsigned char*>path1
159
cur2 = <unsigned char*>path2
161
while cur1 < end1 and cur2 < end2:
162
if cur1[0] == cur2[0]:
163
# This character matches, just go to the next one
167
# The current characters do not match
169
return -1 # Reached the end of path1 segment first
170
elif cur2[0] == c'/':
171
return 1 # Reached the end of path2 segment first
172
elif cur1[0] < cur2[0]:
177
# We reached the end of at least one of the strings
179
return 1 # Not at the end of cur1, must be at the end of cur2
181
return -1 # At the end of cur1, but not at cur2
182
# We reached the end of both strings
186
def cmp_by_dirs_c(path1, path2):
187
"""Compare two paths directory by directory.
189
This is equivalent to doing::
191
cmp(path1.split('/'), path2.split('/'))
193
The idea is that you should compare path components separately. This
194
differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
195
``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
197
:param path1: first path
198
:param path2: second path
199
:return: positive number if ``path1`` comes first,
200
0 if paths are equal,
201
and negative number if ``path2`` sorts first
203
if not PyString_CheckExact(path1):
204
raise TypeError("'path1' must be a plain string, not %s: %r"
205
% (type(path1), path1))
206
if not PyString_CheckExact(path2):
207
raise TypeError("'path2' must be a plain string, not %s: %r"
208
% (type(path2), path2))
209
return _cmp_by_dirs(PyString_AsString(path1),
210
PyString_Size(path1),
211
PyString_AsString(path2),
212
PyString_Size(path2))
215
def _cmp_path_by_dirblock_c(path1, path2):
216
"""Compare two paths based on what directory they are in.
218
This generates a sort order, such that all children of a directory are
219
sorted together, and grandchildren are in the same order as the
220
children appear. But all grandchildren come after all children.
222
In other words, all entries in a directory are sorted together, and
223
directorys are sorted in cmp_by_dirs order.
225
:param path1: first path
226
:param path2: the second path
227
:return: positive number if ``path1`` comes first,
229
and a negative number if ``path2`` sorts first
231
if not PyString_CheckExact(path1):
232
raise TypeError("'path1' must be a plain string, not %s: %r"
233
% (type(path1), path1))
234
if not PyString_CheckExact(path2):
235
raise TypeError("'path2' must be a plain string, not %s: %r"
236
% (type(path2), path2))
237
return _cmp_path_by_dirblock(PyString_AsString(path1),
238
PyString_Size(path1),
239
PyString_AsString(path2),
240
PyString_Size(path2))
243
cdef int _cmp_path_by_dirblock(char *path1, int path1_len,
244
char *path2, int path2_len):
245
"""Compare two paths by what directory they are in.
247
see ``_cmp_path_by_dirblock_c`` for details.
250
cdef int dirname1_len
252
cdef int dirname2_len
254
cdef int basename1_len
256
cdef int basename2_len
260
if path1_len == 0 and path2_len == 0:
263
if path1 == path2 and path1_len == path2_len:
272
basename1 = <char*>_my_memrchr(path1, c'/', path1_len)
274
if basename1 == NULL:
276
basename1_len = path1_len
281
dirname1_len = basename1 - path1
282
basename1 = basename1 + 1
283
basename1_len = path1_len - dirname1_len - 1
285
basename2 = <char*>_my_memrchr(path2, c'/', path2_len)
287
if basename2 == NULL:
289
basename2_len = path2_len
294
dirname2_len = basename2 - path2
295
basename2 = basename2 + 1
296
basename2_len = path2_len - dirname2_len - 1
298
cmp_val = _cmp_by_dirs(dirname1, dirname1_len,
299
dirname2, dirname2_len)
303
cur_len = basename1_len
304
if basename2_len < basename1_len:
305
cur_len = basename2_len
307
cmp_val = memcmp(basename1, basename2, cur_len)
310
if basename1_len == basename2_len:
312
if basename1_len < basename2_len:
317
def _bisect_path_left_c(paths, path):
318
"""Return the index where to insert path into paths.
320
This uses a path-wise comparison so we get::
330
:param paths: A list of paths to search through
331
:param path: A single path to insert
332
:return: An offset where 'path' can be inserted.
333
:seealso: bisect.bisect_left
344
if not PyList_CheckExact(paths):
345
raise TypeError("you must pass a python list for 'paths' not: %s %r"
346
% (type(paths), paths))
347
if not PyString_CheckExact(path):
348
raise TypeError("you must pass a string for 'path' not: %s %r"
349
% (type(path), path))
354
path_cstr = PyString_AsString(path)
355
path_size = PyString_Size(path)
358
_mid = (_lo + _hi) / 2
359
cur = PyList_GetItem_object_void(paths, _mid)
360
cur_cstr = PyString_AS_STRING_void(cur)
361
cur_size = PyString_GET_SIZE_void(cur)
362
if _cmp_path_by_dirblock(cur_cstr, cur_size, path_cstr, path_size) < 0:
369
def _bisect_path_right_c(paths, path):
370
"""Return the index where to insert path into paths.
372
This uses a path-wise comparison so we get::
382
:param paths: A list of paths to search through
383
:param path: A single path to insert
384
:return: An offset where 'path' can be inserted.
385
:seealso: bisect.bisect_right
396
if not PyList_CheckExact(paths):
397
raise TypeError("you must pass a python list for 'paths' not: %s %r"
398
% (type(paths), paths))
399
if not PyString_CheckExact(path):
400
raise TypeError("you must pass a string for 'path' not: %s %r"
401
% (type(path), path))
406
path_cstr = PyString_AsString(path)
407
path_size = PyString_Size(path)
410
_mid = (_lo + _hi) / 2
411
cur = PyList_GetItem_object_void(paths, _mid)
412
cur_cstr = PyString_AS_STRING_void(cur)
413
cur_size = PyString_GET_SIZE_void(cur)
414
if _cmp_path_by_dirblock(path_cstr, path_size, cur_cstr, cur_size) < 0:
421
def bisect_dirblock_c(dirblocks, dirname, lo=0, hi=None, cache=None):
422
"""Return the index where to insert dirname into the dirblocks.
424
The return value idx is such that all directories blocks in dirblock[:idx]
425
have names < dirname, and all blocks in dirblock[idx:] have names >=
428
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
429
slice of a to be searched.
434
cdef char *dirname_cstr
435
cdef int dirname_size
440
if not PyList_CheckExact(dirblocks):
441
raise TypeError("you must pass a python list for 'dirblocks' not: %s %r"
442
% (type(dirblocks), dirblocks))
443
if not PyString_CheckExact(dirname):
444
raise TypeError("you must pass a string for dirname not: %s %r"
445
% (type(dirname), dirname))
452
dirname_cstr = PyString_AsString(dirname)
453
dirname_size = PyString_Size(dirname)
456
_mid = (_lo + _hi) / 2
457
# Grab the dirname for the current dirblock
458
# cur = dirblocks[_mid][0]
459
cur = PyTuple_GetItem_void_void(
460
PyList_GetItem_object_void(dirblocks, _mid), 0)
461
cur_cstr = PyString_AS_STRING_void(cur)
462
cur_size = PyString_GET_SIZE_void(cur)
463
if _cmp_by_dirs(cur_cstr, cur_size, dirname_cstr, dirname_size) < 0:
471
"""Maintain the current location, and return fields as you parse them."""
473
cdef object text # The overall string object
474
cdef char *text_cstr # Pointer to the beginning of text
475
cdef int text_size # Length of text
477
cdef char *end_cstr # End of text
478
cdef char *cur_cstr # Pointer to the current record
479
cdef char *next # Pointer to the end of this record
481
def __new__(self, text):
483
self.text_cstr = PyString_AsString(text)
484
self.text_size = PyString_Size(text)
485
self.end_cstr = self.text_cstr + self.text_size
486
self.cur_cstr = self.text_cstr
488
cdef char *get_next(self, int *size):
489
"""Return a pointer to the start of the next field."""
492
self.cur_cstr = <char*>memchr(next, c'\0', self.end_cstr-next)
493
size[0] = self.cur_cstr - next
494
self.cur_cstr = self.cur_cstr + 1
497
cdef object get_next_str(self):
498
"""Get the next field as a Python string."""
501
next = self.get_next(&size)
502
return PyString_FromStringAndSize(next, size)
504
cdef int _init(self) except -1:
505
"""Get the pointer ready.
507
This assumes that the dirstate header has already been read, and we
508
already have the dirblock string loaded into memory.
509
This just initializes our memory pointers, etc for parsing of the
514
# The first field should be an empty string left over from the Header
515
first = self.get_next(&size)
516
if first[0] != c'\0' and size == 0:
517
raise AssertionError('First character should be null not: %s'
521
cdef object _get_entry(self, int num_trees, void **p_current_dirname,
523
"""Extract the next entry.
525
This parses the next entry based on the current location in
527
Each entry can be considered a "row" in the total table. And each row
528
has a fixed number of columns. It is generally broken up into "key"
529
columns, then "current" columns, and then "parent" columns.
531
:param num_trees: How many parent trees need to be parsed
532
:param p_current_dirname: A pointer to the current PyString
533
representing the directory name.
534
We pass this in as a void * so that pyrex doesn't have to
535
increment/decrement the PyObject reference counter for each
537
We use a pointer so that _get_entry can update it with the new
539
:param new_block: This is to let the caller know that it needs to
540
create a new directory block to store the next entry.
542
cdef object path_name_file_id_key
543
cdef char *entry_size_cstr
544
cdef unsigned long int entry_size
545
cdef char* executable_cstr
546
cdef int is_executable
547
cdef char* dirname_cstr
552
cdef object fingerprint
555
# Read the 'key' information (dirname, name, file_id)
556
dirname_cstr = self.get_next(&cur_size)
557
# Check to see if we have started a new directory block.
558
# If so, then we need to create a new dirname PyString, so that it can
559
# be used in all of the tuples. This saves time and memory, by re-using
560
# the same object repeatedly.
562
# Do the cheap 'length of string' check first. If the string is a
563
# different length, then we *have* to be a different directory.
564
if (cur_size != PyString_GET_SIZE_void(p_current_dirname[0])
565
or strncmp(dirname_cstr,
566
# Extract the char* from our current dirname string. We
567
# know it is a PyString, so we can use
568
# PyString_AS_STRING, we use the _void version because
569
# we are tricking Pyrex by using a void* rather than an
571
PyString_AS_STRING_void(p_current_dirname[0]),
573
dirname = PyString_FromStringAndSize(dirname_cstr, cur_size)
574
p_current_dirname[0] = <void*>dirname
579
# Build up the key that will be used.
580
# By using <object>(void *) Pyrex will automatically handle the
581
# Py_INCREF that we need.
582
path_name_file_id_key = (<object>p_current_dirname[0],
587
# Parse all of the per-tree information. current has the information in
588
# the same location as parent trees. The only difference is that 'info'
589
# is a 'packed_stat' for current, while it is a 'revision_id' for
591
# minikind, fingerprint, and info will be returned as regular python
593
# entry_size and is_executable will be parsed into a python Long and
594
# python Boolean, respectively.
595
# TODO: jam 20070718 Consider changin the entry_size conversion to
596
# prefer python Int when possible. They are generally faster to
597
# work with, and it will be rare that we have a file >2GB.
598
# Especially since this code is pretty much fixed at a max of
601
for i from 0 <= i < num_trees:
602
minikind = self.get_next_str()
603
fingerprint = self.get_next_str()
604
entry_size_cstr = self.get_next(&cur_size)
605
entry_size = strtoul(entry_size_cstr, NULL, 10)
606
executable_cstr = self.get_next(&cur_size)
607
is_executable = (executable_cstr[0] == c'y')
608
info = self.get_next_str()
609
PyList_Append(trees, (
611
fingerprint, # fingerprint
613
is_executable,# executable
614
info, # packed_stat or revision_id
617
# The returned tuple is (key, [trees])
618
ret = (path_name_file_id_key, trees)
619
# Ignore the trailing newline, but assert that it does exist, this
620
# ensures that we always finish parsing a line on an end-of-entry
622
trailing = self.get_next(&cur_size)
623
if cur_size != 1 or trailing[0] != c'\n':
624
raise AssertionError(
625
'Bad parse, we expected to end on \\n, not: %d %s: %s'
626
% (cur_size, PyString_FromStringAndSize(trailing, cur_size),
630
def _parse_dirblocks(self, state):
631
"""Parse all dirblocks in the state file."""
633
cdef object current_block
635
cdef void * current_dirname
637
cdef int expected_entry_count
640
num_trees = state._num_present_parents() + 1
641
expected_entry_count = state._num_entries
643
# Ignore the first record
647
state._dirblocks = [('', current_block), ('', [])]
649
current_dirname = <void*>obj
653
# TODO: jam 2007-05-07 Consider pre-allocating some space for the
654
# members, and then growing and shrinking from there. If most
655
# directories have close to 10 entries in them, it would save a
656
# few mallocs if we default our list size to something
657
# reasonable. Or we could malloc it to something large (100 or
658
# so), and then truncate. That would give us a malloc + realloc,
659
# rather than lots of reallocs.
660
while self.cur_cstr < self.end_cstr:
661
entry = self._get_entry(num_trees, ¤t_dirname, &new_block)
663
# new block - different dirname
665
PyList_Append(state._dirblocks,
666
(<object>current_dirname, current_block))
667
PyList_Append(current_block, entry)
668
entry_count = entry_count + 1
669
if entry_count != expected_entry_count:
670
raise AssertionError('We read the wrong number of entries.'
671
' We expected to read %s, but read %s'
672
% (expected_entry_count, entry_count))
673
state._split_root_dirblock_into_contents()
676
def _read_dirblocks_c(state):
677
"""Read in the dirblocks for the given DirState object.
679
This is tightly bound to the DirState internal representation. It should be
680
thought of as a member function, which is only separated out so that we can
681
re-write it in pyrex.
683
:param state: A DirState object.
685
:postcondition: The dirblocks will be loaded into the appropriate fields in
688
state._state_file.seek(state._end_of_header)
689
text = state._state_file.read()
690
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
692
reader = Reader(text)
694
reader._parse_dirblocks(state)
695
state._dirblock_state = DirState.IN_MEMORY_UNMODIFIED