2135.2.1
by Kent Gibson
Added glob module to replace broken fnmatch based ignore pattern matching (#57637) |
1 |
# Copyright (C) 2006 Canonical Ltd
|
2 |
||
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
16 |
||
17 |
"""Tools for converting globs to regular expressions.
|
|
18 |
||
19 |
This module provides functions for converting shell-like globs to regular
|
|
20 |
expressions.
|
|
21 |
"""
|
|
22 |
||
23 |
import re |
|
24 |
||
25 |
from bzrlib.trace import ( |
|
26 |
warning
|
|
27 |
)
|
|
28 |
||
29 |
||
30 |
class Replacer(object):
    """Do a multiple-pattern substitution.

    The patterns and substitutions are combined into one regular expression,
    so the result of one replacement is never substituted again.  Add the
    patterns and replacements via the add method and then call the object
    with the text to transform.  The patterns must not contain capturing
    groups: each pattern is wrapped in its own group and the index of the
    group that matched is used to look up the replacement.
    """

    # Matches the literal two-character marker \& inside a string
    # replacement template.  Written as an escaped (non-raw) literal so the
    # value is the same whichever string-literal rules the interpreter uses.
    _expand = re.compile(u'\\\\&')

    def __init__(self, source=None):
        """Create a replacer, optionally copying the rules of another one.

        :param source: An existing Replacer whose patterns and replacements
            are copied (the lists are duplicated, not shared), or None to
            start with no rules.
        """
        # Compiled combined pattern; built lazily on first call.
        self._pat = None
        if source:
            self._pats = list(source._pats)
            self._funs = list(source._funs)
        else:
            self._pats = []
            self._funs = []

    def add(self, pat, fun):
        r"""Add a pattern and replacement.

        The pattern must not contain capturing groups.
        The replacement might be either a string template in which \& will be
        replaced with the match, or a function that will get the matching text
        as argument. It does not get match object, because capturing is
        forbidden anyway.
        """
        # Invalidate the cached combined pattern.
        self._pat = None
        self._pats.append(pat)
        self._funs.append(fun)

    def add_replacer(self, replacer):
        r"""Add all patterns from another replacer.

        All patterns and replacements from replacer are appended to the ones
        already defined.
        """
        # Invalidate the cached combined pattern.
        self._pat = None
        self._pats.extend(replacer._pats)
        self._funs.extend(replacer._funs)

    def __call__(self, text):
        """Return text with every registered pattern substituted.

        :param text: The string to transform.
        :return: The transformed string.  With no rules registered the text
            is returned unchanged.
        """
        if not self._pats:
            # An empty alternation would match the empty string everywhere
            # with no group set, leaving _do_sub nothing to look up.
            return text
        if self._pat is None:
            # One capturing group per rule, in insertion order, so that
            # lastindex identifies which rule matched.
            self._pat = re.compile(
                u'|'.join([u'(%s)' % p for p in self._pats]),
                re.UNICODE)
        return self._pat.sub(self._do_sub, text)

    def _do_sub(self, m):
        """Compute the replacement for a single match of the combined pattern.

        :param m: The match object produced by the combined pattern.
        :return: The replacement text for the matched span.
        """
        fun = self._funs[m.lastindex - 1]
        matched = m.group(0)
        if callable(fun):
            return fun(matched)
        # Insert the matched text verbatim wherever the template contains \&.
        # A function replacement is used on purpose: handing the matched text
        # to re.sub as a template string would make it interpret backslash
        # sequences found in the match (group references, \n, doubled
        # backslashes) instead of copying them.
        return self._expand.sub(lambda _marker: matched, fun)
|
86 |
||
87 |
||
88 |
_sub_named = Replacer() |
|
89 |
_sub_named.add(ur'\[:digit:\]', ur'\d') |
|
90 |
_sub_named.add(ur'\[:space:\]', ur'\s') |
|
91 |
_sub_named.add(ur'\[:alnum:\]', ur'\w') |
|
92 |
_sub_named.add(ur'\[:ascii:\]', ur'\0-\x7f') |
|
93 |
_sub_named.add(ur'\[:blank:\]', ur' \t') |
|
94 |
_sub_named.add(ur'\[:cntrl:\]', ur'\0-\x1f\x7f-\x9f') |
|
95 |
||
96 |
||
97 |
def _sub_group(m):
    """Translate a matched glob character group into a regex character class.

    :param m: The matched text, including the enclosing square brackets.
    :return: The equivalent regex character class.  A leading '!' or '^'
        inside the brackets negates the class; named classes inside the
        group are expanded with _sub_named.
    """
    negated = m[1] in (u'!', u'^')
    if negated:
        opener, inner = u'[^', m[2:-1]
    else:
        opener, inner = u'[', m[1:-1]
    return opener + _sub_named(inner) + u']'
|
101 |
||
102 |
||
103 |
def _invalid_regex(repl): |
|
104 |
def _(m): |
|
2135.2.7
by Kent Gibson
Implement JAM's review suggestions. |
105 |
warning(u"'%s' not allowed within a regular expression. " |
106 |
"Replacing with '%s'" % (m, repl)) |
|
2135.2.1
by Kent Gibson
Added glob module to replace broken fnmatch based ignore pattern matching (#57637) |
107 |
return repl |
108 |
return _ |
|
109 |
||
110 |
||
2298.8.1
by Kent Gibson
Normalise ignore patterns to use '/' path separator. |
111 |
def _trailing_backslashes_regex(m): |
2298.8.2
by Kent Gibson
Review fixes for lp86451 patch. |
112 |
"""Check trailing backslashes.
|
113 |
||
114 |
Does a head count on trailing backslashes to ensure there isn't an odd
|
|
115 |
one on the end that would escape the brackets we wrap the RE in.
|
|
116 |
"""
|
|
117 |
if (len(m) % 2) != 0: |
|
2298.8.1
by Kent Gibson
Normalise ignore patterns to use '/' path separator. |
118 |
warning(u"Regular expressions cannot end with an odd number of '\\'. " |
119 |
"Dropping the final '\\'.") |
|
120 |
return m[:-1] |
|
121 |
return m |
|
122 |
||
123 |
||
2135.2.1
by Kent Gibson
Added glob module to replace broken fnmatch based ignore pattern matching (#57637) |
124 |
_sub_re = Replacer() |
125 |
_sub_re.add(u'^RE:', u'') |
|
126 |
_sub_re.add(u'\((?!\?)', u'(?:') |
|
127 |
_sub_re.add(u'\(\?P<.*>', _invalid_regex(u'(?:')) |
|
128 |
_sub_re.add(u'\(\?P=[^)]*\)', _invalid_regex(u'')) |
|
2298.8.1
by Kent Gibson
Normalise ignore patterns to use '/' path separator. |
129 |
_sub_re.add(ur'\\+$', _trailing_backslashes_regex) |
2135.2.1
by Kent Gibson
Added glob module to replace broken fnmatch based ignore pattern matching (#57637) |
130 |
|
131 |
||
2135.2.2
by Kent Gibson
Ignore pattern matcher (glob.py) patches: |
132 |
_sub_fullpath = Replacer() |
133 |
_sub_fullpath.add(ur'^RE:.*', _sub_re) # RE:<anything> is a regex |
|
134 |
_sub_fullpath.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group |
|
135 |
_sub_fullpath.add(ur'(?:(?<=/)|^)(?:\.?/)+', u'') # canonicalize path |
|
136 |
_sub_fullpath.add(ur'\\.', ur'\&') # keep anything backslashed |
|
137 |
_sub_fullpath.add(ur'[(){}|^$+.]', ur'\\&') # escape specials |
|
138 |
_sub_fullpath.add(ur'(?:(?<=/)|^)\*\*+/', ur'(?:.*/)?') # **/ after ^ or / |
|
139 |
_sub_fullpath.add(ur'\*+', ur'[^/]*') # * elsewhere |
|
140 |
_sub_fullpath.add(ur'\?', ur'[^/]') # ? everywhere |
|
141 |
||
142 |
||
143 |
_sub_basename = Replacer() |
|
144 |
_sub_basename.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group |
|
145 |
_sub_basename.add(ur'\\.', ur'\&') # keep anything backslashed |
|
146 |
_sub_basename.add(ur'[(){}|^$+.]', ur'\\&') # escape specials |
|
147 |
_sub_basename.add(ur'\*+', ur'.*') # * everywhere |
|
148 |
_sub_basename.add(ur'\?', ur'.') # ? everywhere |
|
149 |
||
150 |
||
151 |
def _sub_extension(pattern):
    """Translate an extension pattern to a regular expression fragment.

    :param pattern: A glob whose first two characters are skipped; the
        remainder is translated with the basename rules.
    :return: The regex text for everything after those two characters.
    """
    suffix = pattern[2:]
    return _sub_basename(suffix)
|
2135.2.1
by Kent Gibson
Added glob module to replace broken fnmatch based ignore pattern matching (#57637) |
153 |
|
154 |
||
155 |
class Globster(object):
    """A simple wrapper for a set of glob patterns.

    Provides the capability to search the patterns to find a match for
    a given filename (including the full path).

    Patterns are translated to regular expressions to expedite matching.

    The regular expressions for multiple patterns are aggregated into
    a super-regex containing groups of up to 99 patterns.
    The 99 limitation is due to the grouping limit of the Python re module.
    The resulting super-regex and associated patterns are stored as a list of
    (regex,[patterns]) in _regex_patterns.

    For performance reasons the patterns are categorised as extension patterns
    (those that match against a file extension), basename patterns
    (those that match against the basename of the filename),
    and fullpath patterns (those that match against the full path).
    The translations used for extensions and basenames are relatively simpler
    and therefore faster to perform than the fullpath patterns.

    Also, the extension patterns are more likely to find a match and
    so are matched first, then the basename patterns, then the fullpath
    patterns.
    """

    def __init__(self, patterns):
        """Classify, translate and compile the given glob patterns.

        :param patterns: An iterable of pattern strings.  Each is normalized
            first, then filed as a fullpath pattern (starts with 'RE:' or
            contains '/'), an extension pattern (starts with '*.') or a
            basename pattern (everything else).
        """
        self._regex_patterns = []
        ext_patterns, base_patterns, path_patterns = [], [], []
        for pat in patterns:
            pat = normalize_pattern(pat)
            if u'/' in pat or pat.startswith(u'RE:'):
                bucket = path_patterns
            elif pat.startswith(u'*.'):
                bucket = ext_patterns
            else:
                bucket = base_patterns
            bucket.append(pat)
        # Registration order is the order in which match() tries the groups.
        self._add_patterns(ext_patterns, _sub_extension,
                           prefix=r'(?:.*/)?(?!.*/)(?:.*\.)')
        self._add_patterns(base_patterns, _sub_basename,
                           prefix=r'(?:.*/)?(?!.*/)')
        self._add_patterns(path_patterns, _sub_fullpath)

    def _add_patterns(self, patterns, translator, prefix=''):
        """Compile patterns into super-regexes of at most 99 alternatives.

        :param patterns: The glob patterns to add, in priority order.
        :param translator: Callable turning one glob into regex text.
        :param prefix: Regex text placed in front of the alternation.
        """
        for start in range(0, len(patterns), 99):
            chunk = patterns[start:start + 99]
            # One capturing group per pattern so that match() can map the
            # matching group back to the pattern that produced it.
            alternatives = '|'.join(
                ['(%s)' % translator(pat) for pat in chunk])
            compiled = re.compile('%s(?:%s)$' % (prefix, alternatives),
                                  re.UNICODE)
            self._regex_patterns.append((compiled, chunk))

    def match(self, filename):
        """Searches for a pattern that matches the given filename.

        :return: A matching pattern or None if there is no matching pattern.
        """
        for regex, patterns in self._regex_patterns:
            found = regex.match(filename)
            if found is None:
                continue
            return patterns[found.lastindex - 1]
        return None
|
217 |
||
2298.8.1
by Kent Gibson
Normalise ignore patterns to use '/' path separator. |
218 |
|
219 |
def normalize_pattern(pattern):
    """Converts backslashes in path patterns to forward slashes.

    Patterns starting with 'RE:' keep their backslashes, since in a regular
    expression they may be escapes.  Trailing '/' characters are stripped
    from every pattern.

    :param pattern: The pattern string to normalize.
    :return: The normalized pattern.
    """
    is_regex = pattern.startswith('RE:')
    if is_regex:
        normalized = pattern
    else:
        normalized = '/'.join(pattern.split('\\'))
    return normalized.rstrip('/')