# Copyright (C) 2006, 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""Tools for converting globs to regular expressions.

This module provides functions for converting shell-like globs to regular
expressions.
"""
import re

from bzrlib.trace import (
    warning,
    )
class Replacer(object):
    """Do a multiple-pattern substitution.

    The patterns and substitutions are combined into one, so the result of
    one replacement is never substituted again. Add the patterns and
    replacements via the add method and then call the object. The patterns
    must not contain capturing groups.
    """

    # Matches the two-character placeholder "\&" inside a string template.
    # Spelled with escaped backslashes rather than the Python-2-only ``ur``
    # prefix; the compiled pattern is the same.
    _expand = re.compile(u'\\\\&')

    def __init__(self, source=None):
        """Create a replacer, optionally seeded from another one.

        :param source: a Replacer whose patterns and replacements are copied
            (the lists are not shared), or None to start with no rules.
        """
        # Compiled combined pattern; built lazily on the first call.
        self._pat = None
        if source is not None:
            self._pats = list(source._pats)
            self._funs = list(source._funs)
        else:
            self._pats = []
            self._funs = []

    def add(self, pat, fun):
        r"""Add a pattern and replacement.

        The pattern must not contain capturing groups.
        The replacement might be either a string template in which \& will be
        replaced with the match, or a function that will get the matching text
        as argument. It does not get match object, because capturing is
        forbidden anyway.
        """
        # The combined pattern is stale once the rule set changes.
        self._pat = None
        self._pats.append(pat)
        self._funs.append(fun)

    def add_replacer(self, replacer):
        r"""Add all patterns from another replacer.

        All patterns and replacements from replacer are appended to the ones
        already defined.
        """
        self._pat = None
        self._pats.extend(replacer._pats)
        self._funs.extend(replacer._funs)

    def __call__(self, text):
        """Return ``text`` with every rule applied in a single pass.

        :param text: the string to transform.
        :return: the transformed string; ``text`` itself when no rules have
            been added.
        """
        if not self._pats:
            # An empty alternation would match everywhere with no group set.
            return text
        if self._pat is None:
            self._pat = re.compile(
                u'|'.join([u'(%s)' % p for p in self._pats]),
                re.UNICODE)
        return self._pat.sub(self._do_sub, text)

    def _do_sub(self, m):
        """Compute the replacement text for one match of the combined pattern.

        :param m: match object produced by the combined pattern.
        :return: the result of the matching rule's function, or its template
            with each ``\\&`` replaced by the matched text.
        """
        # Every rule is wrapped in exactly one capturing group, so lastindex
        # identifies the rule that matched.
        fun = self._funs[m.lastindex - 1]
        matched = m.group(0)
        if callable(fun):
            return fun(matched)
        # Insert the matched text through a function so it is used verbatim;
        # handing it to re.sub as a template would make re reinterpret any
        # backslashes it contains (collapsing "\\\\" or rejecting "\\1").
        return self._expand.sub(lambda _unused: matched, fun)
88
_sub_named = Replacer()
89
_sub_named.add(ur'\[:digit:\]', ur'\d')
90
_sub_named.add(ur'\[:space:\]', ur'\s')
91
_sub_named.add(ur'\[:alnum:\]', ur'\w')
92
_sub_named.add(ur'\[:ascii:\]', ur'\0-\x7f')
93
_sub_named.add(ur'\[:blank:\]', ur' \t')
94
_sub_named.add(ur'\[:cntrl:\]', ur'\0-\x1f\x7f-\x9f')
def _sub_group(m):
    """Translate a glob character group into a regex character class.

    :param m: the matched group text, including the enclosing brackets.
    :return: the regex character class. A leading ``!`` or ``^`` inside the
        brackets produces a negated class; the remaining content is passed
        through ``_sub_named`` to expand named classes.
    """
    if m[1] in (u'!', u'^'):
        return u'[^' + _sub_named(m[2:-1]) + u']'
    return u'[' + _sub_named(m[1:-1]) + u']'
103
def _invalid_regex(repl):
105
warning(u"'%s' not allowed within a regular expression. "
106
"Replacing with '%s'" % (m, repl))
111
def _trailing_backslashes_regex(m):
112
"""Check trailing backslashes.
114
Does a head count on trailing backslashes to ensure there isn't an odd
115
one on the end that would escape the brackets we wrap the RE in.
117
if (len(m) % 2) != 0:
118
warning(u"Regular expressions cannot end with an odd number of '\\'. "
119
"Dropping the final '\\'.")
125
_sub_re.add(u'^RE:', u'')
126
_sub_re.add(u'\((?!\?)', u'(?:')
127
_sub_re.add(u'\(\?P<.*>', _invalid_regex(u'(?:'))
128
_sub_re.add(u'\(\?P=[^)]*\)', _invalid_regex(u''))
129
_sub_re.add(ur'\\+$', _trailing_backslashes_regex)
132
_sub_fullpath = Replacer()
133
_sub_fullpath.add(ur'^RE:.*', _sub_re) # RE:<anything> is a regex
134
_sub_fullpath.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
135
_sub_fullpath.add(ur'(?:(?<=/)|^)(?:\.?/)+', u'') # canonicalize path
136
_sub_fullpath.add(ur'\\.', ur'\&') # keep anything backslashed
137
_sub_fullpath.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
138
_sub_fullpath.add(ur'(?:(?<=/)|^)\*\*+/', ur'(?:.*/)?') # **/ after ^ or /
139
_sub_fullpath.add(ur'\*+', ur'[^/]*') # * elsewhere
140
_sub_fullpath.add(ur'\?', ur'[^/]') # ? everywhere
143
_sub_basename = Replacer()
144
_sub_basename.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
145
_sub_basename.add(ur'\\.', ur'\&') # keep anything backslashed
146
_sub_basename.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
147
_sub_basename.add(ur'\*+', ur'.*') # * everywhere
148
_sub_basename.add(ur'\?', ur'.') # ? everywhere
def _sub_extension(pattern):
    """Translate an extension pattern into a regex fragment.

    :param pattern: a glob whose first two characters are skipped before
        translation (callers in this module pass patterns starting with
        ``*.``).
    :return: the remainder of the pattern translated by ``_sub_basename``.
    """
    return _sub_basename(pattern[2:])
class Globster(object):
    """A simple wrapper for a set of glob patterns.

    Provides the capability to search the patterns to find a match for
    a given filename (including the full path).

    Patterns are translated to regular expressions to expedite matching.

    The regular expressions for multiple patterns are aggregated into
    a super-regex containing groups of up to 99 patterns.
    The 99 limitation is due to the grouping limit of the Python re module.
    The resulting super-regex and associated patterns are stored as a list of
    (regex,[patterns]) in _regex_patterns.

    For performance reasons the patterns are categorised as extension patterns
    (those that match against a file extension), basename patterns
    (those that match against the basename of the filename),
    and fullpath patterns (those that match against the full path).
    The translations used for extensions and basenames are relatively simpler
    and therefore faster to perform than the fullpath patterns.

    Also, the extension patterns are more likely to find a match and
    so are matched first, then the basename patterns, then the fullpath
    patterns.
    """

    def __init__(self, patterns):
        """Categorise and compile ``patterns``.

        :param patterns: iterable of glob patterns (or 'RE:'-prefixed
            regular expressions).
        """
        self._regex_patterns = []
        path_patterns = []
        base_patterns = []
        ext_patterns = []
        for pat in patterns:
            pat = normalize_pattern(pat)
            if pat.startswith(u'RE:') or u'/' in pat:
                path_patterns.append(pat)
            elif pat.startswith(u'*.'):
                ext_patterns.append(pat)
            else:
                base_patterns.append(pat)
        # Registration order is the order in which match() tries the groups.
        self._add_patterns(ext_patterns, _sub_extension,
            prefix=r'(?:.*/)?(?!.*/)(?:.*\.)')
        self._add_patterns(base_patterns, _sub_basename,
            prefix=r'(?:.*/)?(?!.*/)')
        self._add_patterns(path_patterns, _sub_fullpath)

    def _add_patterns(self, patterns, translator, prefix=''):
        """Compile ``patterns`` into super-regexes of at most 99 patterns each.

        :param patterns: list of patterns to register.
        :param translator: callable turning one pattern into a regex fragment.
        :param prefix: regex text placed before the alternation.
        """
        while patterns:
            chunk = patterns[:99]
            # One capturing group per pattern, so that match.lastindex
            # identifies which pattern of the chunk matched.
            grouped_rules = ['(%s)' % translator(pat) for pat in chunk]
            joined_rule = '%s(?:%s)$' % (prefix, '|'.join(grouped_rules))
            self._regex_patterns.append(
                (re.compile(joined_rule, re.UNICODE), chunk))
            patterns = patterns[99:]

    def match(self, filename):
        """Searches for a pattern that matches the given filename.

        :return: A matching pattern or None if there is no matching pattern.
        """
        for regex, patterns in self._regex_patterns:
            match = regex.match(filename)
            if match:
                return patterns[match.lastindex - 1]
        return None
class _OrderedGlobster(Globster):
    """A Globster that keeps pattern order."""

    def __init__(self, patterns):
        """Constructor.

        :param patterns: sequence of glob patterns
        """
        # Note: This could be smarter by running like sequences together
        self._regex_patterns = []
        for pat in patterns:
            pat = normalize_pattern(pat)
            # Each pattern gets its own regex entry so that match() tries
            # them in exactly the order given.
            if pat.startswith(u'RE:') or u'/' in pat:
                self._add_patterns([pat], _sub_fullpath)
            elif pat.startswith(u'*.'):
                self._add_patterns([pat], _sub_extension,
                    prefix=r'(?:.*/)?(?!.*/)(?:.*\.)')
            else:
                self._add_patterns([pat], _sub_basename,
                    prefix=r'(?:.*/)?(?!.*/)')
# One or more consecutive path separators of either kind.
_slashes = re.compile(r'[\\/]+')


def normalize_pattern(pattern):
    """Converts backslashes in path patterns to forward slashes.

    Doesn't normalize regular expressions - they may contain escapes.

    :param pattern: a glob pattern or an 'RE:'-prefixed regular expression.
    :return: the pattern with runs of slashes/backslashes collapsed to a
        single '/' (non-'RE:' patterns only) and trailing '/' removed.
    """
    if not pattern.startswith('RE:'):
        pattern = _slashes.sub('/', pattern)
    # Keep a lone '/' intact rather than stripping it to an empty pattern.
    if len(pattern) > 1:
        pattern = pattern.rstrip('/')
    return pattern