~bzr-pqm/bzr/bzr.dev : contents of bzrlib/globbing.py at revision 3169

~bzr-pqm/bzr/bzr.dev : (revision 3169)

# Copyright (C) 2006 Canonical Ltd

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Tools for converting globs to regular expressions.

This module provides functions for converting shell-like globs to regular
expressions.
"""

import re

from bzrlib.trace import (
    warning
    )


class Replacer(object):
    """Do a multiple-pattern substitution.

    The patterns and substitutions are combined into one, so the result of
    one replacement is never substituted again. Add the patterns and
    replacements via the add method and then call the object. The patterns
    must not contain capturing groups.

    Each pattern is wrapped in its own capturing group and the groups are
    joined into a single alternation; the index of the group that matched
    selects the replacement. When several patterns could match at the same
    position, the one added first wins.
    """

    # Marker used in string templates: a literal backslash followed by '&'.
    # Spelled without the ``ur`` prefix so the literal is accepted by both
    # Python 2 and Python 3; the compiled pattern is unchanged.
    _expand = re.compile(u'\\\\&')

    def __init__(self, source=None):
        """Create a replacer, optionally copying the rules of another one.

        :param source: A Replacer whose patterns and replacements are copied
            (the lists are duplicated, not shared), or None to start empty.
        """
        # Combined compiled pattern; built lazily and reset on every change.
        self._pat = None
        if source:
            self._pats = list(source._pats)
            self._funs = list(source._funs)
        else:
            self._pats = []
            self._funs = []

    def add(self, pat, fun):
        r"""Add a pattern and replacement.

        The pattern must not contain capturing groups.
        The replacement might be either a string template in which \& will be
        replaced with the match, or a function that will get the matching text
        as argument. It does not get match object, because capturing is
        forbidden anyway.

        :param pat: Regular expression source for the pattern.
        :param fun: Template string or callable producing the replacement.
        """
        self._pat = None
        self._pats.append(pat)
        self._funs.append(fun)

    def add_replacer(self, replacer):
        r"""Add all patterns from another replacer.

        All patterns and replacements from replacer are appended to the ones
        already defined.

        :param replacer: The Replacer whose rules are appended.
        """
        self._pat = None
        self._pats.extend(replacer._pats)
        self._funs.extend(replacer._funs)

    def __call__(self, text):
        """Return text with every match of a registered pattern replaced.

        :param text: The string to transform.
        :return: The transformed string; text itself when no patterns have
            been added.
        """
        if not self._pats:
            # An empty alternation would match the empty string everywhere
            # without setting any group, which _do_sub cannot map to a rule.
            return text
        if self._pat is None:
            self._pat = re.compile(
                    u'|'.join([u'(%s)' % p for p in self._pats]),
                    re.UNICODE)
        return self._pat.sub(self._do_sub, text)

    def _do_sub(self, m):
        r"""Compute the replacement for one match of the combined pattern.

        :param m: Match object produced by the combined pattern.
        :return: The result of the replacement function, or the template
            with every \& marker replaced by the matched text.
        """
        matched = m.group(0)
        # Group N of the combined pattern belongs to rule N - 1.
        fun = self._funs[m.lastindex - 1]
        if callable(fun):
            return fun(matched)
        # Insert the matched text through a function so it is used verbatim.
        # Handing it to sub() as a replacement template would make the re
        # module interpret any backslashes it contains (collapsing '\\\\',
        # treating '\\1' as a group reference, rejecting unknown escapes).
        return self._expand.sub(lambda marker: matched, fun)


_sub_named = Replacer()
_sub_named.add(ur'\[:digit:\]', ur'\d')
_sub_named.add(ur'\[:space:\]', ur'\s')
_sub_named.add(ur'\[:alnum:\]', ur'\w')
_sub_named.add(ur'\[:ascii:\]', ur'\0-\x7f')
_sub_named.add(ur'\[:blank:\]', ur' \t')
_sub_named.add(ur'\[:cntrl:\]', ur'\0-\x1f\x7f-\x9f')


def _sub_group(m):
    """Translate a glob bracket expression into a regex character class.

    :param m: The matched bracket expression, including both brackets.
    :return: A regex character class. A leading '!' or '^' inside the
        brackets becomes the regex negation '^'; named classes in the body
        are translated by _sub_named.
    """
    if m[1] in (u'!', u'^'):
        # Skip the negation marker as well as the opening bracket.
        opener, body_start = u'[^', 2
    else:
        opener, body_start = u'[', 1
    return opener + _sub_named(m[body_start:-1]) + u']'


def _invalid_regex(repl):
    """Build a replacement function for a disallowed regex construct.

    :param repl: The text to substitute for the disallowed construct.
    :return: A function taking the matched text; it emits a warning naming
        the matched text and its replacement, then returns repl.
    """
    def _warn_and_replace(m):
        message = (u"'%s' not allowed within a regular expression. "
                   "Replacing with '%s'" % (m, repl))
        warning(message)
        return repl
    return _warn_and_replace


def _trailing_backslashes_regex(m):
    """Check trailing backslashes.

    Does a head count on trailing backslashes to ensure there isn't an odd
    one on the end that would escape the brackets we wrap the RE in.
    """
    if (len(m) % 2) != 0:
        warning(u"Regular expressions cannot end with an odd number of '\\'. "
                "Dropping the final '\\'.")
        return m[:-1]
    return m


# Translator for 'RE:' patterns.  Globster wraps every translated pattern in
# a capturing group of its own and identifies the matching pattern by group
# index, so the user's expression must not define capturing groups and must
# not end in a dangling backslash.
_sub_re = Replacer()
# Drop the 'RE:' marker itself.
_sub_re.add(u'^RE:', u'')
# Pass backslash-escaped characters through untouched so that an escaped
# paren such as '\(' is not mistaken for the start of a group.  A function
# is used rather than a template so the text is returned verbatim.
_sub_re.add(u'\\\\.', lambda escaped: escaped)
# Turn plain capturing groups into non-capturing ones.
_sub_re.add(u'\\((?!\\?)', u'(?:')
# Named groups are capturing too.  The name is matched with [^>]* rather
# than a greedy .* so that only the group opener is replaced, not everything
# up to the last '>' in the expression.
_sub_re.add(u'\\(\\?P<[^>]*>', _invalid_regex(u'(?:'))
# Back-references to named groups cannot work once the groups are gone.
_sub_re.add(u'\\(\\?P=[^)]*\\)', _invalid_regex(u''))
_sub_re.add(u'\\\\+$', _trailing_backslashes_regex)


# Translator for patterns that are matched against the full path.
#
# The rules are combined into one alternation in the order they are added,
# so at any position in the glob the first rule listed here that matches is
# the one applied.  In this translation '*' and '?' never match a '/', and a
# leading or post-slash '**/' matches any number of directories (including
# none).
_sub_fullpath = Replacer()
_sub_fullpath.add(ur'^RE:.*', _sub_re) # RE:<anything> is a regex
_sub_fullpath.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
_sub_fullpath.add(ur'(?:(?<=/)|^)(?:\.?/)+', u'') # canonicalize path
_sub_fullpath.add(ur'\\.', ur'\&') # keep anything backslashed
_sub_fullpath.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
_sub_fullpath.add(ur'(?:(?<=/)|^)\*\*+/', ur'(?:.*/)?') # **/ after ^ or /
_sub_fullpath.add(ur'\*+', ur'[^/]*') # * elsewhere
_sub_fullpath.add(ur'\?', ur'[^/]') # ? everywhere


# Translator for patterns that are matched against the basename only.
#
# Globster only routes patterns without a '/' here, so unlike the fullpath
# translator '*' and '?' are translated to plain '.*' and '.', and there are
# no rules for 'RE:' prefixes, path canonicalization or '**/'.  As with every
# Replacer, the first rule listed that matches at a position is applied.
_sub_basename = Replacer()
_sub_basename.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
_sub_basename.add(ur'\\.', ur'\&') # keep anything backslashed
_sub_basename.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
_sub_basename.add(ur'\*+', ur'.*') # * everywhere
_sub_basename.add(ur'\?', ur'.') # ? everywhere


def _sub_extension(pattern):
    """Translate an extension glob into a regex for the extension part.

    :param pattern: A glob whose first two characters are skipped; Globster
        passes patterns that start with '*.'.
    :return: The remainder of the pattern translated by _sub_basename.
    """
    extension_glob = pattern[2:]
    return _sub_basename(extension_glob)


class Globster(object):
    """Match filenames against a set of glob patterns.

    Given a filename (including the full path), finds a pattern from the
    set that matches it.

    Patterns are translated to regular expressions to expedite matching.

    The regular expressions for multiple patterns are aggregated into
    a super-regex containing groups of up to 99 patterns.
    The 99 limitation is due to the grouping limit of the Python re module.
    Each super-regex is stored together with the patterns it was built from,
    as a (regex, [patterns]) pair in the list _regex_patterns.

    For performance reasons the patterns are sorted into three categories:
    extension patterns (matched against the file extension), basename
    patterns (matched against the basename of the filename) and fullpath
    patterns (matched against the full path). The translations used for
    extensions and basenames are simpler, and therefore faster to perform,
    than the one used for fullpath patterns.

    Extension patterns are the most likely to find a match, so they are
    tried first, followed by the basename patterns and finally the fullpath
    patterns.
    """
    def __init__(self, patterns):
        """Categorise and compile the given patterns.

        :param patterns: An iterable of glob (or 'RE:'-prefixed regex)
            pattern strings.
        """
        self._regex_patterns = []
        by_extension = []
        by_basename = []
        by_fullpath = []
        for raw_pattern in patterns:
            pat = normalize_pattern(raw_pattern)
            if pat.startswith(u'RE:') or u'/' in pat:
                bucket = by_fullpath
            elif pat.startswith(u'*.'):
                bucket = by_extension
            else:
                bucket = by_basename
            bucket.append(pat)
        # The order of these calls is the order in which match() tries the
        # categories.
        self._add_patterns(by_extension, _sub_extension,
            prefix=r'(?:.*/)?(?!.*/)(?:.*\.)')
        self._add_patterns(by_basename, _sub_basename,
            prefix=r'(?:.*/)?(?!.*/)')
        self._add_patterns(by_fullpath, _sub_fullpath)

    def _add_patterns(self, patterns, translator, prefix=''):
        """Compile patterns into super-regexes of at most 99 patterns each.

        :param patterns: List of pattern strings of one category.
        :param translator: Callable turning one pattern into regex text.
        :param prefix: Regex text placed in front of the alternation.
        """
        for start in range(0, len(patterns), 99):
            chunk = patterns[start:start + 99]
            # One capturing group per pattern, so that the index of the
            # group that matched identifies the pattern.
            alternatives = '|'.join(
                ['(%s)' % translator(pat) for pat in chunk])
            compiled = re.compile('%s(?:%s)$' % (prefix, alternatives),
                                  re.UNICODE)
            self._regex_patterns.append((compiled, chunk))

    def match(self, filename):
        """Searches for a pattern that matches the given filename.

        :param filename: The filename, including its path, to test.
        :return: A matching pattern or None if there is no matching pattern.
        """
        for regex, patterns in self._regex_patterns:
            found = regex.match(filename)
            if found is not None:
                return patterns[found.lastindex - 1]
        return None
        

def normalize_pattern(pattern):
    """Normalize the path separators of a pattern.

    Backslashes in glob patterns are converted to forward slashes. Regular
    expressions (patterns starting with 'RE:') are not converted, because
    their backslashes may be escapes. Trailing forward slashes are removed
    from every pattern.

    :param pattern: The pattern string to normalize.
    :return: The normalized pattern.
    """
    if pattern.startswith('RE:'):
        normalized = pattern
    else:
        normalized = '/'.join(pattern.split('\\'))
    return normalized.rstrip('/')

2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	1	# Copyright (C) 2006 Canonical Ltd
	2
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
	17	"""Tools for converting globs to regular expressions.
	18
	19	This module provides functions for converting shell-like globs to regular
	20	expressions.
	21	"""
	22
	23	import re
	24
	25	from bzrlib.trace import (
	26	warning
	27	)
	28
	29
	30	class Replacer(object):
	31	"""Do a multiple-pattern substitution.
	32
	33	The patterns and substitutions are combined into one, so the result of
	34	one replacement is never substituted again. Add the patterns and
	35	replacements via the add method and then call the object. The patterns
	36	must not contain capturing groups.
	37	"""
	38
	39	_expand = re.compile(ur'\\&')
	40
	41	def __init__(self, source=None):
	42	self._pat = None
	43	if source:
	44	self._pats = list(source._pats)
	45	self._funs = list(source._funs)
	46	else:
	47	self._pats = []
	48	self._funs = []
	49
	50	def add(self, pat, fun):
	51	r"""Add a pattern and replacement.
	52
	53	The pattern must not contain capturing groups.
	54	The replacement might be either a string template in which \& will be
	55	replaced with the match, or a function that will get the matching text
	56	as argument. It does not get match object, because capturing is
	57	forbidden anyway.
	58	"""
	59	self._pat = None
	60	self._pats.append(pat)
	61	self._funs.append(fun)
	62
	63	def add_replacer(self, replacer):
	64	r"""Add all patterns from another replacer.
65
66	All patterns and replacements from replacer are appended to the ones
67	already defined.
68	"""
69	self._pat = None
70	self._pats.extend(replacer._pats)
71	self._funs.extend(replacer._funs)
72
73	def __call__(self, text):
74	if not self._pat:
75	self._pat = re.compile(
	76	u'|'.join([u'(%s)' % p for p in self._pats]),
77	re.UNICODE)
78	return self._pat.sub(self._do_sub, text)
79
80	def _do_sub(self, m):
81	fun = self._funs[m.lastindex - 1]
82	if hasattr(fun, '__call__'):
83	return fun(m.group(0))
84	else:
85	return self._expand.sub(m.group(0), fun)
86
87
88	_sub_named = Replacer()
89	_sub_named.add(ur'\[:digit:\]', ur'\d')
90	_sub_named.add(ur'\[:space:\]', ur'\s')
91	_sub_named.add(ur'\[:alnum:\]', ur'\w')
92	_sub_named.add(ur'\[:ascii:\]', ur'\0-\x7f')
93	_sub_named.add(ur'\[:blank:\]', ur' \t')
94	_sub_named.add(ur'\[:cntrl:\]', ur'\0-\x1f\x7f-\x9f')
95
96
97	def _sub_group(m):
98	if m[1] in (u'!', u'^'):
99	return u'[^' + _sub_named(m[2:-1]) + u']'
100	return u'[' + _sub_named(m[1:-1]) + u']'
101
102
103	def _invalid_regex(repl):
104	def _(m):
2135.2.7 by Kent Gibson Implement JAM's review suggestions.	105	warning(u"'%s' not allowed within a regular expression. "
2135.2.7 by Kent Gibson Implement JAM's review suggestions.	106	"Replacing with '%s'" % (m, repl))
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	107	return repl
	108	return _
	109
	110
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	111	def _trailing_backslashes_regex(m):
2298.8.2 by Kent Gibson Review fixes for lp86451 patch.	112	"""Check trailing backslashes.
	113
	114	Does a head count on trailing backslashes to ensure there isn't an odd
	115	one on the end that would escape the brackets we wrap the RE in.
	116	"""
	117	if (len(m) % 2) != 0:
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	118	warning(u"Regular expressions cannot end with an odd number of '\\'. "
	119	"Dropping the final '\\'.")
	120	return m[:-1]
	121	return m
	122
	123
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	124	_sub_re = Replacer()
	125	_sub_re.add(u'^RE:', u'')
	126	_sub_re.add(u'\((?!\?)', u'(?:')
	127	_sub_re.add(u'\(\?P<.*>', _invalid_regex(u'(?:'))
	128	_sub_re.add(u'\(\?P=[^)]*\)', _invalid_regex(u''))
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	129	_sub_re.add(ur'\\+$', _trailing_backslashes_regex)
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	130
	131
2135.2.2 by Kent Gibson Ignore pattern matcher (glob.py) patches:	132	_sub_fullpath = Replacer()
	133	_sub_fullpath.add(ur'^RE:.*', _sub_re) # RE:<anything> is a regex
	134	_sub_fullpath.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
	135	_sub_fullpath.add(ur'(?:(?<=/)|^)(?:\.?/)+', u'') # canonicalize path
	136	_sub_fullpath.add(ur'\\.', ur'\&') # keep anything backslashed
	137	_sub_fullpath.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
	138	_sub_fullpath.add(ur'(?:(?<=/)|^)\*\*+/', ur'(?:.*/)?') # **/ after ^ or /
	139	_sub_fullpath.add(ur'\*+', ur'[^/]*') # * elsewhere
	140	_sub_fullpath.add(ur'\?', ur'[^/]') # ? everywhere
	141
	142
	143	_sub_basename = Replacer()
	144	_sub_basename.add(ur'\[\^?\]?(?:[^][]|\[:[^]]+:\])+\]', _sub_group) # char group
	145	_sub_basename.add(ur'\\.', ur'\&') # keep anything backslashed
	146	_sub_basename.add(ur'[(){}|^$+.]', ur'\\&') # escape specials
	147	_sub_basename.add(ur'\*+', ur'.*') # * everywhere
	148	_sub_basename.add(ur'\?', ur'.') # ? everywhere
	149
	150
	151	def _sub_extension(pattern):
	152	return _sub_basename(pattern[2:])
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	153
	154
	155	class Globster(object):
	156	"""A simple wrapper for a set of glob patterns.
	157
	158	Provides the capability to search the patterns to find a match for
	159	a given filename (including the full path).
	160
	161	Patterns are translated to regular expressions to expidite matching.
	162
	163	The regular expressions for multiple patterns are aggregated into
	164	a super-regex containing groups of up to 99 patterns.
	165	The 99 limitation is due to the grouping limit of the Python re module.
	166	The resulting super-regex and associated patterns are stored as a list of
	167	(regex,[patterns]) in _regex_patterns.
	168
	169	For performance reasons the patterns are categorised as extension patterns
	170	(those that match against a file extension), basename patterns
	171	(those that match against the basename of the filename),
	172	and fullpath patterns (those that match against the full path).
2135.2.2 by Kent Gibson Ignore pattern matcher (glob.py) patches:	173	The translations used for extensions and basenames are relatively simpler
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	174	and therefore faster to perform than the fullpath patterns.
	175
	176	Also, the extension patterns are more likely to find a match and
	177	so are matched first, then the basename patterns, then the fullpath
	178	patterns.
	179	"""
	180	def __init__(self, patterns):
	181	self._regex_patterns = []
	182	path_patterns = []
	183	base_patterns = []
	184	ext_patterns = []
	185	for pat in patterns:
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	186	pat = normalize_pattern(pat)
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	187	if pat.startswith(u'RE:') or u'/' in pat:
	188	path_patterns.append(pat)
	189	elif pat.startswith(u'*.'):
	190	ext_patterns.append(pat)
	191	else:
	192	base_patterns.append(pat)
2135.2.2 by Kent Gibson Ignore pattern matcher (glob.py) patches:	193	self._add_patterns(ext_patterns,_sub_extension,
	194	prefix=r'(?:.*/)?(?!.*/)(?:.*\.)')
	195	self._add_patterns(base_patterns,_sub_basename,
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	196	prefix=r'(?:.*/)?(?!.*/)')
2135.2.2 by Kent Gibson Ignore pattern matcher (glob.py) patches:	197	self._add_patterns(path_patterns,_sub_fullpath)
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	198
	199	def _add_patterns(self, patterns, translator, prefix=''):
	200	while patterns:
	201	grouped_rules = ['(%s)' % translator(pat) for pat in patterns[:99]]
	202	joined_rule = '%s(?:%s)$' % (prefix, '|'.join(grouped_rules))
2135.2.4 by Kent Gibson Reverted case-insensitive matches on case-insensitive platforms.	203	self._regex_patterns.append((re.compile(joined_rule, re.UNICODE),
2135.2.1 by Kent Gibson Added glob module to replace broken fnmatch based ignore pattern matching (#57637)	204	patterns[:99]))
	205	patterns = patterns[99:]
	206
	207	def match(self, filename):
	208	"""Searches for a pattern that matches the given filename.
	209
	210	:return A matching pattern or None if there is no matching pattern.
	211	"""
	212	for regex, patterns in self._regex_patterns:
	213	match = regex.match(filename)
	214	if match:
	215	return patterns[match.lastindex -1]
	216	return None
	217
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	218
	219	def normalize_pattern(pattern):
	220	"""Converts backslashes in path patterns to forward slashes.
2298.8.2 by Kent Gibson Review fixes for lp86451 patch.	221
2298.8.2 by Kent Gibson Review fixes for lp86451 patch.	222	Doesn't normalize regular expressions - they may contain escapes.
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	223	"""
	224	if not pattern.startswith('RE:'):
2298.8.2 by Kent Gibson Review fixes for lp86451 patch.	225	pattern = pattern.replace('\\','/')
2298.8.1 by Kent Gibson Normalise ignore patterns to use '/' path separator.	226	return pattern.rstrip('/')