~bzr-pqm/bzr/bzr.dev : contents of bzrlib/utextwrap.py at revision 5820.1.19

~bzr-pqm/bzr/bzr.dev : (revision 5820.1.19)

# Copyright (C) 2011 Canonical Ltd
#
# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
# wrap and fill is copied from Python's textwrap module
# (under PSF license) and modified for support CJK.
# Original Copyright for these functions:
#
# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
#
# Written by Greg Ward <gward@python.net>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

import sys
import textwrap
from unicodedata import east_asian_width as _eawidth

from bzrlib import osutils

__all__ = ["UTextWrapper", "fill", "wrap"]

class UTextWrapper(textwrap.TextWrapper):
    """
    Extend TextWrapper for Unicode.

    This textwrapper measures text in display columns rather than in
    characters: east asian double width characters count as two columns.
    Every double width character is also treated as a place where a line
    may be broken, so a word containing such characters can be split
    even if break_long_words is false.

    :param width: Maximum line width in columns.  When None, one less
                  than osutils.terminal_width() is used, falling back to
                  osutils.default_terminal_width when that call returns
                  a false value.
    :param ambiguous_width: (keyword argument) width for character when
                            unicodedata.east_asian_width(c) == 'A'
                            (default: 2)

    All other keyword arguments are passed on to textwrap.TextWrapper.
    """

    def __init__(self, width=None, **kwargs):
        """Create a wrapper; see the class docstring for the parameters.

        :raises ValueError: if ambiguous_width is neither 1 nor 2, or if
            drop_whitespace is false on a Python older than 2.6.
        """
        if width is None:
            width = (osutils.terminal_width() or
                        osutils.default_terminal_width) - 1

        # East asian width categories that are counted as two columns.
        ambi_width = kwargs.pop('ambiguous_width', 2)
        if ambi_width == 1:
            self._east_asian_doublewidth = 'FW'
        elif ambi_width == 2:
            self._east_asian_doublewidth = 'FWA'
        else:
            raise ValueError("ambiguous_width should be 1 or 2")

        # No drop_whitespace param before Python 2.6 it was always dropped
        if sys.version_info < (2, 6):
            self.drop_whitespace = kwargs.pop("drop_whitespace", True)
            if not self.drop_whitespace:
                raise ValueError("TextWrapper version must drop whitespace")
        textwrap.TextWrapper.__init__(self, width, **kwargs)

    def _unicode_char_width(self, uc):
        """Return width of character `uc`.

        :param:     uc      Single unicode character.
        :return: 2 if the east asian width category of `uc` is one of the
            categories configured as double width, otherwise 1.
        """
        # 'A' means the width of the character is ambiguous.  Whether it
        # counts as 2 is decided by the ambiguous_width given to __init__;
        # the default is 2 because a too long line may overflow the
        # terminal while a too short line is still acceptable.
        if _eawidth(uc) in self._east_asian_doublewidth:
            return 2
        return 1

    def _width(self, s):
        """Returns display width for s.

        :param s: A unicode string (byte strings are rejected by the
            assertion below).
        :return: Sum of the widths of all characters in s.
        """
        assert isinstance(s, unicode)
        charwidth = self._unicode_char_width
        return sum(charwidth(c) for c in s)

    def _cut(self, s, width):
        """Returns head and rest of s. (head+rest == s)

        Head is the longest prefix of s with _width(head) <= width; it is
        empty when even the first character does not fit.
        """
        assert isinstance(s, unicode)
        w = 0
        charwidth = self._unicode_char_width
        for pos, c in enumerate(s):
            w += charwidth(c)
            if w > width:
                return s[:pos], s[pos:]
        return s, u''

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """Deal with a chunk that is too wide to fit on any line.

        :param chunks: Remaining chunks in reverse order; chunks[-1] is
            the over-long chunk.  Modified in place.
        :param cur_line: Chunks collected for the current line.  Modified
            in place.
        :param cur_len: Display width of the chunks in cur_line.
        :param width: Width available on the current line after the
            indent.
        """
        if width < 2 and not cur_line:
            # The line can not even hold one double width character (the
            # indent may be larger than the specified width).  Make sure
            # at least one character is stripped off on every pass so
            # that wrapping always makes progress.  This is only done for
            # an empty line: when something is already on the line the
            # next pass starts with an empty line and ends up here.
            space_left = 1
            if chunks[-1]:
                space_left = self._width(chunks[-1][0])
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            head, rest = self._cut(chunks[-1], space_left)
            cur_line.append(head)
            if rest:
                chunks[-1] = rest
            else:
                del chunks[-1]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """Combine chunks into lines of at most self.width columns.

        :param chunks: List of unicode chunks as returned by _split().
            The list is reversed and consumed in place.
        :return: List of lines, each prefixed with its indent.
        :raises ValueError: if self.width is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is the display width of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            # NOTE(review): the indent is measured with len(), not with
            # _width(), so double width characters in it count as one.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                # Use _width instead of len for east asian width
                chunk_width = self._width(chunks[-1])

                # Nope, this line is full.
                if cur_len + chunk_width > width:
                    break

                # Can at least squeeze this chunk onto the current line.
                cur_line.append(chunks.pop())
                cur_len += chunk_width

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and self._width(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    def _split(self, text):
        """Split text into chunks that must not be broken further.

        The chunks produced by TextWrapper._split are split again so that
        every double width character becomes a chunk of its own.

        :param text: Text to split; it is converted to unicode first.
        :return: List of non-empty unicode chunks.
        """
        chunks = textwrap.TextWrapper._split(self, unicode(text))
        charwidth = self._unicode_char_width
        cjk_split_chunks = []
        for chunk in chunks:
            # An empty chunk would simply contribute nothing below.
            prev_pos = 0
            for pos, char in enumerate(chunk):
                # Treats all double width characters as line breakable.
                # But it is not true because line breaking is
                # prohibited around some characters.
                # See UAX # 14 "UNICODE LINE BREAKING ALGORITHM"
                #
                # The width is taken from _unicode_char_width so that
                # ambiguous width characters are only split off when they
                # are configured as double width; with ambiguous_width=1
                # they stay part of their word.
                if charwidth(char) == 2:
                    if prev_pos < pos:
                        cjk_split_chunks.append(chunk[prev_pos:pos])
                    cjk_split_chunks.append(char)
                    prev_pos = pos+1
            if prev_pos < len(chunk):
                cjk_split_chunks.append(chunk[prev_pos:])
        return cjk_split_chunks

    def wrap(self, text):
        """Wrap text and return the list of wrapped lines.

        :param text: Paragraph to wrap; it is converted to unicode before
            it is handed to TextWrapper.wrap.
        """
        # ensure text is unicode
        return textwrap.TextWrapper.wrap(self, unicode(text))

# -- Convenience interface ---------------------------------------------

def wrap(text, width=None, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    A UTextWrapper is created from 'width' and the keyword arguments and
    its wrap() method is applied to 'text', so widths are counted in
    display columns and no line is meant to exceed 'width' columns.
    When 'width' is None the wrapper derives it from the terminal width.
    See the UTextWrapper and TextWrapper classes for the keyword
    arguments that customize wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.wrap(text)

def fill(text, width=None, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    A UTextWrapper is created from 'width' and the keyword arguments and
    its fill() method is applied to 'text', so widths are counted in
    display columns and no line is meant to exceed 'width' columns.
    When 'width' is None the wrapper derives it from the terminal width.
    See the UTextWrapper and TextWrapper classes for the keyword
    arguments that customize wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.fill(text)


5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	1	# Copyright (C) 2011 Canonical Ltd
	2	#
5820.1.18 by INADA Naoki Add copyright for some function.	3	# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
	4	# wrap and fill is copied from Python's textwrap module
	5	# (under PSF license) and modified for support CJK.
	6	# Original Copyright for these functions:
	7	#
	8	# Copyright (C) 1999-2001 Gregory P. Ward.
	9	# Copyright (C) 2002, 2003 Python Software Foundation.
	10	#
	11	# Written by Greg Ward <gward@python.net>
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	25
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	26	import sys
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	27	import textwrap
	28	from unicodedata import east_asian_width as _eawidth
	29
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	30	from bzrlib import osutils
	31
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	32	__all__ = ["UTextWrapper", "fill", "wrap"]
	33
	34	class UTextWrapper(textwrap.TextWrapper):
	35	"""
	36	Extend TextWrapper for Unicode.
	37
	38	This textwrapper handles east asian double width and split word
	39	even if !break_long_words when word contains double width
	40	characters.
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	41
	42	:param ambiguous_width: (keyword argument) width for character when
	43	unicodedata.east_asian_width(c) == 'A'
	44	(default: 2)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	45	"""
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	46
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	47	def __init__(self, width=None, **kwargs):
	48	if width is None:
	49	width = (osutils.terminal_width() or
	50	osutils.default_terminal_width) - 1
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	51
	52	ambi_width = kwargs.pop('ambiguous_width', 2)
	53	if ambi_width == 1:
	54	self._east_asian_doublewidth = 'FW'
	55	elif ambi_width == 2:
	56	self._east_asian_doublewidth = 'FWA'
	57	else:
	58	raise ValueError("ambiguous_width should be 1 or 2")
	59
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	60	# No drop_whitespace param before Python 2.6 it was always dropped
	61	if sys.version_info < (2, 6):
	62	self.drop_whitespace = kwargs.pop("drop_whitespace", True)
	63	if not self.drop_whitespace:
	64	raise ValueError("TextWrapper version must drop whitespace")
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	65	textwrap.TextWrapper.__init__(self, width, **kwargs)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	66
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	67	def _unicode_char_width(self, uc):
	68	"""Return width of character `uc`.
	69
	70	:param: uc Single unicode character.
	71	"""
	72	# 'A' means width of the character is not be able to determine.
	73	# We assume that it's width is 2 because longer wrap may over
	74	# terminal width but shorter wrap may be acceptable.
	75	return (_eawidth(uc) in self._east_asian_doublewidth and 2) or 1
	76
	77	def _width(self, s):
	78	"""Returns width for s.
	79
	80	When s is unicode, take care of east asian width.
	81	When s is bytes, treat all byte is single width character.
	82	"""
	83	assert isinstance(s, unicode)
	84	charwidth = self._unicode_char_width
	85	return sum(charwidth(c) for c in s)
	86
	87	def _cut(self, s, width):
	88	"""Returns head and rest of s. (head+rest == s)
	89
	90	Head is large as long as _width(head) <= width.
	91	"""
	92	assert isinstance(s, unicode)
	93	w = 0
	94	charwidth = self._unicode_char_width
	95	for pos, c in enumerate(s):
	96	w += charwidth(c)
	97	if w > width:
	98	return s[:pos], s[pos:]
	99	return s, u''
	100
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	101	def _handle_long_word(self, chunks, cur_line, cur_len, width):
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	102	# Figure out when indent is larger than the specified width, and make
	103	# sure at least one character is stripped off on every pass
	104	if width < 2:
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	105	space_left = chunks[-1] and self._width(chunks[-1][0]) or 1
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	106	else:
	107	space_left = width - cur_len
	108
	109	# If we're allowed to break long words, then do so: put as much
	110	# of the next chunk onto the current line as will fit.
	111	if self.break_long_words:
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	112	head, rest = self._cut(chunks[-1], space_left)
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	113	cur_line.append(head)
5820.1.5 by INADA Naoki Make UTextWrapper support byte string and add tests including Python's	114	if rest:
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	115	chunks[-1] = rest
	116	else:
	117	del chunks[-1]
	118
	119	# Otherwise, we have to preserve the long word intact. Only add
	120	# it to the current line if there's nothing already there --
	121	# that minimizes how much we violate the width constraint.
	122	elif not cur_line:
	123	cur_line.append(chunks.pop())
	124
	125	# If we're not allowed to break long words, and there's already
	126	# text on the current line, do nothing. Next time through the
	127	# main loop of _wrap_chunks(), we'll wind up here again, but
	128	# cur_len will be zero, so the next line will be entirely
	129	# devoted to the long word that we can't handle right now.
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	130
	131	def _wrap_chunks(self, chunks):
	132	lines = []
	133	if self.width <= 0:
	134	raise ValueError("invalid width %r (must be > 0)" % self.width)
	135
	136	# Arrange in reverse order so items can be efficiently popped
	137	# from a stack of chucks.
	138	chunks.reverse()
	139
	140	while chunks:
	141
	142	# Start the list of chunks that will make up the current line.
	143	# cur_len is just the length of all the chunks in cur_line.
	144	cur_line = []
	145	cur_len = 0
	146
	147	# Figure out which static string will prefix this line.
	148	if lines:
	149	indent = self.subsequent_indent
	150	else:
	151	indent = self.initial_indent
	152
	153	# Maximum width for this line.
	154	width = self.width - len(indent)
	155
	156	# First chunk on line is whitespace -- drop it, unless this
	157	# is the very beginning of the text (ie. no lines started yet).
	158	if self.drop_whitespace and chunks[-1].strip() == '' and lines:
	159	del chunks[-1]
	160
	161	while chunks:
	162	# Use _width instead of len for east asian width
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	163	l = self._width(chunks[-1])
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	164
	165	# Can at least squeeze this chunk onto the current line.
	166	if cur_len + l <= width:
	167	cur_line.append(chunks.pop())
	168	cur_len += l
	169
	170	# Nope, this line is full.
	171	else:
	172	break
	173
	174	# The current line is full, and the next chunk is too big to
	175	# fit on any line (not just this one).
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	176	if chunks and self._width(chunks[-1]) > width:
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	177	self._handle_long_word(chunks, cur_line, cur_len, width)
	178
	179	# If the last chunk on this line is all whitespace, drop it.
	180	if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
	181	del cur_line[-1]
	182
	183	# Convert current line back to a string and store it in list
	184	# of all lines (return value).
	185	if cur_line:
	186	lines.append(indent + ''.join(cur_line))
	187
	188	return lines
	189
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	190	def _split(self, text):
	191	chunks = textwrap.TextWrapper._split(self, unicode(text))
	192	cjk_split_chunks = []
	193	for chunk in chunks:
	194	assert chunk # TextWrapper._split removes empty chunk
	195	prev_pos = 0
	196	for pos, char in enumerate(chunk):
	197	# Treats all asian character are line breakable.
	198	# But it is not true because line breaking is
	199	# prohibited around some characters.
	200	# See UAX # 14 "UNICODE LINE BREAKING ALGORITHM"
	201	if _eawidth(char) in 'FWA':
	202	if prev_pos < pos:
	203	cjk_split_chunks.append(chunk[prev_pos:pos])
	204	cjk_split_chunks.append(char)
	205	prev_pos = pos+1
	206	if prev_pos < len(chunk):
	207	cjk_split_chunks.append(chunk[prev_pos:])
	208	return cjk_split_chunks
	209
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	210	def wrap(self, text):
	211	# ensure text is unicode
	212	return textwrap.TextWrapper.wrap(self, unicode(text))
	213
	214	# -- Convenience interface ---------------------------------------------
	215
5820.1.2 by INADA Naoki bzrlib.utextwrap uses bzrlib.osutils.terminal_width() when width is not specified.	216	def wrap(text, width=None, **kwargs):
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	217	"""Wrap a single paragraph of text, returning a list of wrapped lines.
	218
	219	Reformat the single paragraph in 'text' so it fits in lines of no
	220	more than 'width' columns, and return a list of wrapped lines. By
	221	default, tabs in 'text' are expanded with string.expandtabs(), and
	222	all other whitespace characters (including newline) are converted to
	223	space. See TextWrapper class for available keyword args to customize
	224	wrapping behaviour.
	225	"""
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	226	return UTextWrapper(width=width, **kwargs).wrap(text)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	227
5820.1.2 by INADA Naoki bzrlib.utextwrap uses bzrlib.osutils.terminal_width() when width is not specified.	228	def fill(text, width=None, **kwargs):
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	229	"""Fill a single paragraph of text, returning a new string.
	230
	231	Reformat the single paragraph in 'text' to fit in lines of no more
	232	than 'width' columns, and return a new string containing the entire
	233	wrapped paragraph. As with wrap(), tabs are expanded and other
	234	whitespace characters converted to space. See TextWrapper class for
	235	available keyword args to customize wrapping behaviour.
	236	"""
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	237	return UTextWrapper(width=width, **kwargs).fill(text)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	238