~bzr-pqm/bzr/bzr.dev

5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
1
# Copyright (C) 2011 Canonical Ltd
2
#
5820.1.18 by INADA Naoki
Add copyright for some function.
3
# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
4
# wrap and fill are copied from Python's textwrap module
5
# (under PSF license) and modified to support CJK.
6
# Original Copyright for these functions:
7
#
8
# Copyright (C) 1999-2001 Gregory P. Ward.
9
# Copyright (C) 2002, 2003 Python Software Foundation.
10
#
11
# Written by Greg Ward <gward@python.net>
5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
12
# This program is free software; you can redistribute it and/or modify
13
# it under the terms of the GNU General Public License as published by
14
# the Free Software Foundation; either version 2 of the License, or
15
# (at your option) any later version.
16
#
17
# This program is distributed in the hope that it will be useful,
18
# but WITHOUT ANY WARRANTY; without even the implied warranty of
19
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
# GNU General Public License for more details.
21
#
22
# You should have received a copy of the GNU General Public License
23
# along with this program; if not, write to the Free Software
24
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
5820.1.15 by Martin
Cope with lack of TextWrapper.drop_whitespace before Python 2.6
26
import sys
5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
27
import textwrap
28
from unicodedata import east_asian_width as _eawidth
29
5820.1.10 by INADA Naoki
utextwrap: Change a way to split between CJK characters.
30
from bzrlib import osutils
31
5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
32
__all__ = ["UTextWrapper", "fill", "wrap"]
33
34
class UTextWrapper(textwrap.TextWrapper):
    """
    Extend TextWrapper for Unicode.

    This text wrapper measures text by display width: East Asian
    double-width characters count as two columns.  Every double-width
    character is also treated as a possible line-break position, so a
    run of such characters is split even when break_long_words is false.

    :param width: maximum line width in columns.  When None, the
                  terminal width reported by osutils (or
                  osutils.default_terminal_width if that is unknown)
                  minus one is used.
    :param ambiguous_width: (keyword argument) width for character when
                            unicodedata.east_asian_width(c) == 'A'
                            (default: 2)

    All other keyword arguments are passed to textwrap.TextWrapper.
    """

    def __init__(self, width=None, **kwargs):
        """Create a wrapper; see the class docstring for the parameters.

        :raises ValueError: if ambiguous_width is neither 1 nor 2, or if
            drop_whitespace is false on a Python older than 2.6.
        """
        if width is None:
            width = (osutils.terminal_width() or
                        osutils.default_terminal_width) - 1

        # East Asian Width categories that are counted as two columns.
        ambi_width = kwargs.pop('ambiguous_width', 2)
        if ambi_width == 1:
            self._east_asian_doublewidth = 'FW'
        elif ambi_width == 2:
            self._east_asian_doublewidth = 'FWA'
        else:
            raise ValueError("ambiguous_width should be 1 or 2")

        # No drop_whitespace param before Python 2.6 it was always dropped
        if sys.version_info < (2, 6):
            self.drop_whitespace = kwargs.pop("drop_whitespace", True)
            if not self.drop_whitespace:
                raise ValueError("TextWrapper version must drop whitespace")
        textwrap.TextWrapper.__init__(self, width, **kwargs)

    def _unicode_char_width(self, uc):
        """Return width of character `uc`.

        :param:     uc      Single unicode character.
        :return: 2 if the East Asian Width category of `uc` is one of
            the categories configured as double width, otherwise 1.
        """
        # 'A' means the width of the character cannot be determined.
        # By default it is assumed to be 2, because a line that is too
        # long may overflow the terminal while a shorter line is still
        # acceptable.
        if _eawidth(uc) in self._east_asian_doublewidth:
            return 2
        return 1

    def _width(self, s):
        """Return the display width of the unicode string `s`.

        Each character contributes the value of _unicode_char_width().
        `s` must be a unicode string.
        """
        assert isinstance(s, unicode)
        charwidth = self._unicode_char_width
        return sum(charwidth(c) for c in s)

    def _cut(self, s, width):
        """Return head and rest of `s`.  (head + rest == s)

        Head is the longest prefix for which _width(head) <= width;
        it is empty when even the first character is too wide.
        """
        assert isinstance(s, unicode)
        w = 0
        charwidth = self._unicode_char_width
        for pos, c in enumerate(s):
            w += charwidth(c)
            if w > width:
                return s[:pos], s[pos:]
        return s, u''

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """Deal with a chunk that is wider than a whole line.

        :param chunks: reversed list of pending chunks; chunks[-1] is
            the chunk that does not fit.  Modified in place.
        :param cur_line: list of chunks already placed on the current
            line.  Modified in place.
        :param cur_len: display width of the chunks in cur_line.
        :param width: display width available on the current line.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 2:
            if chunks[-1]:
                space_left = self._width(chunks[-1][0])
            else:
                space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            head, rest = self._cut(chunks[-1], space_left)
            cur_line.append(head)
            if rest:
                chunks[-1] = rest
            else:
                del chunks[-1]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """Assemble `chunks` into lines no wider than self.width columns.

        :param chunks: list of unicode chunks as produced by _split().
            The list is reversed and consumed in place.
        :return: list of lines, each prefixed with the applicable indent.
        :raises ValueError: if self.width is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is the display width of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.  The indent is measured by
            # display width too, so an indent containing double-width
            # characters does not push the line past self.width.
            width = self.width - self._width(unicode(indent))

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                # Use _width instead of len for east asian width
                chunk_width = self._width(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_width <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_width

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and self._width(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    def _split(self, text):
        """Split `text` into chunks, isolating every double-width character.

        The text is first split by textwrap.TextWrapper._split(); each
        resulting chunk is then cut so that every character counted as
        double width becomes a chunk of its own.

        :return: list of unicode chunks.
        """
        chunks = textwrap.TextWrapper._split(self, unicode(text))
        cjk_split_chunks = []
        for chunk in chunks:
            assert chunk # TextWrapper._split removes empty chunk
            prev_pos = 0
            for pos, char in enumerate(chunk):
                # Treats all asian character are line breakable.
                # But it is not true because line breaking is
                # prohibited around some characters.
                # See UAX # 14 "UNICODE LINE BREAKING ALGORITHM"
                #
                # Use the configured width rather than a fixed category
                # list, so that ambiguous-width characters are only split
                # when they are actually counted as double width.
                if self._unicode_char_width(char) == 2:
                    if prev_pos < pos:
                        cjk_split_chunks.append(chunk[prev_pos:pos])
                    cjk_split_chunks.append(char)
                    prev_pos = pos+1
            if prev_pos < len(chunk):
                cjk_split_chunks.append(chunk[prev_pos:])
        return cjk_split_chunks

    def wrap(self, text):
        """Wrap `text` and return a list of lines.

        `text` is converted with unicode() before wrapping.
        """
        # ensure text is unicode
        return textwrap.TextWrapper.wrap(self, unicode(text))
213
214
# -- Convenience interface ---------------------------------------------
215
5820.1.2 by INADA Naoki
bzrlib.utextwrap uses bzrlib.osutils.terminal_width() when width is not specified.
216
def wrap(text, width=None, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    A UTextWrapper is built from 'width' and the remaining keyword
    arguments, and its wrap() method is applied to 'text', so each
    returned line takes at most 'width' columns.  As with the standard
    textwrap module, tabs in 'text' are expanded and other whitespace
    characters (including newline) are turned into spaces by default.
    See the UTextWrapper and TextWrapper classes for the keyword
    arguments that customize wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.wrap(text)
5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
227
5820.1.2 by INADA Naoki
bzrlib.utextwrap uses bzrlib.osutils.terminal_width() when width is not specified.
228
def fill(text, width=None, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    A UTextWrapper is built from 'width' and the remaining keyword
    arguments, and its fill() method is applied to 'text', so the
    returned string holds the whole paragraph wrapped to lines of at
    most 'width' columns.  As with wrap(), tabs are expanded and other
    whitespace characters are turned into spaces.  See the UTextWrapper
    and TextWrapper classes for the keyword arguments that customize
    wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.fill(text)
5820.1.1 by INADA Naoki
Add utextwrap that is same to textwrap but supports double width characters in east asia.
238