~bzr-pqm/bzr/bzr.dev

Committer: INADA Naoki
Date: 2011-05-05 03:22:29 UTC
mto: This revision was merged to the branch mainline in revision 5874.
Revision ID: songofacandy@gmail.com-20110505032229-439iyvma4xv94nvu

utextwrap: Change the way text is split between CJK characters.
_split() now splits each double-width character into its own chunk.
This approach is simpler but slower.

files modified:
bzrlib/tests/test_utextwrap.py

bzrlib/utextwrap.py

Show diffs side-by-side

added added

removed removed

bzrlib/utextwrap.py

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

import osutils

# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,

# wrap and fill are copied from Python's textwrap module

# (under PSF license) and modified to support CJK.

import textwrap

from unicodedata import east_asian_width as _eawidth

from bzrlib import osutils

__all__ = ["UTextWrapper", "fill", "wrap"]

def _width(s):

w += (c in 'FWA' and 2) or 1

return w

def _break_cjkword(word, width):

"""Split `word` by `width`.

Returns a tuple contains 2 strings. First string is head of

`word` that's length is less than `width`. Second string is

rest of `word`.

The border of head and rest is next to double width character.

Because spaces is not used as word separator on CJK.

When ``_width(word) < width``, returns ``(word, '')``.

When can't split anywhere, returns ``('', word)``.

def _cut(s, width):

"""Returns head and rest of s. (head+rest == s).

Head is large as long as _width(head) <= width.

"""

if isinstance(s, str):

return s[:width], s[width:]

assert isinstance(s, unicode)

w = 0

for pos, c in enumerate(word):

nw = _width(c)

if w + nw > width:

break

w += nw

else:

return word, ''

if pos>0 and _width(word[pos]) == 2:

# "sssDDD" and pos=3 => "sss", "DDD" (D is double width)

return word[:pos], word[pos:]

# "DDDssss" and pos=4 => "DDD", "ssss"

while pos > 0 and _width(word[pos-1]) != 2:

pos -= 1

if pos == 0:

return '', word

return word[:pos], word[pos:]

for pos, c in enumerate(s):

w += (_eawidth(c) in 'FWA' and 2) or 1

if w > width:

return s[:pos], s[pos:]

return s, ''

class UTextWrapper(textwrap.TextWrapper):

textwrap.TextWrapper.__init__(self, width, **kwargs)

def _handle_long_word(self, chunks, cur_line, cur_len, width):

head, rest = _break_cjkword(chunks[-1], width)

if head:

chunks.pop()

# Figure out when indent is larger than the specified width, and make

# sure at least one character is stripped off on every pass

if width < 2:

space_left = chunks[-1] and _width(chunks[-1][0]) or 1

else:

space_left = width - cur_len

# If we're allowed to break long words, then do so: put as much

# of the next chunk onto the current line as will fit.

if self.break_long_words:

head, rest = _cut(chunks[-1], space_left)

cur_line.append(head)

if rest:

chunks.append(rest)

chunks.append(head)

return

textwrap.TextWrapper._handle_long_word(

self, chunks, cur_line, cur_len, width)

chunks[-1] = rest

else:

del chunks[-1]

# Otherwise, we have to preserve the long word intact. Only add

# it to the current line if there's nothing already there --

# that minimizes how much we violate the width constraint.

elif not cur_line:

cur_line.append(chunks.pop())

100

# If we're not allowed to break long words, and there's already

101

# text on the current line, do nothing. Next time through the

102

# main loop of _wrap_chunks(), we'll wind up here again, but

103

# cur_len will be zero, so the next line will be entirely

104

# devoted to the long word that we can't handle right now.

105

106

def _wrap_chunks(self, chunks):

107

lines = []

137

145

138

146

# Nope, this line is full.

139

147

else:

140

# break CJK words

141

head, rest = _break_cjkword(chunks[-1], width-cur_len)

142

if head:

143

cur_line.append(head)

144

cur_len += _width(head)

145

assert rest

146

chunks[-1] = rest

147

148

break

148

149

150

# The current line is full, and the next chunk is too big to

162

163

164

return lines

164

165

166

def _split(self, text):
    """Split `text` into chunks, isolating each double-width character.

    `text` is coerced to unicode and split with
    ``textwrap.TextWrapper._split``.  Every resulting chunk is then
    divided further so that each character whose East Asian width is
    'F', 'W' or 'A' becomes a chunk of its own; runs of other characters
    between them are kept together.

    :param text: The text to split.
    :return: A list of unicode chunks.
    """
    chunks = textwrap.TextWrapper._split(self, unicode(text))
    cjk_split_chunks = []
    for chunk in chunks:
        assert chunk # TextWrapper._split removes empty chunk
        prev_pos = 0
        for pos, char in enumerate(chunk):
            # Treat every East Asian wide character as a place where the
            # line may be broken.  This is a simplification: line breaking
            # is prohibited around some characters.
            # See UAX #14 "UNICODE LINE BREAKING ALGORITHM".
            if _eawidth(char) in 'FWA':
                if prev_pos < pos:
                    # Flush the run of narrow characters seen so far.
                    cjk_split_chunks.append(chunk[prev_pos:pos])
                cjk_split_chunks.append(char)
                prev_pos = pos+1
        if prev_pos < len(chunk):
            # Trailing narrow characters after the last wide character.
            cjk_split_chunks.append(chunk[prev_pos:])
    return cjk_split_chunks

185

165

186

def wrap(self, text):
    """Wrap `text` and return the result of ``textwrap.TextWrapper.wrap``.

    :param text: The text to wrap; it is converted with ``unicode()``
        before being passed to the base class.
    """
    # ensure text is unicode
    return textwrap.TextWrapper.wrap(self, unicode(text))

Older »