~bzr-pqm/bzr/bzr.dev : contents of bzrlib/utextwrap.py at revision 6001

~bzr-pqm/bzr/bzr.dev : (revision 6001)

5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	1	# Copyright (C) 2011 Canonical Ltd
	2	#
5820.1.18 by INADA Naoki Add copyright for some function.	3	# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
5820.1.27 by INADA Naoki Fix error when fix_sentence_endings=True.	4	# UTextWrapper._fix_sentence_endings, wrap and fill is copied from Python's
	5	# textwrap module (under PSF license) and modified for support CJK.
5820.1.18 by INADA Naoki Add copyright for some function.	6	# Original Copyright for these functions:
	7	#
	8	# Copyright (C) 1999-2001 Gregory P. Ward.
	9	# Copyright (C) 2002, 2003 Python Software Foundation.
	10	#
	11	# Written by Greg Ward <gward@python.net>
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	25
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	26	import sys
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	27	import textwrap
	28	from unicodedata import east_asian_width as _eawidth
	29
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	30	from bzrlib import osutils
	31
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	32	__all__ = ["UTextWrapper", "fill", "wrap"]
	33
	34	class UTextWrapper(textwrap.TextWrapper):
	35	"""
	36	Extend TextWrapper for Unicode.
	37
	38	This textwrapper handles east asian double width and split word
	39	even if !break_long_words when word contains double width
	40	characters.
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	41
	42	:param ambiguous_width: (keyword argument) width for character when
	43	unicodedata.east_asian_width(c) == 'A'
5820.1.21 by INADA Naoki Change default value of ambiguous_width from 2 to 1.	44	(default: 1)
5820.1.22 by INADA Naoki Add document of some limitations in docstring.	45
	46	Limitations:
	47	* expand_tabs doesn't fixed. It uses len() for calculating width
	48	of string on left of TAB.
	49	* Handles one codeunit as a single character having 1 or 2 width.
	50	This is not correct when there are surrogate pairs, combined
	51	characters or zero-width characters.
	52	* Treats all asian character are line breakable. But it is not
	53	true because line breaking is prohibited around some characters.
	54	(For example, breaking before punctation mark is prohibited.)
	55	See UAX # 14 "UNICODE LINE BREAKING ALGORITHM"
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	56	"""
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	57
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	58	def __init__(self, width=None, **kwargs):
	59	if width is None:
	60	width = (osutils.terminal_width() or
	61	osutils.default_terminal_width) - 1
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	62
5820.1.21 by INADA Naoki Change default value of ambiguous_width from 2 to 1.	63	ambi_width = kwargs.pop('ambiguous_width', 1)
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	64	if ambi_width == 1:
	65	self._east_asian_doublewidth = 'FW'
	66	elif ambi_width == 2:
	67	self._east_asian_doublewidth = 'FWA'
	68	else:
	69	raise ValueError("ambiguous_width should be 1 or 2")
	70
5820.1.15 by Martin Cope with lack of TextWrapper.drop_whitespace before Python 2.6	71	# No drop_whitespace param before Python 2.6 it was always dropped
	72	if sys.version_info < (2, 6):
	73	self.drop_whitespace = kwargs.pop("drop_whitespace", True)
	74	if not self.drop_whitespace:
	75	raise ValueError("TextWrapper version must drop whitespace")
5820.1.9 by INADA Naoki Default width of UTextWrapper is also osutils.terminal_widtth() and	76	textwrap.TextWrapper.__init__(self, width, **kwargs)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	77
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	78	def _unicode_char_width(self, uc):
	79	"""Return width of character `uc`.
	80
	81	:param: uc Single unicode character.
	82	"""
	83	# 'A' means width of the character is not be able to determine.
	84	# We assume that it's width is 2 because longer wrap may over
	85	# terminal width but shorter wrap may be acceptable.
	86	return (_eawidth(uc) in self._east_asian_doublewidth and 2) or 1
	87
	88	def _width(self, s):
	89	"""Returns width for s.
5820.1.26 by INADA Naoki Cleanup. Remove spaces in empty line and shorten line having 80 characters.	90
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	91	When s is unicode, take care of east asian width.
	92	When s is bytes, treat all byte is single width character.
	93	"""
	94	charwidth = self._unicode_char_width
	95	return sum(charwidth(c) for c in s)
	96
	97	def _cut(self, s, width):
	98	"""Returns head and rest of s. (head+rest == s)
	99
	100	Head is large as long as _width(head) <= width.
	101	"""
	102	w = 0
	103	charwidth = self._unicode_char_width
	104	for pos, c in enumerate(s):
	105	w += charwidth(c)
	106	if w > width:
	107	return s[:pos], s[pos:]
	108	return s, u''
	109
5820.1.27 by INADA Naoki Fix error when fix_sentence_endings=True.	110	def _fix_sentence_endings(self, chunks):
	111	"""_fix_sentence_endings(chunks : [string])
	112
	113	Correct for sentence endings buried in 'chunks'. Eg. when the
	114	original text contains "... foo.\nBar ...", munge_whitespace()
	115	and split() will convert that to [..., "foo.", " ", "Bar", ...]
	116	which has one too few spaces; this method simply changes the one
	117	space to two.
	118
	119	Note: This function is copied from textwrap.TextWrap and modified
	120	to use unicode always.
	121	"""
	122	i = 0
	123	L = len(chunks)-1
	124	patsearch = self.sentence_end_re.search
	125	while i < L:
	126	if chunks[i+1] == u" " and patsearch(chunks[i]):
	127	chunks[i+1] = u" "
	128	i += 2
	129	else:
	130	i += 1
	131
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	132	def _handle_long_word(self, chunks, cur_line, cur_len, width):
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	133	# Figure out when indent is larger than the specified width, and make
	134	# sure at least one character is stripped off on every pass
	135	if width < 2:
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	136	space_left = chunks[-1] and self._width(chunks[-1][0]) or 1
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	137	else:
	138	space_left = width - cur_len
	139
	140	# If we're allowed to break long words, then do so: put as much
	141	# of the next chunk onto the current line as will fit.
	142	if self.break_long_words:
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	143	head, rest = self._cut(chunks[-1], space_left)
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	144	cur_line.append(head)
5820.1.5 by INADA Naoki Make UTextWrapper support byte string and add tests including Python's	145	if rest:
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	146	chunks[-1] = rest
	147	else:
	148	del chunks[-1]
	149
	150	# Otherwise, we have to preserve the long word intact. Only add
	151	# it to the current line if there's nothing already there --
	152	# that minimizes how much we violate the width constraint.
	153	elif not cur_line:
	154	cur_line.append(chunks.pop())
	155
	156	# If we're not allowed to break long words, and there's already
	157	# text on the current line, do nothing. Next time through the
	158	# main loop of _wrap_chunks(), we'll wind up here again, but
	159	# cur_len will be zero, so the next line will be entirely
	160	# devoted to the long word that we can't handle right now.
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	161
	162	def _wrap_chunks(self, chunks):
	163	lines = []
	164	if self.width <= 0:
	165	raise ValueError("invalid width %r (must be > 0)" % self.width)
	166
	167	# Arrange in reverse order so items can be efficiently popped
	168	# from a stack of chucks.
	169	chunks.reverse()
	170
	171	while chunks:
	172
	173	# Start the list of chunks that will make up the current line.
	174	# cur_len is just the length of all the chunks in cur_line.
	175	cur_line = []
	176	cur_len = 0
	177
	178	# Figure out which static string will prefix this line.
	179	if lines:
	180	indent = self.subsequent_indent
	181	else:
	182	indent = self.initial_indent
	183
	184	# Maximum width for this line.
	185	width = self.width - len(indent)
	186
	187	# First chunk on line is whitespace -- drop it, unless this
	188	# is the very beginning of the text (ie. no lines started yet).
	189	if self.drop_whitespace and chunks[-1].strip() == '' and lines:
	190	del chunks[-1]
	191
	192	while chunks:
	193	# Use _width instead of len for east asian width
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	194	l = self._width(chunks[-1])
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	195
	196	# Can at least squeeze this chunk onto the current line.
	197	if cur_len + l <= width:
	198	cur_line.append(chunks.pop())
	199	cur_len += l
	200
	201	# Nope, this line is full.
	202	else:
	203	break
	204
	205	# The current line is full, and the next chunk is too big to
	206	# fit on any line (not just this one).
5820.1.19 by INADA Naoki Add keyword parameter 'ambiguous_width' that specifies width for character	207	if chunks and self._width(chunks[-1]) > width:
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	208	self._handle_long_word(chunks, cur_line, cur_len, width)
	209
	210	# If the last chunk on this line is all whitespace, drop it.
5820.1.26 by INADA Naoki Cleanup. Remove spaces in empty line and shorten line having 80 characters.	211	if self.drop_whitespace and cur_line and not cur_line[-1].strip():
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	212	del cur_line[-1]
	213
	214	# Convert current line back to a string and store it in list
	215	# of all lines (return value).
	216	if cur_line:
5820.1.27 by INADA Naoki Fix error when fix_sentence_endings=True.	217	lines.append(indent + u''.join(cur_line))
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	218
	219	return lines
	220
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	221	def _split(self, text):
	222	chunks = textwrap.TextWrapper._split(self, unicode(text))
	223	cjk_split_chunks = []
	224	for chunk in chunks:
	225	prev_pos = 0
	226	for pos, char in enumerate(chunk):
5820.1.27 by INADA Naoki Fix error when fix_sentence_endings=True.	227	if self._unicode_char_width(char) == 2:
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	228	if prev_pos < pos:
	229	cjk_split_chunks.append(chunk[prev_pos:pos])
	230	cjk_split_chunks.append(char)
	231	prev_pos = pos+1
	232	if prev_pos < len(chunk):
	233	cjk_split_chunks.append(chunk[prev_pos:])
	234	return cjk_split_chunks
	235
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	236	def wrap(self, text):
	237	# ensure text is unicode
	238	return textwrap.TextWrapper.wrap(self, unicode(text))
	239
	240	# -- Convenience interface ---------------------------------------------
	241
def wrap(text, width=None, **kwargs):
    """Wrap one paragraph and return the wrapped lines as a list.

    A UTextWrapper is built from 'width' and any extra keyword arguments,
    and its wrap() method is applied to 'text'. See the UTextWrapper and
    textwrap.TextWrapper classes for the available keyword arguments.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.wrap(text)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	253
def fill(text, width=None, **kwargs):
    """Fill one paragraph and return it as a single new string.

    A UTextWrapper is built from 'width' and any extra keyword arguments,
    and its fill() method is applied to 'text'. See the UTextWrapper and
    textwrap.TextWrapper classes for the available keyword arguments.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.fill(text)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	264