~bzr-pqm/bzr/bzr.dev : contents of bzrlib/tests/EncodingAdapter.py at revision 5462.5.1

~bzr-pqm/bzr/bzr.dev : (revision 5462.5.1)

5247.1.1 by Vincent Ladeuil Merge previous attempt into current trunk	1	# Copyright (C) 2006, 2009, 2010 Canonical Ltd
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	2	# -- coding: utf-8 --
1685.1.78 by Wouter van Heyst more code cleanup	3	#
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
1685.1.78 by Wouter van Heyst more code cleanup	8	#
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
1685.1.78 by Wouter van Heyst more code cleanup	13	#
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	16	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	17
1685.1.76 by Wouter van Heyst codecleanup	18	"""Adapter for running test cases against multiple encodings."""
1185.85.69 by John Arbash Meinel New encoder with multiple strings.	19
	20	from copy import deepcopy
	21
# Sample strings used to build the encoding scenarios below.  Each one is
# written with escape sequences so the values do not depend on how this
# source file itself is decoded.

# Prefix for micro (1/1000000).
_mu = u'\xb5'

# Greek letter omega, not to be confused with the Ohm sign, u'\u2126'.
# Though they are probably identical, cp437 can handle the first but not
# the second.
_omega = u'\u03a9'

# Smallest error possible, epsilon.
# cp437 handles u'\u03b5', but not u'\u2208', the 'element of' operator.
_epsilon = u'\u03b5'

# Swedish?
_erik = u'Erik B\xe5gfors'

# Swedish 'räksmörgås' means shrimp sandwich.
_shrimp_sandwich = u'r\xe4ksm\xf6rg\xe5s'

# Arabic, probably only Unicode encodings can handle this one.
_juju = u'\u062c\u0648\u062c\u0648'

# iso-8859-1 alternative for juju.
_juju_alt = u'j\xfbj\xfa'

# Russian, 'Alexander' in Russian.
_alexander = u'\u0410\u043b\u0435\u043a\u0441\u0430\u043d\u0434\u0440'
# The word 'test' in Russian.
_russian_test = u'\u0422\u0435\u0441\u0442'

# Kanji
# It is a kanji sequence for nihonjin, or Japanese in English.
#
# '\u4eba' being person, '\u65e5' sun and '\u672c' origin. Ie,
# sun-origin-person, 'native from the land where the sun rises'. Note, I'm
# not a fluent speaker, so this is just my crude breakdown.
#
# Wouter van Heyst
_nihonjin = u'\u65e5\u672c\u4eba'

# Czech
# It's what is usually used for showing how fonts look, because it contains
# most accented characters, ie. in places where English speakers use 'Quick
# brown fox jumped over a lazy dog'. The literal translation of the Czech
# version would be something like 'Yellow horse groaned devilish codes'.
# Actually originally the last word used to be 'ódy' (odes). The 'k' was
# added as a pun when using the sentence to check whether one has properly
# set encoding.
_yellow_horse = (u'\u017dlu\u0165ou\u010dk\xfd k\u016f\u0148'
                 u' \xfap\u011bl \u010f\xe1belsk\xe9 k\xf3dy')
_yellow = u'\u017dlu\u0165ou\u010dk\xfd'
_someone = u'Some\u016f\u0148\u011b'
_something = u'\u0165ou\u010dk\xfd'

# Hebrew
# Shalom -> 'hello' or 'peace', used as a common greeting.
_shalom = u'\u05e9\u05dc\u05d5\u05dd'


# List of (scenario_name, parameters) pairs.  Each parameters dict has:
#   'info':     sample unicode strings keyed by 'committer', 'message',
#               'filename' and 'directory'
#   'encoding': name of the codec the scenario is meant to be run under
encoding_scenarios = [
    # Permutation 1 of utf-8
    ('utf-8,1', {
        'info': {
            'committer': _erik,
            'message': _yellow_horse,
            'filename': _shrimp_sandwich,
            'directory': _nihonjin,
            },
        'encoding': 'utf-8',
        }),
    # Permutation 2 of utf-8
    ('utf-8,2', {
        'info': {
            'committer': _alexander,
            'message': u'Testing ' + _mu,
            'filename': _shalom,
            'directory': _juju,
            },
        'encoding': 'utf-8',
        }),
    ('iso-8859-1', {
        'info': {
            'committer': _erik,
            'message': u'Testing ' + _mu,
            'filename': _juju_alt,
            'directory': _shrimp_sandwich,
            },
        'encoding': 'iso-8859-1',
        }),
    ('iso-8859-2', {
        'info': {
            'committer': _someone,
            'message': _yellow_horse,
            'filename': _yellow,
            'directory': _something,
            },
        'encoding': 'iso-8859-2',
        }),
    ('cp1251', {
        'info': {
            'committer': _alexander,
            'message': u'Testing ' + _mu,
            'filename': _russian_test,
            # Unicode suffix: keeps the whole value unicode without relying
            # on implicit str -> unicode coercion.
            'directory': _russian_test + u'dir',
            },
        'encoding': 'cp1251',
        }),
    # The iso-8859-1 tests run on a default windows cp437 installation
    # and it takes a long time to run an extra permutation of the tests.
    # But just in case we want to add this back in (written in the same
    # shape as the scenarios above):
    # ('cp437', {
    #     'info': {
    #         'committer': _erik,
    #         'message': u'Testing ' + _mu,
    #         'filename': u'file_' + _omega,
    #         'directory': _epsilon + u'_dir',
    #         },
    #     'encoding': 'cp437',
    #     }),
    ]