Issue #1858376 by jhodgdon: Provide tests and documentation for long Unicode characters.

8.0.x
catch 2013-02-06 13:58:33 +00:00
parent 63792c76df
commit fbec0a1e3e
5 changed files with 40 additions and 12 deletions

View File

@ -172,7 +172,8 @@ class PHPTransliteration implements TransliterationInterface {
* PHPTransliteration::$dataDirectory. These files should set up an array * PHPTransliteration::$dataDirectory. These files should set up an array
* variable $overrides with an element whose key is $langcode and whose value * variable $overrides with an element whose key is $langcode and whose value
* is an array whose keys are character codes, and whose values are their * is an array whose keys are character codes, and whose values are their
* transliterations in this language. * transliterations in this language. The character codes can be for any valid
* Unicode character, independent of the number of bytes.
* *
* @param $langcode * @param $langcode
* Code for the language to read. * Code for the language to read.
@ -200,7 +201,8 @@ class PHPTransliteration implements TransliterationInterface {
* hexadecimal notation) in PHPTransliteration::$dataDirectory. These files * hexadecimal notation) in PHPTransliteration::$dataDirectory. These files
* should set up a variable $bank containing an array whose numerical indices * should set up a variable $bank containing an array whose numerical indices
* are the remaining two bytes of the character code, and whose values are the * are the remaining two bytes of the character code, and whose values are the
* transliterations of these characters into US-ASCII. * transliterations of these characters into US-ASCII. Note that the maximum
* Unicode character that can be encoded in this way is 4 bytes.
* *
* @param $bank * @param $bank
* First two bytes of the Unicode character, or 0 for the ASCII range. * First two bytes of the Unicode character, or 0 for the ASCII range.

View File

@ -13,6 +13,7 @@ use Drupal\Component\Transliteration\PHPTransliteration as BaseTransliteration;
* Enhances PHPTransliteration with an alter hook. * Enhances PHPTransliteration with an alter hook.
* *
* @ingroup transliteration * @ingroup transliteration
* @see hook_transliteration_overrides_alter()
*/ */
class PHPTransliteration extends BaseTransliteration { class PHPTransliteration extends BaseTransliteration {

View File

@ -177,11 +177,14 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
* vs. initial capital letter only) is not taken into account, and in * vs. initial capital letter only) is not taken into account, and in
* transliterations of capital letters that result in two or more letters, by * transliterations of capital letters that result in two or more letters, by
* convention only the first is capitalized in the Drupal transliteration * convention only the first is capitalized in the Drupal transliteration
* result. So, the process has limitations; however, since the reason for * result. Also, only Unicode characters of 4 bytes or less can be
* transliteration is typically to create machine names or file names, this * transliterated in the base system; language-specific overrides can be made
* should not really be a problem. After transliteration, other transformation * for longer Unicode characters. So, the process has limitations; however,
* or validation may be necessary, such as converting spaces to another * since the reason for transliteration is typically to create machine names or
* character, removing non-printable characters, lower-casing, etc. * file names, this should not really be a problem. After transliteration,
* other transformation or validation may be necessary, such as converting
* spaces to another character, removing non-printable characters,
* lower-casing, etc.
* *
* Here is a code snippet to transliterate some text: * Here is a code snippet to transliterate some text:
* @code * @code
@ -196,13 +199,20 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
* Drupal Core provides the generic transliteration character tables and * Drupal Core provides the generic transliteration character tables and
* overrides for a few common languages; modules can implement * overrides for a few common languages; modules can implement
* hook_transliteration_overrides_alter() to provide further language-specific * hook_transliteration_overrides_alter() to provide further language-specific
* overrides. Modules can also completely override the transliteration classes * overrides (including providing transliteration for Unicode characters that
* in \Drupal\Core\CoreBundle. * are longer than 4 bytes). Modules can also completely override the
* transliteration classes in \Drupal\Core\CoreBundle.
*/ */
/** /**
* Provide language-specific overrides for transliteration. * Provide language-specific overrides for transliteration.
* *
* If the overrides you want to provide are standard for your language, consider
* providing a patch for the Drupal Core transliteration system instead of using
* this hook. This hook can be used temporarily until Drupal Core's
* transliteration tables are fixed, or for sites that want to use a
* non-standard transliteration system.
*
* @param array $overrides * @param array $overrides
* Associative array of language-specific overrides whose keys are integer * Associative array of language-specific overrides whose keys are integer
* Unicode character codes, and whose values are the transliterations of those * Unicode character codes, and whose values are the transliterations of those

View File

@ -43,6 +43,13 @@ class TransliterationTest extends DrupalUnitTestBase {
// This is a Canadian Aboriginal character like a triangle. See // This is a Canadian Aboriginal character like a triangle. See
// http://www.unicode.org/charts/PDF/U1400.pdf // http://www.unicode.org/charts/PDF/U1400.pdf
$four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8'); $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8');
// These are two Gothic alphabet letters. See
// http://en.wikipedia.org/wiki/Gothic_alphabet
// They are not in our tables, but should at least give us '?' (unknown).
$five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');
// Five-byte characters do not work in MySQL, so make a printable version.
$five_byte_printable = '&#x10330;&#x10338;';
$cases = array( $cases = array(
// Each test case is (language code, input, output). // Each test case is (language code, input, output).
// Test ASCII in English. // Test ASCII in English.
@ -55,6 +62,8 @@ class TransliterationTest extends DrupalUnitTestBase {
// directly from the data files. // directly from the data files.
array('fr', $three_byte, 'c'), array('fr', $three_byte, 'c'),
array('fr', $four_byte, 'wii'), array('fr', $four_byte, 'wii'),
// Test 5-byte characters.
array('en', $five_byte, '??', $five_byte_printable),
// Test a language with no overrides. // Test a language with no overrides.
array('en', $two_byte, 'A O U A O aouaohello'), array('en', $two_byte, 'A O U A O aouaohello'),
// Test language overrides provided by core. // Test language overrides provided by core.
@ -64,9 +73,10 @@ class TransliterationTest extends DrupalUnitTestBase {
array('dk', $random, $random), array('dk', $random, $random),
array('kg', $three_byte, 'ts'), array('kg', $three_byte, 'ts'),
// Test the language override hook in the test module, which changes // Test the language override hook in the test module, which changes
// the transliteration of Ä to Z. // the transliteration of Ä to Z and provides for the 5-byte characters.
array('zz', $two_byte, 'Z O U A O aouaohello'), array('zz', $two_byte, 'Z O U A O aouaohello'),
array('zz', $random, $random), array('zz', $random, $random),
array('zz', $five_byte, 'ATh', $five_byte_printable),
// Test strings in some other languages. // Test strings in some other languages.
// Turkish, provided by drupal.org user Kartagis. // Turkish, provided by drupal.org user Kartagis.
array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'), array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'),
@ -78,10 +88,11 @@ class TransliterationTest extends DrupalUnitTestBase {
foreach($cases as $case) { foreach($cases as $case) {
list($langcode, $original, $expected) = $case; list($langcode, $original, $expected) = $case;
$printable = (isset($case[3])) ? $case[3] : $original;
$transliterator_class = new PHPTransliteration(); $transliterator_class = new PHPTransliteration();
$actual = $transliterator_class->transliterate($original, $langcode); $actual = $transliterator_class->transliterate($original, $langcode);
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array( $this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array(
'@original' => $original, '@original' => $printable,
'@langcode' => $langcode, '@langcode' => $langcode,
'@expected' => $expected, '@expected' => $expected,
'@actual' => $actual, '@actual' => $actual,
@ -89,7 +100,7 @@ class TransliterationTest extends DrupalUnitTestBase {
$actual = $transliterator_service->transliterate($original, $langcode); $actual = $transliterator_service->transliterate($original, $langcode);
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array( $this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array(
'@original' => $original, '@original' => $printable,
'@langcode' => $langcode, '@langcode' => $langcode,
'@expected' => $expected, '@expected' => $expected,
'@actual' => $actual, '@actual' => $actual,

View File

@ -12,5 +12,9 @@ function transliterate_test_transliteration_overrides_alter(&$overrides, $langco
if ($langcode == 'zz') { if ($langcode == 'zz') {
// The default transliteration of Ä is A, but change it to Z for testing. // The default transliteration of Ä is A, but change it to Z for testing.
$overrides[0xC4] = 'Z'; $overrides[0xC4] = 'Z';
// Also provide transliterations of two 5-byte characters from
// http://en.wikipedia.org/wiki/Gothic_alphabet.
$overrides[0x10330] = 'A';
$overrides[0x10338] = 'Th';
} }
} }