diff --git a/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php index a495ee8448b1..80982d27f61a 100644 --- a/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php +++ b/core/lib/Drupal/Component/Transliteration/PHPTransliteration.php @@ -172,7 +172,8 @@ class PHPTransliteration implements TransliterationInterface { * PHPTransliteration::$dataDirectory. These files should set up an array * variable $overrides with an element whose key is $langcode and whose value * is an array whose keys are character codes, and whose values are their - * transliterations in this language. + * transliterations in this language. The character codes can be for any valid + * Unicode character, independent of the number of bytes. * * @param $langcode * Code for the language to read. @@ -200,7 +201,8 @@ class PHPTransliteration implements TransliterationInterface { * hexidecimal notation) in PHPTransliteration::$dataDirectory. These files * should set up a variable $bank containing an array whose numerical indices * are the remaining two bytes of the character code, and whose values are the - * transliterations of these characters into US-ASCII. + * transliterations of these characters into US-ASCII. Note that the maximum + * Unicode character that can be encoded in this way is 4 bytes. * * @param $bank * First two bytes of the Unicode character, or 0 for the ASCII range. diff --git a/core/lib/Drupal/Core/Transliteration/PHPTransliteration.php b/core/lib/Drupal/Core/Transliteration/PHPTransliteration.php index 951ce1e526c2..6af4708379f7 100644 --- a/core/lib/Drupal/Core/Transliteration/PHPTransliteration.php +++ b/core/lib/Drupal/Core/Transliteration/PHPTransliteration.php @@ -13,6 +13,7 @@ use Drupal\Component\Transliteration\PHPTransliteration as BaseTransliteration; * Enhances PHPTransliteration with an alter hook. 
* * @ingroup transliteration + * @see hook_transliteration_overrides_alter() */ class PHPTransliteration extends BaseTransliteration { diff --git a/core/modules/system/language.api.php b/core/modules/system/language.api.php index ed81b9235be5..7c7375f0db72 100644 --- a/core/modules/system/language.api.php +++ b/core/modules/system/language.api.php @@ -177,11 +177,14 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) { * vs. initial capital letter only) is not taken into account, and in * transliterations of capital letters that result in two or more letters, by * convention only the first is capitalized in the Drupal transliteration - * result. So, the process has limitations; however, since the reason for - * transliteration is typically to create machine names or file names, this - * should not really be a problem. After transliteration, other transformation - * or validation may be necessary, such as converting spaces to another - * character, removing non-printable characters, lower-casing, etc. + * result. Also, only Unicode characters of 4 bytes or less can be + * transliterated in the base system; language-specific overrides can be made + * for longer Unicode characters. So, the process has limitations; however, + * since the reason for transliteration is typically to create machine names or + * file names, this should not really be a problem. After transliteration, + * other transformation or validation may be necessary, such as converting + * spaces to another character, removing non-printable characters, + * lower-casing, etc. * * Here is a code snippet to transliterate some text: * @code @@ -196,13 +199,20 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) { * Drupal Core provides the generic transliteration character tables and * overrides for a few common languages; modules can implement * hook_transliteration_overrides_alter() to provide further language-specific - * overrides. 
Modules can also completely override the transliteration classes - * in \Drupal\Core\CoreBundle. + * overrides (including providing transliteration for Unicode characters that + * are longer than 4 bytes). Modules can also completely override the + * transliteration classes in \Drupal\Core\CoreBundle. */ /** * Provide language-specific overrides for transliteration. * + * If the overrides you want to provide are standard for your language, consider + * providing a patch for the Drupal Core transliteration system instead of using + * this hook. This hook can be used temporarily until Drupal Core's + * transliteration tables are fixed, or for sites that want to use a + * non-standard transliteration system. + * * @param array $overrides * Associative array of language-specific overrides whose keys are integer * Unicode character codes, and whose values are the transliterations of those diff --git a/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php index d717d8b9a1ea..7630b0bf1fb1 100644 --- a/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php +++ b/core/modules/system/lib/Drupal/system/Tests/Transliteration/TransliterationTest.php @@ -43,6 +43,13 @@ class TransliterationTest extends DrupalUnitTestBase { // This is a Canadian Aboriginal character like a triangle. See // http://www.unicode.org/charts/PDF/U1400.pdf $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8'); + // These are two Gothic alphabet letters. See + // http://en.wikipedia.org/wiki/Gothic_alphabet + // They are not in our tables, but should at least give us '?' (unknown). + $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8'); + // Five-byte characters do not work in MySQL, so make a printable version. + $five_byte_printable = '&#x10330;&#x10338;'; + $cases = array( // Each test case is (language code, input, output). // Test ASCII in English. 
@@ -55,6 +62,8 @@ class TransliterationTest extends DrupalUnitTestBase { // directly from the data files. array('fr', $three_byte, 'c'), array('fr', $four_byte, 'wii'), + // Test 5-byte characters. + array('en', $five_byte, '??', $five_byte_printable), // Test a language with no overrides. array('en', $two_byte, 'A O U A O aouaohello'), // Test language overrides provided by core. @@ -64,9 +73,10 @@ class TransliterationTest extends DrupalUnitTestBase { array('dk', $random, $random), array('kg', $three_byte, 'ts'), // Test the language override hook in the test module, which changes - // the transliteration of Ä to Z. + // the transliteration of Ä to Z and provides for the 5-byte characters. array('zz', $two_byte, 'Z O U A O aouaohello'), array('zz', $random, $random), + array('zz', $five_byte, 'ATh', $five_byte_printable), // Test strings in some other languages. // Turkish, provided by drupal.org user Kartagis. array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'), @@ -78,10 +88,11 @@ class TransliterationTest extends DrupalUnitTestBase { foreach($cases as $case) { list($langcode, $original, $expected) = $case; + $printable = (isset($case[3])) ? 
$case[3] : $original; $transliterator_class = new PHPTransliteration(); $actual = $transliterator_class->transliterate($original, $langcode); $this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array( - '@original' => $original, + '@original' => $printable, '@langcode' => $langcode, '@expected' => $expected, '@actual' => $actual, @@ -89,7 +100,7 @@ class TransliterationTest extends DrupalUnitTestBase { $actual = $transliterator_service->transliterate($original, $langcode); $this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array( - '@original' => $original, + '@original' => $printable, '@langcode' => $langcode, '@expected' => $expected, '@actual' => $actual, diff --git a/core/modules/system/tests/modules/transliterate_test/transliterate_test.module b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module index ee71069db368..636fde8a188d 100644 --- a/core/modules/system/tests/modules/transliterate_test/transliterate_test.module +++ b/core/modules/system/tests/modules/transliterate_test/transliterate_test.module @@ -12,5 +12,9 @@ function transliterate_test_transliteration_overrides_alter(&$overrides, $langco if ($langcode == 'zz') { // The default transliteration of Ä is A, but change it to Z for testing. $overrides[0xC4] = 'Z'; + // Also provide transliterations of two 5-byte characters from + // http://en.wikipedia.org/wiki/Gothic_alphabet. + $overrides[0x10330] = 'A'; + $overrides[0x10338] = 'Th'; } }