Issue #1858376 by jhodgdon: Provide tests and documentation for long Unicode characters.
parent
63792c76df
commit
fbec0a1e3e
|
@ -172,7 +172,8 @@ class PHPTransliteration implements TransliterationInterface {
|
||||||
* PHPTransliteration::$dataDirectory. These files should set up an array
|
* PHPTransliteration::$dataDirectory. These files should set up an array
|
||||||
* variable $overrides with an element whose key is $langcode and whose value
|
* variable $overrides with an element whose key is $langcode and whose value
|
||||||
* is an array whose keys are character codes, and whose values are their
|
* is an array whose keys are character codes, and whose values are their
|
||||||
* transliterations in this language.
|
* transliterations in this language. The character codes can be for any valid
|
||||||
|
* Unicode character, independent of the number of bytes.
|
||||||
*
|
*
|
||||||
* @param $langcode
|
* @param $langcode
|
||||||
* Code for the language to read.
|
* Code for the language to read.
|
||||||
|
@ -200,7 +201,8 @@ class PHPTransliteration implements TransliterationInterface {
|
||||||
* hexadecimal notation) in PHPTransliteration::$dataDirectory. These files
|
* hexadecimal notation) in PHPTransliteration::$dataDirectory. These files
|
||||||
* should set up a variable $bank containing an array whose numerical indices
|
* should set up a variable $bank containing an array whose numerical indices
|
||||||
* are the remaining two bytes of the character code, and whose values are the
|
* are the remaining two bytes of the character code, and whose values are the
|
||||||
* transliterations of these characters into US-ASCII.
|
* transliterations of these characters into US-ASCII. Note that the maximum
|
||||||
|
* Unicode character that can be encoded in this way is 4 bytes.
|
||||||
*
|
*
|
||||||
* @param $bank
|
* @param $bank
|
||||||
* First two bytes of the Unicode character, or 0 for the ASCII range.
|
* First two bytes of the Unicode character, or 0 for the ASCII range.
|
||||||
|
|
|
@ -13,6 +13,7 @@ use Drupal\Component\Transliteration\PHPTransliteration as BaseTransliteration;
|
||||||
* Enhances PHPTransliteration with an alter hook.
|
* Enhances PHPTransliteration with an alter hook.
|
||||||
*
|
*
|
||||||
* @ingroup transliteration
|
* @ingroup transliteration
|
||||||
|
* @see hook_transliteration_overrides_alter()
|
||||||
*/
|
*/
|
||||||
class PHPTransliteration extends BaseTransliteration {
|
class PHPTransliteration extends BaseTransliteration {
|
||||||
|
|
||||||
|
|
|
@ -177,11 +177,14 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
|
||||||
* vs. initial capital letter only) is not taken into account, and in
|
* vs. initial capital letter only) is not taken into account, and in
|
||||||
* transliterations of capital letters that result in two or more letters, by
|
* transliterations of capital letters that result in two or more letters, by
|
||||||
* convention only the first is capitalized in the Drupal transliteration
|
* convention only the first is capitalized in the Drupal transliteration
|
||||||
* result. So, the process has limitations; however, since the reason for
|
* result. Also, only Unicode characters of 4 bytes or less can be
|
||||||
* transliteration is typically to create machine names or file names, this
|
* transliterated in the base system; language-specific overrides can be made
|
||||||
* should not really be a problem. After transliteration, other transformation
|
* for longer Unicode characters. So, the process has limitations; however,
|
||||||
* or validation may be necessary, such as converting spaces to another
|
* since the reason for transliteration is typically to create machine names or
|
||||||
* character, removing non-printable characters, lower-casing, etc.
|
* file names, this should not really be a problem. After transliteration,
|
||||||
|
* other transformation or validation may be necessary, such as converting
|
||||||
|
* spaces to another character, removing non-printable characters,
|
||||||
|
* lower-casing, etc.
|
||||||
*
|
*
|
||||||
* Here is a code snippet to transliterate some text:
|
* Here is a code snippet to transliterate some text:
|
||||||
* @code
|
* @code
|
||||||
|
@ -196,13 +199,20 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
|
||||||
* Drupal Core provides the generic transliteration character tables and
|
* Drupal Core provides the generic transliteration character tables and
|
||||||
* overrides for a few common languages; modules can implement
|
* overrides for a few common languages; modules can implement
|
||||||
* hook_transliteration_overrides_alter() to provide further language-specific
|
* hook_transliteration_overrides_alter() to provide further language-specific
|
||||||
* overrides. Modules can also completely override the transliteration classes
|
* overrides (including providing transliteration for Unicode characters that
|
||||||
* in \Drupal\Core\CoreBundle.
|
* are longer than 4 bytes). Modules can also completely override the
|
||||||
|
* transliteration classes in \Drupal\Core\CoreBundle.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provide language-specific overrides for transliteration.
|
* Provide language-specific overrides for transliteration.
|
||||||
*
|
*
|
||||||
|
* If the overrides you want to provide are standard for your language, consider
|
||||||
|
* providing a patch for the Drupal Core transliteration system instead of using
|
||||||
|
* this hook. This hook can be used temporarily until Drupal Core's
|
||||||
|
* transliteration tables are fixed, or for sites that want to use a
|
||||||
|
* non-standard transliteration system.
|
||||||
|
*
|
||||||
* @param array $overrides
|
* @param array $overrides
|
||||||
* Associative array of language-specific overrides whose keys are integer
|
* Associative array of language-specific overrides whose keys are integer
|
||||||
* Unicode character codes, and whose values are the transliterations of those
|
* Unicode character codes, and whose values are the transliterations of those
|
||||||
|
|
|
@ -43,6 +43,13 @@ class TransliterationTest extends DrupalUnitTestBase {
|
||||||
// This is a Canadian Aboriginal character like a triangle. See
|
// This is a Canadian Aboriginal character like a triangle. See
|
||||||
// http://www.unicode.org/charts/PDF/U1400.pdf
|
// http://www.unicode.org/charts/PDF/U1400.pdf
|
||||||
$four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
|
$four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
|
||||||
|
// These are two Gothic alphabet letters. See
|
||||||
|
// http://en.wikipedia.org/wiki/Gothic_alphabet
|
||||||
|
// They are not in our tables, but should at least give us '?' (unknown).
|
||||||
|
$five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');
|
||||||
|
// Five-byte characters do not work in MySQL, so make a printable version.
|
||||||
|
$five_byte_printable = '&#x10330;&#x10338;';
|
||||||
|
|
||||||
$cases = array(
|
$cases = array(
|
||||||
// Each test case is (language code, input, output).
|
// Each test case is (language code, input, output).
|
||||||
// Test ASCII in English.
|
// Test ASCII in English.
|
||||||
|
@ -55,6 +62,8 @@ class TransliterationTest extends DrupalUnitTestBase {
|
||||||
// directly from the data files.
|
// directly from the data files.
|
||||||
array('fr', $three_byte, 'c'),
|
array('fr', $three_byte, 'c'),
|
||||||
array('fr', $four_byte, 'wii'),
|
array('fr', $four_byte, 'wii'),
|
||||||
|
// Test 5-byte characters.
|
||||||
|
array('en', $five_byte, '??', $five_byte_printable),
|
||||||
// Test a language with no overrides.
|
// Test a language with no overrides.
|
||||||
array('en', $two_byte, 'A O U A O aouaohello'),
|
array('en', $two_byte, 'A O U A O aouaohello'),
|
||||||
// Test language overrides provided by core.
|
// Test language overrides provided by core.
|
||||||
|
@ -64,9 +73,10 @@ class TransliterationTest extends DrupalUnitTestBase {
|
||||||
array('dk', $random, $random),
|
array('dk', $random, $random),
|
||||||
array('kg', $three_byte, 'ts'),
|
array('kg', $three_byte, 'ts'),
|
||||||
// Test the language override hook in the test module, which changes
|
// Test the language override hook in the test module, which changes
|
||||||
// the transliteration of Ä to Z.
|
// the transliteration of Ä to Z and provides for the 5-byte characters.
|
||||||
array('zz', $two_byte, 'Z O U A O aouaohello'),
|
array('zz', $two_byte, 'Z O U A O aouaohello'),
|
||||||
array('zz', $random, $random),
|
array('zz', $random, $random),
|
||||||
|
array('zz', $five_byte, 'ATh', $five_byte_printable),
|
||||||
// Test strings in some other languages.
|
// Test strings in some other languages.
|
||||||
// Turkish, provided by drupal.org user Kartagis.
|
// Turkish, provided by drupal.org user Kartagis.
|
||||||
array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'),
|
array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'),
|
||||||
|
@ -78,10 +88,11 @@ class TransliterationTest extends DrupalUnitTestBase {
|
||||||
|
|
||||||
foreach($cases as $case) {
|
foreach($cases as $case) {
|
||||||
list($langcode, $original, $expected) = $case;
|
list($langcode, $original, $expected) = $case;
|
||||||
|
$printable = (isset($case[3])) ? $case[3] : $original;
|
||||||
$transliterator_class = new PHPTransliteration();
|
$transliterator_class = new PHPTransliteration();
|
||||||
$actual = $transliterator_class->transliterate($original, $langcode);
|
$actual = $transliterator_class->transliterate($original, $langcode);
|
||||||
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array(
|
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array(
|
||||||
'@original' => $original,
|
'@original' => $printable,
|
||||||
'@langcode' => $langcode,
|
'@langcode' => $langcode,
|
||||||
'@expected' => $expected,
|
'@expected' => $expected,
|
||||||
'@actual' => $actual,
|
'@actual' => $actual,
|
||||||
|
@ -89,7 +100,7 @@ class TransliterationTest extends DrupalUnitTestBase {
|
||||||
|
|
||||||
$actual = $transliterator_service->transliterate($original, $langcode);
|
$actual = $transliterator_service->transliterate($original, $langcode);
|
||||||
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array(
|
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array(
|
||||||
'@original' => $original,
|
'@original' => $printable,
|
||||||
'@langcode' => $langcode,
|
'@langcode' => $langcode,
|
||||||
'@expected' => $expected,
|
'@expected' => $expected,
|
||||||
'@actual' => $actual,
|
'@actual' => $actual,
|
||||||
|
|
|
@ -12,5 +12,9 @@ function transliterate_test_transliteration_overrides_alter(&$overrides, $langco
|
||||||
if ($langcode == 'zz') {
|
if ($langcode == 'zz') {
|
||||||
// The default transliteration of Ä is A, but change it to Z for testing.
|
// The default transliteration of Ä is A, but change it to Z for testing.
|
||||||
$overrides[0xC4] = 'Z';
|
$overrides[0xC4] = 'Z';
|
||||||
|
// Also provide transliterations of two 5-byte characters from
|
||||||
|
// http://en.wikipedia.org/wiki/Gothic_alphabet.
|
||||||
|
$overrides[0x10330] = 'A';
|
||||||
|
$overrides[0x10338] = 'Th';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue