Issue #1858376 by jhodgdon: Provide tests and documentation for long Unicode characters.
parent
63792c76df
commit
fbec0a1e3e
|
@ -172,7 +172,8 @@ class PHPTransliteration implements TransliterationInterface {
|
|||
* PHPTransliteration::$dataDirectory. These files should set up an array
|
||||
* variable $overrides with an element whose key is $langcode and whose value
|
||||
* is an array whose keys are character codes, and whose values are their
|
||||
* transliterations in this language.
|
||||
* transliterations in this language. The character codes can be for any valid
|
||||
* Unicode character, independent of the number of bytes.
|
||||
*
|
||||
* @param $langcode
|
||||
* Code for the language to read.
|
||||
|
@ -200,7 +201,8 @@ class PHPTransliteration implements TransliterationInterface {
|
|||
* hexadecimal notation) in PHPTransliteration::$dataDirectory. These files
|
||||
* should set up a variable $bank containing an array whose numerical indices
|
||||
* are the remaining two bytes of the character code, and whose values are the
|
||||
* transliterations of these characters into US-ASCII.
|
||||
* transliterations of these characters into US-ASCII. Note that the maximum
|
||||
* Unicode character that can be encoded in this way is 4 bytes.
|
||||
*
|
||||
* @param $bank
|
||||
* First two bytes of the Unicode character, or 0 for the ASCII range.
|
||||
|
|
|
@ -13,6 +13,7 @@ use Drupal\Component\Transliteration\PHPTransliteration as BaseTransliteration;
|
|||
* Enhances PHPTransliteration with an alter hook.
|
||||
*
|
||||
* @ingroup transliteration
|
||||
* @see hook_transliteration_overrides_alter()
|
||||
*/
|
||||
class PHPTransliteration extends BaseTransliteration {
|
||||
|
||||
|
|
|
@ -177,11 +177,14 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
|
|||
* vs. initial capital letter only) is not taken into account, and in
|
||||
* transliterations of capital letters that result in two or more letters, by
|
||||
* convention only the first is capitalized in the Drupal transliteration
|
||||
* result. So, the process has limitations; however, since the reason for
|
||||
* transliteration is typically to create machine names or file names, this
|
||||
* should not really be a problem. After transliteration, other transformation
|
||||
* or validation may be necessary, such as converting spaces to another
|
||||
* character, removing non-printable characters, lower-casing, etc.
|
||||
* result. Also, only Unicode characters of 4 bytes or less can be
|
||||
* transliterated in the base system; language-specific overrides can be made
|
||||
* for longer Unicode characters. So, the process has limitations; however,
|
||||
* since the reason for transliteration is typically to create machine names or
|
||||
* file names, this should not really be a problem. After transliteration,
|
||||
* other transformation or validation may be necessary, such as converting
|
||||
* spaces to another character, removing non-printable characters,
|
||||
* lower-casing, etc.
|
||||
*
|
||||
* Here is a code snippet to transliterate some text:
|
||||
* @code
|
||||
|
@ -196,13 +199,20 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
|
|||
* Drupal Core provides the generic transliteration character tables and
|
||||
* overrides for a few common languages; modules can implement
|
||||
* hook_transliteration_overrides_alter() to provide further language-specific
|
||||
* overrides. Modules can also completely override the transliteration classes
|
||||
* in \Drupal\Core\CoreBundle.
|
||||
* overrides (including providing transliteration for Unicode characters that
|
||||
* are longer than 4 bytes). Modules can also completely override the
|
||||
* transliteration classes in \Drupal\Core\CoreBundle.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Provide language-specific overrides for transliteration.
|
||||
*
|
||||
* If the overrides you want to provide are standard for your language, consider
|
||||
* providing a patch for the Drupal Core transliteration system instead of using
|
||||
* this hook. This hook can be used temporarily until Drupal Core's
|
||||
* transliteration tables are fixed, or for sites that want to use a
|
||||
* non-standard transliteration system.
|
||||
*
|
||||
* @param array $overrides
|
||||
* Associative array of language-specific overrides whose keys are integer
|
||||
* Unicode character codes, and whose values are the transliterations of those
|
||||
|
|
|
@ -43,6 +43,13 @@ class TransliterationTest extends DrupalUnitTestBase {
|
|||
// This is a Canadian Aboriginal character like a triangle. See
|
||||
// http://www.unicode.org/charts/PDF/U1400.pdf
|
||||
$four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8');
|
||||
// These are two Gothic alphabet letters. See
|
||||
// http://en.wikipedia.org/wiki/Gothic_alphabet
|
||||
// They are not in our tables, but should at least give us '?' (unknown).
|
||||
$five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');
|
||||
// Five-byte characters do not work in MySQL, so make a printable version.
|
||||
$five_byte_printable = '&#x10330;&#x10338;';
|
||||
|
||||
$cases = array(
|
||||
// Each test case is (language code, input, output).
|
||||
// Test ASCII in English.
|
||||
|
@ -55,6 +62,8 @@ class TransliterationTest extends DrupalUnitTestBase {
|
|||
// directly from the data files.
|
||||
array('fr', $three_byte, 'c'),
|
||||
array('fr', $four_byte, 'wii'),
|
||||
// Test 5-byte characters.
|
||||
array('en', $five_byte, '??', $five_byte_printable),
|
||||
// Test a language with no overrides.
|
||||
array('en', $two_byte, 'A O U A O aouaohello'),
|
||||
// Test language overrides provided by core.
|
||||
|
@ -64,9 +73,10 @@ class TransliterationTest extends DrupalUnitTestBase {
|
|||
array('dk', $random, $random),
|
||||
array('kg', $three_byte, 'ts'),
|
||||
// Test the language override hook in the test module, which changes
|
||||
// the transliteration of Ä to Z.
|
||||
// the transliteration of Ä to Z and provides for the 5-byte characters.
|
||||
array('zz', $two_byte, 'Z O U A O aouaohello'),
|
||||
array('zz', $random, $random),
|
||||
array('zz', $five_byte, 'ATh', $five_byte_printable),
|
||||
// Test strings in some other languages.
|
||||
// Turkish, provided by drupal.org user Kartagis.
|
||||
array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'),
|
||||
|
@ -78,10 +88,11 @@ class TransliterationTest extends DrupalUnitTestBase {
|
|||
|
||||
foreach($cases as $case) {
|
||||
list($langcode, $original, $expected) = $case;
|
||||
$printable = (isset($case[3])) ? $case[3] : $original;
|
||||
$transliterator_class = new PHPTransliteration();
|
||||
$actual = $transliterator_class->transliterate($original, $langcode);
|
||||
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array(
|
||||
'@original' => $original,
|
||||
'@original' => $printable,
|
||||
'@langcode' => $langcode,
|
||||
'@expected' => $expected,
|
||||
'@actual' => $actual,
|
||||
|
@ -89,7 +100,7 @@ class TransliterationTest extends DrupalUnitTestBase {
|
|||
|
||||
$actual = $transliterator_service->transliterate($original, $langcode);
|
||||
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array(
|
||||
'@original' => $original,
|
||||
'@original' => $printable,
|
||||
'@langcode' => $langcode,
|
||||
'@expected' => $expected,
|
||||
'@actual' => $actual,
|
||||
|
|
|
@ -12,5 +12,9 @@ function transliterate_test_transliteration_overrides_alter(&$overrides, $langco
|
|||
if ($langcode == 'zz') {
|
||||
// The default transliteration of Ä is A, but change it to Z for testing.
|
||||
$overrides[0xC4] = 'Z';
|
||||
// Also provide transliterations of two 5-byte characters from
|
||||
// http://en.wikipedia.org/wiki/Gothic_alphabet.
|
||||
$overrides[0x10330] = 'A';
|
||||
$overrides[0x10338] = 'Th';
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue