- Patch #326016 by jhodgdon: PREG_CLASS_CJK doesn't include all CJK characters.
parent
daede057fd
commit
2ece32e674
|
@ -103,12 +103,25 @@ define('PREG_CLASS_PUNCTUATION',
|
||||||
'\x{ff65}');
|
'\x{ff65}');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Matches all CJK characters that are candidates for auto-splitting
|
* Matches CJK (Chinese, Japanese, Korean) letter-like characters.
|
||||||
* (Chinese, Japanese, Korean).
|
*
|
||||||
* Contains kana and BMP ideographs.
|
* This list is derived from the "East Asian Scripts" section of
|
||||||
|
* http://www.unicode.org/charts/index.html, as well as a comment on
|
||||||
|
* http://unicode.org/reports/tr11/tr11-11.html listing some character
|
||||||
|
* ranges that are reserved for additional CJK ideographs.
|
||||||
|
*
|
||||||
|
* The character ranges do not include numbers, punctuation, or symbols, since
|
||||||
|
* these are handled separately in search. Note that radicals and strokes are
|
||||||
|
* considered symbols. (See
|
||||||
|
* http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
|
||||||
|
*
|
||||||
|
* @see search_expand_cjk()
|
||||||
*/
|
*/
|
||||||
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}' .
|
define('PREG_CLASS_CJK', '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
|
||||||
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
|
'\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
|
||||||
|
'\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
|
||||||
|
'\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
|
||||||
|
'\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implements hook_help().
|
* Implements hook_help().
|
||||||
|
@ -447,28 +460,45 @@ function search_simplify($text) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
|
* Splits CJK (Chinese, Japanese, Korean) text into tokens.
|
||||||
* sequences of characters ('minimum_word_size' long).
|
*
|
||||||
|
* The Search module matches exact words, where a word is defined to be a
|
||||||
|
* sequence of characters delimited by spaces or punctuation. CJK languages are
|
||||||
|
* written in long strings of characters, though, not split up into words. So
|
||||||
|
* in order to allow search matching, we split up CJK text into tokens
|
||||||
|
* consisting of consecutive, overlapping sequences of characters whose length
|
||||||
|
* is equal to the 'minimum_word_size' variable. This tokenizing is only done if
|
||||||
|
* the 'overlap_cjk' variable is TRUE.
|
||||||
|
*
|
||||||
|
* @param $matches
|
||||||
|
* This function is a callback for preg_replace_callback(), which is called
|
||||||
|
* from search_simplify(). So, $matches is an array of regular expression
|
||||||
|
* matches, which means that $matches[0] contains the matched text -- a string
|
||||||
|
* of CJK characters to tokenize.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* Tokenized text, starting and ending with a space character.
|
||||||
*/
|
*/
|
||||||
function search_expand_cjk($matches) {
|
function search_expand_cjk($matches) {
|
||||||
$min = variable_get('minimum_word_size', 3);
|
$min = variable_get('minimum_word_size', 3);
|
||||||
$str = $matches[0];
|
$str = $matches[0];
|
||||||
$l = drupal_strlen($str);
|
$length = drupal_strlen($str);
|
||||||
// Passthrough short words
|
// If the text is shorter than the minimum word size, don't tokenize it.
|
||||||
if ($l <= $min) {
|
if ($length <= $min) {
|
||||||
return ' ' . $str . ' ';
|
return ' ' . $str . ' ';
|
||||||
}
|
}
|
||||||
$tokens = ' ';
|
$tokens = ' ';
|
||||||
// FIFO queue of characters
|
// Build a FIFO queue of characters.
|
||||||
$chars = array();
|
$chars = array();
|
||||||
// Begin loop
|
for ($i = 0; $i < $length; $i++) {
|
||||||
for ($i = 0; $i < $l; ++$i) {
|
// Add the next character off the beginning of the string to the queue.
|
||||||
// Grab next character
|
|
||||||
$current = drupal_substr($str, 0, 1);
|
$current = drupal_substr($str, 0, 1);
|
||||||
$str = substr($str, strlen($current));
|
$str = substr($str, strlen($current));
|
||||||
$chars[] = $current;
|
$chars[] = $current;
|
||||||
if ($i >= $min - 1) {
|
if ($i >= $min - 1) {
|
||||||
|
// Make a token of $min characters, and add it to the token string.
|
||||||
$tokens .= implode('', $chars) . ' ';
|
$tokens .= implode('', $chars) . ' ';
|
||||||
|
// Shift out the first character in the queue.
|
||||||
array_shift($chars);
|
array_shift($chars);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -872,3 +872,153 @@ class SearchConfigSettingsForm extends DrupalWebTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tests the CJK tokenizer.
 */
class SearchTokenizerTestCase extends DrupalWebTestCase {
  public static function getInfo() {
    return array(
      'name' => 'CJK tokenizer',
      'description' => 'Check that CJK tokenizer works as intended.',
      'group' => 'Search',
    );
  }

  function setUp() {
    parent::setUp('search');
  }

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  function testTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters) and make
    // sure CJK tokenizing is turned on.
    variable_set('minimum_word_size', 1);
    variable_set('overlap_cjk', TRUE);
    $this->refreshVariables();

    // Create a string of CJK characters from various character ranges in
    // the Unicode tables.

    // Beginnings of the character ranges. The keys are only labels used to
    // pair each start with its end below ('Bomofo' labels the Bopomofo
    // blocks).
    $starts = array(
      'CJK unified' => 0x4e00,
      'CJK Ext A' => 0x3400,
      'CJK Compat' => 0xf900,
      'Hangul Jamo' => 0x1100,
      'Hangul Ext A' => 0xa960,
      'Hangul Ext B' => 0xd7b0,
      'Hangul Compat' => 0x3131,
      'Half non-punct 1' => 0xff21,
      'Half non-punct 2' => 0xff41,
      'Half non-punct 3' => 0xff66,
      'Hangul Syllables' => 0xac00,
      'Hiragana' => 0x3040,
      'Katakana' => 0x30a1,
      'Katakana Ext' => 0x31f0,
      'CJK Reserve 1' => 0x20000,
      'CJK Reserve 2' => 0x30000,
      'Bomofo' => 0x3100,
      'Bomofo Ext' => 0x31a0,
      'Lisu' => 0xa4d0,
      'Yi' => 0xa000,
    );

    // Ends of the character ranges, keyed identically to $starts.
    $ends = array(
      'CJK unified' => 0x9fcf,
      'CJK Ext A' => 0x4dbf,
      'CJK Compat' => 0xfaff,
      'Hangul Jamo' => 0x11ff,
      'Hangul Ext A' => 0xa97f,
      'Hangul Ext B' => 0xd7ff,
      'Hangul Compat' => 0x318e,
      'Half non-punct 1' => 0xff3a,
      'Half non-punct 2' => 0xff5a,
      'Half non-punct 3' => 0xffdc,
      'Hangul Syllables' => 0xd7af,
      'Hiragana' => 0x309f,
      'Katakana' => 0x30ff,
      'Katakana Ext' => 0x31ff,
      'CJK Reserve 1' => 0x2fffd,
      'CJK Reserve 2' => 0x3fffd,
      'Bomofo' => 0x312f,
      'Bomofo Ext' => 0x31b7,
      'Lisu' => 0xa4fd,
      'Yi' => 0xa48f,
    );

    // Generate characters consisting of starts, midpoints, and ends.
    $chars = array();
    foreach ($starts as $key => $start) {
      $end = $ends[$key];
      // round() returns a float; cast it so that code2utf() performs its
      // bitwise arithmetic on a real integer.
      $mid = (int) round(0.5 * ($start + $end));

      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Merge into a string and tokenize.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    // With a minimum word size of 1, every character should come back as its
    // own space-separated token. The expectation is lower-cased to match the
    // output (presumably search_simplify() lower-cases its input, which
    // affects the full-width Latin letters in the sample).
    $expected = drupal_strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  function testNoTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters) and make
    // sure CJK tokenizing is turned on.
    variable_set('minimum_word_size', 1);
    variable_set('overlap_cjk', TRUE);
    $this->refreshVariables();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // Actual value first, expected value second, as in testTokenizer().
    $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
  }

  /**
   * Like PHP chr() function, but for Unicode characters.
   *
   * chr() only works for ASCII characters up to character 255. This function
   * converts a number to the corresponding Unicode character, encoded as
   * UTF-8. Adapted from functions supplied in comments on several functions
   * on php.net.
   *
   * @param $num
   *   The Unicode code point to encode, as an integer.
   *
   * @return
   *   The UTF-8 encoded character (one to four bytes), or an empty string if
   *   $num is not an encodable code point: negative, within the UTF-16
   *   surrogate range (U+D800 to U+DFFF), or above U+10FFFF.
   */
  function code2utf($num) {
    // Refuse values that have no well-formed UTF-8 representation rather than
    // emitting malformed byte sequences.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  }
}
|
||||||
|
|
Loading…
Reference in New Issue