Issue #916086 by jhodgdon, mcarbone, pwolanin, Albert Volkman: Search_excerpt() doesn't highlight words that are matched via search_simplify().

8.0.x
Nathaniel Catchpole 2013-12-20 12:15:04 +00:00
parent e16798a856
commit f82a06840f
4 changed files with 216 additions and 157 deletions

View File

@ -19,7 +19,7 @@ class SearchExcerptTest extends WebTestBase {
*
* @var array
*/
public static $modules = array('search');
public static $modules = array('search', 'search_langcode_test');
public static function getInfo() {
return array(
@ -48,10 +48,19 @@ class SearchExcerptTest extends WebTestBase {
$this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Entire string is returned when keyword is not found in short string');
$result = preg_replace('| +|', ' ', search_excerpt('fox', $text));
$this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog ...', 'Found keyword is highlighted');
$this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog', 'Found keyword is highlighted');
$expected = '<strong>The</strong> quick brown fox &amp; jumps over <strong>the</strong> lazy dog';
$result = preg_replace('| +|', ' ', search_excerpt('The', $text));
$this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at beginning of short string');
$expected = 'The quick brown fox &amp; jumps over the lazy <strong>dog</strong>';
$result = preg_replace('| +|', ' ', search_excerpt('dog', $text));
$this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at end of short string');
$longtext = str_repeat($text . ' ', 10);
$result = preg_replace('| +|', ' ', search_excerpt('nothing', $longtext));
$expected = 'The quick brown fox &amp; jumps over the lazy dog';
$this->assertTrue(strpos($result, $expected) === 0, 'When keyword is not found in long string, return value starts as expected');
$entities = str_repeat('k&eacute;sz&iacute;t&eacute;se ', 20);
@ -101,10 +110,10 @@ class SearchExcerptTest extends WebTestBase {
// Test phrases with characters which are being truncated.
$result = preg_replace('| +|', ' ', search_excerpt('"ipsum _"', $text));
$this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');
$this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');
$result = preg_replace('| +|', ' ', search_excerpt('"ipsum 0000"', $text));
$this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');
$this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');
// Test combination of the valid keyword and keyword containing only
// characters which are being truncated during simplification.
@ -113,5 +122,43 @@ class SearchExcerptTest extends WebTestBase {
$result = preg_replace('| +|', ' ', search_excerpt('ipsum 0000', $text));
$this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid keyword is highlighted and invalid keyword "0000" is ignored.');
// Test using the hook_search_preprocess() from the test module.
// The hook replaces "finding" or "finds" with "find".
// So, if we search for "find" or "finds" or "finding", we should
// highlight "finding".
$text = "this tests finding a string";
$result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for finds');
$result = preg_replace('| +|', ' ', search_excerpt('find', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for find');
// Just to be sure, test with the replacement at the beginning and end.
$text = "finding at the beginning";
$result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at start');
$text = "at the end finding";
$result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at end');
// Testing with a one-to-many replacement: the test module replaces DIC
// with Dependency Injection Container.
$text = "something about the DIC is happening";
$result = preg_replace('| +|', ' ', search_excerpt('Dependency', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym first word');
$result = preg_replace('| +|', ' ', search_excerpt('Injection', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym second word');
$result = preg_replace('| +|', ' ', search_excerpt('Container', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym third word');
// Testing with a many-to-one replacement: the test module replaces
// hypertext markup language with HTML.
$text = "we always use hypertext markup language to describe things";
$result = preg_replace('| +|', ' ', search_excerpt('html', $text, 'ex'));
$this->assertTrue(strpos($result, '<strong>hypertext markup language</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym many to one');
}
}

View File

@ -53,11 +53,13 @@ class SearchPreprocessLangcodeTest extends SearchTestBase {
// function manually is needed to finish the indexing process.
search_update_totals();
// Search for the title of the node with a POST query.
$edit = array('or' => $node->label());
// Search for the additional text that is added by the preprocess
// function. If you search for text that is in the node, preprocess is
// not invoked on the node during the search excerpt generation.
$edit = array('or' => 'Additional text');
$this->drupalPostForm('search/node', $edit, t('Advanced search'));
// Checks if the langcode has been passed by hook_search_preprocess().
// Checks if the langcode message has been set by hook_search_preprocess().
$this->assertText('Langcode Preprocess Test: en');
}

View File

@ -621,24 +621,23 @@ function search_mark_for_reindex($type, $sid) {
*/
/**
* Returns snippets from a piece of text, with certain keywords highlighted.
* Returns snippets from a piece of text, with search keywords highlighted.
*
* Used for formatting search results.
*
* @param $keys
* @param string $keys
* A string containing a search query.
*
* @param $text
* @param string $text
* The text to extract fragments from.
*
* @return
* @return string
* A string containing HTML for the excerpt.
*/
function search_excerpt($keys, $text, $langcode = NULL) {
// We highlight around non-indexable or CJK characters.
$boundary = '(?:(?<=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';
// Extract positive keywords and phrases
// Extract positive keywords and phrases.
preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
$keys = array_merge($matches[2], $matches[3]);
@ -646,78 +645,77 @@ function search_excerpt($keys, $text, $langcode = NULL) {
$text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
$text = decode_entities($text);
// Slash-escape quotes in the search keyword string.
array_walk($keys, '_search_excerpt_replace');
$workkeys = $keys;
// Make a list of unique keywords that are actually found in the text,
// which could be items in $keys or replacements that are equivalent through
// search_simplify().
$temp_keys = array();
foreach ($keys as $key) {
$key = _search_find_match_with_simplify($key, $text, $boundary, $langcode);
if (isset($key)) {
// Quote slashes so they can be used in regular expressions.
$temp_keys[] = preg_quote($key, '/');
}
}
// Several keywords could have simplified down to the same thing, so pick
// out the unique ones.
$keys = array_unique($temp_keys);
// Extract fragments around keywords.
// First we collect ranges of text around each keyword, starting/ending
// at spaces, trying to get to 256 characters.
// If the sum of all fragments is too short, we look for second occurrences.
// Extract fragments of about 60 characters around keywords, bounded by word
// boundary characters. Try to reach 256 characters, using second occurrences
// if necessary.
$ranges = array();
$included = array();
$foundkeys = array();
$length = 0;
while ($length < 256 && count($workkeys)) {
foreach ($workkeys as $k => $key) {
if (strlen($key) == 0) {
unset($workkeys[$k]);
unset($keys[$k]);
continue;
}
$look_start = array();
$remaining_keys = $keys;
while ($length < 256 && !empty($remaining_keys)) {
$found_keys = array();
foreach ($remaining_keys as $key) {
if ($length >= 256) {
break;
}
// Remember occurrence of key so we can skip over it if more occurrences
// are desired.
if (!isset($included[$key])) {
$included[$key] = 0;
// Remember where we last found $key, in case we are coming through a
// second time.
if (!isset($look_start[$key])) {
$look_start[$key] = 0;
}
// Locate a keyword (position $p, always >0 because $text starts with a
// space). First try bare keyword, but if that doesn't work, try to find a
// derived form from search_simplify().
$p = 0;
if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
$p = $match[0][1];
}
else {
$info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary, $langcode);
if ($info['where']) {
$p = $info['where'];
if ($info['keyword']) {
$foundkeys[] = $info['keyword'];
// See if we can find $key after where we found it the last time. Since
// we are requiring a match on a word boundary, make sure $text starts
// and ends with a space.
$matches = array();
if (preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
$found_position = $matches[0][1];
$look_start[$key] = $found_position + 1;
// Keep track of which keys we found this time, in case we need to
// pass through again to find more text.
$found_keys[] = $key;
// Locate a space before and after this match, leaving about 60
// characters of context on each end.
$before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
if ($before !== FALSE && $before <= $found_position) {
$after = strrpos(' ' . $text . ' ', ' ', min($found_position + 61, strlen($text) + 1));
if ($after !== FALSE && $after > $found_position) {
// Account for the spaces we added.
$before = max($before - 1, 0);
$after = min($after - 1, strlen($text));
if ($before < $after) {
// Save this range.
$ranges[$before] = $after;
$length += $after - $before;
}
}
}
}
// Now locate a space in front (position $q) and behind it (position $s),
// leaving about 60 characters extra before and after for context.
// Note that a space was added to the front and end of $text above.
if ($p) {
if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) {
$end = substr($text . ' ', $p, 80);
if (($s = strrpos($end, ' ')) !== FALSE) {
// Account for the added spaces.
$q = max($q - 1, 0);
$s = min($s, strlen($end) - 1);
$ranges[$q] = $p + $s;
$length += $p + $s - $q;
$included[$key] = $p + 1;
}
else {
unset($workkeys[$k]);
}
}
else {
unset($workkeys[$k]);
}
}
else {
unset($workkeys[$k]);
}
}
// Next time through this loop, only look for keys we found this time,
// if any.
$remaining_keys = $found_keys;
}
if (count($ranges) == 0) {
if (empty($ranges)) {
// We didn't find any keyword matches, so just return the first part of the
// text. We also need to re-encode any HTML special characters that we
// entity-decoded above.
@ -727,43 +725,46 @@ function search_excerpt($keys, $text, $langcode = NULL) {
// Sort the text ranges by starting position.
ksort($ranges);
// Now we collapse overlapping text ranges into one. The sorting makes it O(n).
$newranges = array();
foreach ($ranges as $from2 => $to2) {
if (!isset($from1)) {
$from1 = $from2;
$to1 = $to2;
// Collapse overlapping text ranges into one. The sorting makes it O(n).
$new_ranges = array();
$max_end = 0;
foreach ($ranges as $this_from => $this_to) {
$max_end = max($max_end, $this_to);
if (!isset($working_from)) {
// This is the first time through this loop: initialize.
$working_from = $this_from;
$working_to = $this_to;
continue;
}
if ($from2 <= $to1) {
$to1 = max($to1, $to2);
if ($this_from <= $working_to) {
// The ranges overlap: combine them.
$working_to = max($working_to, $this_to);
}
else {
$newranges[$from1] = $to1;
$from1 = $from2;
$to1 = $to2;
// The ranges do not overlap: save the working range and start a new one.
$new_ranges[$working_from] = $working_to;
$working_from = $this_from;
$working_to = $this_to;
}
}
$newranges[$from1] = $to1;
// Save the remaining working range.
$new_ranges[$working_from] = $working_to;
// Fetch text
// Fetch text within the combined ranges we found.
$out = array();
foreach ($newranges as $from => $to) {
foreach ($new_ranges as $from => $to) {
$out[] = substr($text, $from, $to - $from);
}
// Let translators have the ... separator text as one chunk.
// Combine the text chunks with "..." separators. The "..." needs to be
// translated. Let translators have the ... separator text as one chunk.
$dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
$text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
$text = (isset($new_ranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . (($max_end < strlen($text) - 1) ? $dots[2] : '');
$text = check_plain($text);
// Slash-escape quotes in keys found in a derived form and merge with original keys.
array_walk($foundkeys, '_search_excerpt_replace');
$keys = array_merge($keys, $foundkeys);
// Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
$text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
// Highlight keywords. Must be done at once to prevent conflicts ('strong'
// and '<strong>').
$text = trim(preg_replace('/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
return $text;
}
@ -772,83 +773,81 @@ function search_excerpt($keys, $text, $langcode = NULL) {
*/
/**
 * Escapes a search keyword for use inside a "/"-delimited regular expression.
 *
 * Callback for array_walk() in search_excerpt().
 *
 * @param string $text
 *   The keyword to escape. It is passed by reference and overwritten with the
 *   result of preg_quote() using "/" as the delimiter.
 */
function _search_excerpt_replace(&$text) {
  $quoted = preg_quote($text, '/');
  $text = $quoted;
}
/**
* Finds words in the original text that matched via search_simplify().
*
* This is called in search_excerpt() if an exact match is not found in the
* text, so that we can find the derived form that matches.
* Finds an appropriate keyword in text.
*
* @param $key
* The keyword to find.
* @param $text
* The text to search for the keyword.
* @param $offset
* Offset position in $text to start searching at.
* @param $boundary
* Text to include in a regular expression that will match a word boundary.
* Regular expression for boundary characters between words.
* @param $langcode
* Language code.
*
* @return
* FALSE if no match is found. If a match is found, return an associative
* array with element 'where' giving the position of the match, and element
* 'keyword' giving the actual word found in the text at that position.
* A segment of $text that is between word boundary characters that either
* matches $key directly, or matches $key when both this text segment and
* $key are processed by search_simplify(). If a matching text segment is
* not located, NULL is returned.
*/
function search_simplify_excerpt_match($key, $text, $offset, $boundary, $langcode = NULL) {
$pos = NULL;
$simplified_key = search_simplify($key, $langcode);
$simplified_text = search_simplify($text, $langcode);
// Return immediately if simplified key or text are empty.
if (!$simplified_key || !$simplified_text) {
return FALSE;
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
// See if $key appears as-is. When testing, make sure $text starts/ends with
// a space, because we require $key to be surrounded by word boundary
// characters.
$temp = trim($key);
if ($temp == '') {
return NULL;
}
if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) {
return $key;
}
// Check if we have a match after simplification in the text.
if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) {
return FALSE;
// Run both text and key through search_simplify.
$simplified_key = trim(search_simplify($key, $langcode));
$simplified_text = trim(search_simplify($text, $langcode));
if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
// The simplified keyword and text do not match at all, or are empty.
return NULL;
}
// If we get here, we have a match. Now find the exact location of the match
// and the original text that matched. Start by splitting up the text by all
// potential starting points of the matching text and iterating through them.
$split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter');
foreach ($split as $value) {
// Skip starting points before the offset.
if ($value[1] < $offset) {
continue;
}
// Split $text into words, keeping track of where the word boundaries are.
$words = preg_split('/' . $boundary . '/iu', $text, NULL, PREG_SPLIT_OFFSET_CAPTURE);
// Add an entry pointing to the end of the string, for the loop below.
$words[] = array('', strlen($text));
$num_words = count($words);
// Check a window of 80 characters after the starting point for a match,
// based on the size of the excerpt window.
$window = substr($text, $value[1], 80);
$simplified_window = search_simplify($window);
if (strpos($simplified_window, $simplified_key) === 0) {
// We have a match in this window. Store the position of the match.
$pos = $value[1];
// Iterate through the text in the window until we find the full original
// matching text.
$length = strlen($window);
for ($i = 1; $i <= $length; $i++) {
$keyfound = substr($text, $value[1], $i);
if ($simplified_key == search_simplify($keyfound)) {
break;
}
// Find the smallest segment of complete words in $text that we can simplify
// to match $simplified_key.
$start_position = 0;
$word_end = 0;
for ($word_index = 0; $word_index < $num_words; $word_index++) {
// See if we can move the starting position out from our previously-saved
// best position to here and still have a match.
$trial_position = $words[$word_index][1];
if ($trial_position < strlen($text)) {
$candidate = substr($text, $trial_position);
$test_text = trim(search_simplify($candidate, $langcode));
if (strpos($test_text, $simplified_key) !== FALSE) {
$start_position = $trial_position;
$word_end = $trial_position + strlen($words[$word_index][0]);
continue;
}
break;
}
// See if we can end at our currently-saved word-ending position and still
// match, in which case this is the minimal matching string.
if ($word_end > $start_position) {
$candidate = substr($text, $start_position, $word_end - $start_position);
$test_text = trim(search_simplify($candidate, $langcode));
if (strpos($test_text, $simplified_key) !== FALSE) {
return $candidate;
}
}
// Save the end position of this word for the next time through this loop.
$word_end = $trial_position + strlen($words[$word_index][0]);
}
return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE;
}
/**
* Helper function for array_filter() in search_simplify_excerpt_match().
*/
function _search_excerpt_match_filter($var) {
return strlen(trim($var[0]));
// If we get here, we couldn't find a match.
return NULL;
}

View File

@ -15,14 +15,25 @@ function search_langcode_test_search_preprocess($text, $langcode = NULL) {
if ($text == 'we are testing') {
$text .= ' test tested';
}
// Prints the langcode for testPreprocessLangcode().
// Prints the langcode for testPreprocessLangcode() and adds some
// extra text.
else {
drupal_set_message('Langcode Preprocess Test: ' . $langcode);
$text .= 'Additional text';
}
}
// Prints the langcode for testPreprocessLangcode().
elseif (isset($langcode)) {
drupal_set_message('Langcode Preprocess Test: ' . $langcode);
// Preprocessing for the excerpt test.
if ($langcode == 'ex') {
$text = str_replace('finding', 'find', $text);
$text = str_replace('finds', 'find', $text);
$text = str_replace('dic', ' dependency injection container', $text);
$text = str_replace('hypertext markup language', 'html', $text);
}
}
return $text;
}