Issue #916086 by jhodgdon, mcarbone, pwolanin, Albert Volkman: Search_excerpt() doesn't highlight words that are matched via search_simplify().

2013-12-20 12:15:04 +00:00 · 2013-12-20 12:15:04 +00:00 · f82a06840f
parent e16798a856
commit f82a06840f
4 changed files with 216 additions and 157 deletions
--- a/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php
+++ b/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php
@ -19,7 +19,7 @@ class SearchExcerptTest extends WebTestBase {
   *
   * @var array
   */
-  public static $modules = array('search');
+  public static $modules = array('search', 'search_langcode_test');

  public static function getInfo() {
    return array(
@ -48,10 +48,19 @@ class SearchExcerptTest extends WebTestBase {
    $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Entire string is returned when keyword is not found in short string');

    $result = preg_replace('| +|', ' ', search_excerpt('fox', $text));
-    $this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog ...', 'Found keyword is highlighted');
+    $this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog', 'Found keyword is highlighted');
+
+    $expected = '<strong>The</strong> quick brown fox &amp; jumps over <strong>the</strong> lazy dog';
+    $result = preg_replace('| +|', ' ', search_excerpt('The', $text));
+    $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at beginning of short string');
+
+    $expected = 'The quick brown fox &amp; jumps over the lazy <strong>dog</strong>';
+    $result = preg_replace('| +|', ' ', search_excerpt('dog', $text));
+    $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at end of short string');

    $longtext = str_repeat($text . ' ', 10);
    $result = preg_replace('| +|', ' ', search_excerpt('nothing', $longtext));
+    $expected = 'The quick brown fox &amp; jumps over the lazy dog';
    $this->assertTrue(strpos($result, $expected) === 0, 'When keyword is not found in long string, return value starts as expected');

    $entities = str_repeat('k&eacute;sz&iacute;t&eacute;se ', 20);
@ -101,10 +110,10 @@ class SearchExcerptTest extends WebTestBase {

    // Test phrases with characters which are being truncated.
    $result = preg_replace('| +|', ' ', search_excerpt('"ipsum _"', $text));
-    $this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');
+    $this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');

    $result = preg_replace('| +|', ' ', search_excerpt('"ipsum 0000"', $text));
-    $this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');
+    $this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');

    // Test combination of the valid keyword and keyword containing only
    // characters which are being truncated during simplification.
@ -113,5 +122,43 @@ class SearchExcerptTest extends WebTestBase {

    $result = preg_replace('| +|', ' ', search_excerpt('ipsum 0000', $text));
    $this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid keyword is highlighted and invalid keyword "0000" is ignored.');
+
+    // Test using the hook_search_preprocess() from the test module.
+    // The hook replaces "finding" or "finds" with "find".
+    // So, if we search for "find" or "finds" or "finding", we should
+    // highlight "finding".
+    $text = "this tests finding a string";
+    $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for finds');
+    $result = preg_replace('| +|', ' ', search_excerpt('find', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for find');
+
+    // Just to be sure, test with the replacement at the beginning and end.
+    $text = "finding at the beginning";
+    $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at start');
+
+    $text = "at the end finding";
+    $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at end');
+
+    // Testing with a one-to-many replacement: the test module replaces DIC
+    // with Dependency Injection Container.
+    $text = "something about the DIC is happening";
+    $result = preg_replace('| +|', ' ', search_excerpt('Dependency', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym first word');
+
+    $result = preg_replace('| +|', ' ', search_excerpt('Injection', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym second word');
+
+    $result = preg_replace('| +|', ' ', search_excerpt('Container', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym third word');
+
+    // Testing with a many-to-one replacement: the test module replaces
+    // hypertext markup language with HTML.
+    $text = "we always use hypertext markup language to describe things";
+    $result = preg_replace('| +|', ' ', search_excerpt('html', $text, 'ex'));
+    $this->assertTrue(strpos($result, '<strong>hypertext markup language</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym many to one');
+
  }
 }
--- a/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php
+++ b/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php
@ -53,11 +53,13 @@ class SearchPreprocessLangcodeTest extends SearchTestBase {
    // function manually is needed to finish the indexing process.
    search_update_totals();

-    // Search for the title of the node with a POST query.
-    $edit = array('or' => $node->label());
+    // Search for the additional text that is added by the preprocess
+    // function. If you search for text that is in the node, preprocess is
+    // not invoked on the node during the search excerpt generation.
+    $edit = array('or' => 'Additional text');
    $this->drupalPostForm('search/node', $edit, t('Advanced search'));

-    // Checks if the langcode has been passed by hook_search_preprocess().
+    // Checks if the langcode message has been set by hook_search_preprocess().
    $this->assertText('Langcode Preprocess Test: en');
  }

--- a/core/modules/search/search.module
+++ b/core/modules/search/search.module
@ -621,24 +621,23 @@ function search_mark_for_reindex($type, $sid) {
 */

 /**
- * Returns snippets from a piece of text, with certain keywords highlighted.
+ * Returns snippets from a piece of text, with search keywords highlighted.
 *
 * Used for formatting search results.
 *
- * @param $keys
+ * @param string $keys
 *   A string containing a search query.
- *
- * @param $text
+ * @param string $text
 *   The text to extract fragments from.
 *
- * @return
+ * @return string
 *   A string containing HTML for the excerpt.
 */
 function search_excerpt($keys, $text, $langcode = NULL) {
  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';

-  // Extract positive keywords and phrases
+  // Extract positive keywords and phrases.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

@ -646,78 +645,77 @@ function search_excerpt($keys, $text, $langcode = NULL) {
  $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
  $text = decode_entities($text);

-  // Slash-escape quotes in the search keyword string.
-  array_walk($keys, '_search_excerpt_replace');
-  $workkeys = $keys;
+  // Make a list of unique keywords that are actually found in the text,
+  // which could be items in $keys or replacements that are equivalent through
+  // search_simplify().
+  $temp_keys = array();
+  foreach ($keys as $key) {
+    $key = _search_find_match_with_simplify($key, $text, $boundary, $langcode);
+    if (isset($key)) {
+      // Quote slashes so they can be used in regular expressions.
+      $temp_keys[] = preg_quote($key, '/');
+    }
+  }
+  // Several keywords could have simplified down to the same thing, so pick
+  // out the unique ones.
+  $keys = array_unique($temp_keys);

-  // Extract fragments around keywords.
-  // First we collect ranges of text around each keyword, starting/ending
-  // at spaces, trying to get to 256 characters.
-  // If the sum of all fragments is too short, we look for second occurrences.
+  // Extract fragments of about 60 characters around keywords, bounded by word
+  // boundary characters. Try to reach 256 characters, using second occurrences
+  // if necessary.
  $ranges = array();
-  $included = array();
-  $foundkeys = array();
  $length = 0;
-  while ($length < 256 && count($workkeys)) {
-    foreach ($workkeys as $k => $key) {
-      if (strlen($key) == 0) {
-        unset($workkeys[$k]);
-        unset($keys[$k]);
-        continue;
-      }
+  $look_start = array();
+  $remaining_keys = $keys;
+
+  while ($length < 256 && !empty($remaining_keys)) {
+    $found_keys = array();
+    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }
-      // Remember occurrence of key so we can skip over it if more occurrences
-      // are desired.
-      if (!isset($included[$key])) {
-        $included[$key] = 0;
+
+      // Remember where we last found $key, in case we are coming through a
+      // second time.
+      if (!isset($look_start[$key])) {
+        $look_start[$key] = 0;
      }
-      // Locate a keyword (position $p, always >0 because $text starts with a
-      // space). First try bare keyword, but if that doesn't work, try to find a
-      // derived form from search_simplify().
-      $p = 0;
-      if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
-        $p = $match[0][1];
-      }
-      else {
-        $info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary, $langcode);
-        if ($info['where']) {
-          $p = $info['where'];
-          if ($info['keyword']) {
-            $foundkeys[] = $info['keyword'];
+
+      // See if we can find $key after where we found it the last time. Since
+      // we are requiring a match on a word boundary, make sure $text starts
+      // and ends with a space.
+      $matches = array();
+      if (preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
+        $found_position = $matches[0][1];
+        $look_start[$key] = $found_position + 1;
+        // Keep track of which keys we found this time, in case we need to
+        // pass through again to find more text.
+        $found_keys[] = $key;
+
+        // Locate a space before and after this match, leaving about 60
+        // characters of context on each end.
+        $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
+        if ($before !== FALSE && $before <= $found_position) {
+          $after = strrpos(' ' . $text . ' ', ' ', min($found_position + 61, strlen($text) + 1));
+          if ($after !== FALSE && $after > $found_position) {
+            // Account for the spaces we added.
+            $before = max($before - 1, 0);
+            $after = min($after - 1, strlen($text));
+            if ($before < $after) {
+              // Save this range.
+              $ranges[$before] = $after;
+              $length += $after - $before;
+            }
          }
        }
      }
-      // Now locate a space in front (position $q) and behind it (position $s),
-      // leaving about 60 characters extra before and after for context.
-      // Note that a space was added to the front and end of $text above.
-      if ($p) {
-        if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) {
-          $end = substr($text . ' ', $p, 80);
-          if (($s = strrpos($end, ' ')) !== FALSE) {
-            // Account for the added spaces.
-            $q = max($q - 1, 0);
-            $s = min($s, strlen($end) - 1);
-            $ranges[$q] = $p + $s;
-            $length += $p + $s - $q;
-            $included[$key] = $p + 1;
-          }
-          else {
-            unset($workkeys[$k]);
-          }
-        }
-        else {
-          unset($workkeys[$k]);
-        }
-      }
-      else {
-        unset($workkeys[$k]);
-      }
    }
+    // Next time through this loop, only look for keys we found this time,
+    // if any.
+    $remaining_keys = $found_keys;
  }

-  if (count($ranges) == 0) {
+  if (empty($ranges)) {
    // We didn't find any keyword matches, so just return the first part of the
    // text. We also need to re-encode any HTML special characters that we
    // entity-decoded above.
@ -727,43 +725,46 @@ function search_excerpt($keys, $text, $langcode = NULL) {
  // Sort the text ranges by starting position.
  ksort($ranges);

-  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
-  $newranges = array();
-  foreach ($ranges as $from2 => $to2) {
-    if (!isset($from1)) {
-      $from1 = $from2;
-      $to1 = $to2;
+  // Collapse overlapping text ranges into one. The sorting makes it O(n).
+  $new_ranges = array();
+  $max_end = 0;
+  foreach ($ranges as $this_from => $this_to) {
+    $max_end = max($max_end, $this_to);
+    if (!isset($working_from)) {
+      // This is the first time through this loop: initialize.
+      $working_from = $this_from;
+      $working_to = $this_to;
      continue;
    }
-    if ($from2 <= $to1) {
-      $to1 = max($to1, $to2);
+    if ($this_from <= $working_to) {
+      // The ranges overlap: combine them.
+      $working_to = max($working_to, $this_to);
    }
    else {
-      $newranges[$from1] = $to1;
-      $from1 = $from2;
-      $to1 = $to2;
+      // The ranges do not overlap: save the working range and start a new one.
+      $new_ranges[$working_from] = $working_to;
+      $working_from = $this_from;
+      $working_to = $this_to;
    }
  }
-  $newranges[$from1] = $to1;
+  // Save the remaining working range.
+  $new_ranges[$working_from] = $working_to;

-  // Fetch text
+  // Fetch text within the combined ranges we found.
  $out = array();
-  foreach ($newranges as $from => $to) {
+  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

-  // Let translators have the ... separator text as one chunk.
+  // Combine the text chunks with "..." separators. The "..." needs to be
+  // translated. Let translators have the ... separator text as one chunk.
  $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
-
-  $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
+  $text = (isset($new_ranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . (($max_end < strlen($text) - 1) ? $dots[2] : '');
  $text = check_plain($text);

-  // Slash-escape quotes in keys found in a derived form and merge with original keys.
-  array_walk($foundkeys, '_search_excerpt_replace');
-  $keys = array_merge($keys, $foundkeys);
-
-  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
-  $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
+  // Highlight keywords. Must be done at once to prevent conflicts ('strong'
+  // and '<strong>').
+  $text = trim(preg_replace('/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
  return $text;
 }

@ -772,83 +773,81 @@ function search_excerpt($keys, $text, $langcode = NULL) {
 */

 /**
- * Helper function for array_walk() in search_excerpt().
- */
-function _search_excerpt_replace(&$text) {
-  $text = preg_quote($text, '/');
-}
-
-/**
- * Finds words in the original text that matched via search_simplify().
- *
- * This is called in search_excerpt() if an exact match is not found in the
- * text, so that we can find the derived form that matches.
+ * Finds an appropriate keyword in text.
 *
 * @param $key
 *   The keyword to find.
 * @param $text
 *   The text to search for the keyword.
- * @param $offset
- *   Offset position in $text to start searching at.
 * @param $boundary
- *   Text to include in a regular expression that will match a word boundary.
+ *   Regular expression for boundary characters between words.
+ * @param $langcode
+ *   Language code.
 *
 * @return
- *   FALSE if no match is found. If a match is found, return an associative
- *   array with element 'where' giving the position of the match, and element
- *   'keyword' giving the actual word found in the text at that position.
+ *   A segment of $text that is between word boundary characters that either
+ *   matches $key directly, or matches $key when both this text segment and
+ *   $key are processed by search_simplify(). If a matching text segment is
+ *   not located, NULL is returned.
 */
-function search_simplify_excerpt_match($key, $text, $offset, $boundary, $langcode = NULL) {
-  $pos = NULL;
-  $simplified_key = search_simplify($key, $langcode);
-  $simplified_text = search_simplify($text, $langcode);
-
-  // Return immediately if simplified key or text are empty.
-  if (!$simplified_key || !$simplified_text) {
-    return FALSE;
+function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
+  // See if $key appears as-is. When testing, make sure $text starts/ends with
+  // a space, because we require $key to be surrounded by word boundary
+  // characters.
+  $temp = trim($key);
+  if ($temp == '') {
+    return NULL;
+  }
+  if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) {
+    return $key;
  }

-  // Check if we have a match after simplification in the text.
-  if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) {
-    return FALSE;
+  // Run both text and key through search_simplify.
+  $simplified_key = trim(search_simplify($key, $langcode));
+  $simplified_text = trim(search_simplify($text, $langcode));
+  if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
+    // The simplfied keyword and text do not match at all, or are empty.
+    return NULL;
  }

-  // If we get here, we have a match. Now find the exact location of the match
-  // and the original text that matched. Start by splitting up the text by all
-  // potential starting points of the matching text and iterating through them.
-  $split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter');
-  foreach ($split as $value) {
-    // Skip starting points before the offset.
-    if ($value[1] < $offset) {
-      continue;
-    }
+  // Split $text into words, keeping track of where the word boundaries are.
+  $words = preg_split('/' . $boundary . '/iu', $text, NULL, PREG_SPLIT_OFFSET_CAPTURE);
+  // Add an entry pointing to the end of the string, for the loop below.
+  $words[] = array('', strlen($text));
+  $num_words = count($words);

-    // Check a window of 80 characters after the starting point for a match,
-    // based on the size of the excerpt window.
-    $window = substr($text, $value[1], 80);
-    $simplified_window = search_simplify($window);
-    if (strpos($simplified_window, $simplified_key) === 0) {
-      // We have a match in this window. Store the position of the match.
-      $pos = $value[1];
-      // Iterate through the text in the window until we find the full original
-      // matching text.
-      $length = strlen($window);
-      for ($i = 1; $i <= $length; $i++) {
-        $keyfound = substr($text, $value[1], $i);
-        if ($simplified_key == search_simplify($keyfound)) {
-          break;
-        }
+  // Find the smallest segment of complete words in $text that we can simplify
+  // to match $simplified_key.
+  $start_position = 0;
+  $word_end = 0;
+  for ($word_index = 0; $word_index < $num_words; $word_index++) {
+    // See if we can move the starting position out from our previously-saved
+    // best position to here and still have a match.
+    $trial_position = $words[$word_index][1];
+    if ($trial_position < strlen($text)) {
+      $candidate = substr($text, $trial_position);
+      $test_text = trim(search_simplify($candidate, $langcode));
+      if (strpos($test_text, $simplified_key) !== FALSE) {
+        $start_position = $trial_position;
+        $word_end = $trial_position + strlen($words[$word_index][0]);
+        continue;
      }
-      break;
    }
+
+    // See if we can end at our currently-saved word-ending position and still
+    // match, in which case this is the minimal matching string.
+    if ($word_end > $start_position) {
+      $candidate = substr($text, $start_position, $word_end - $start_position);
+      $test_text = trim(search_simplify($candidate, $langcode));
+      if (strpos($test_text, $simplified_key) !== FALSE) {
+        return $candidate;
+      }
+    }
+
+    // Save the end position of this word for the next time through this loop.
+    $word_end = $trial_position + strlen($words[$word_index][0]);
  }

-  return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE;
-}
-
-/**
- * Helper function for array_filter() in search_search_excerpt_match().
- */
-function _search_excerpt_match_filter($var) {
-  return strlen(trim($var[0]));
+  // If we get here, we couldn't find a match.
+  return NULL;
 }
--- a/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module
+++ b/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module
@ -15,14 +15,25 @@ function search_langcode_test_search_preprocess($text, $langcode = NULL) {
    if ($text == 'we are testing') {
      $text .= ' test tested';
    }
-    // Prints the langcode for testPreprocessLangcode().
+    // Prints the langcode for testPreprocessLangcode() and adds some
+    // extra text.
    else {
      drupal_set_message('Langcode Preprocess Test: ' . $langcode);
+      $text .= 'Additional text';
    }
  }
  // Prints the langcode for testPreprocessLangcode().
  elseif (isset($langcode)) {
    drupal_set_message('Langcode Preprocess Test: ' . $langcode);
+
+    // Preprocessing for the excerpt test.
+    if ($langcode == 'ex') {
+      $text = str_replace('finding', 'find', $text);
+      $text = str_replace('finds', 'find', $text);
+      $text = str_replace('dic', ' dependency injection container', $text);
+      $text = str_replace('hypertext markup language', 'html', $text);
+    }
  }
+
  return $text;
 }