' . t('About') . ''; $output .= '
' . t('The Search module provides the ability to index and search for content by exact keywords, and for users by username or e-mail. For more information, see the online handbook entry for Search module.', array('@search-module' => 'http://drupal.org/documentation/modules/search', '@search' => url('search'))) . '
'; $output .= '' . t('The search engine maintains an index of words found in your site\'s content. To build and maintain this index, a correctly configured cron maintenance task is required. Indexing behavior can be adjusted using the settings below.', array('@cron' => url('admin/reports/status'))) . '
'; } }

/**
 * Implements hook_theme().
 *
 * Registers the 'search_result' theme hook, rendered by the search-result
 * template and preprocessed from search.pages.inc.
 */
function search_theme() {
  $hooks = array();
  $hooks['search_result'] = array(
    'variables' => array('result' => NULL, 'plugin_id' => NULL),
    'file' => 'search.pages.inc',
    'template' => 'search-result',
  );
  return $hooks;
}

/**
 * Implements hook_permission().
 *
 * Declares the three permissions provided by the Search module.
 */
function search_permission() {
  $permissions = array();
  $permissions['administer search'] = array(
    'title' => t('Administer search'),
  );
  $permissions['search content'] = array(
    'title' => t('Use search'),
  );
  $permissions['use advanced search'] = array(
    'title' => t('Use advanced search'),
  );
  return $permissions;
}

/**
 * Implements hook_preprocess_HOOK() for block templates.
 *
 * Only the search form block is touched: it receives the ARIA 'search' role
 * and its content is laid out inline.
 */
function search_preprocess_block(&$variables) {
  // Leave every other block untouched.
  if ($variables['plugin_id'] != 'search_form_block') {
    return;
  }
  $variables['attributes']['role'] = 'search';
  $variables['content_attributes']['class'][] = 'container-inline';
}

/**
 * Clears either a part of, or the entire search index.
 *
 * @param $sid
 *   (optional) The ID of the item to remove from the search index. If
 *   specified, $type must also be given. Omit both $sid and $type to clear
 *   the entire search index.
 * @param $type
 *   (optional) The plugin ID or other machine-readable type for the item to
 *   remove from the search index.
 * @param $langcode
 *   (optional) Language code for the operation. If not provided, all
 *   index records for the $sid and $type will be deleted.
*/
function search_reindex($sid = NULL, $type = NULL, $langcode = NULL) {
  if ($type == NULL && $sid == NULL) {
    // No specific item requested: let every indexable search page's plugin
    // reset its own index.
    /** @var $repository \Drupal\search\SearchPageRepositoryInterface */
    $repository = \Drupal::service('search.search_page_repository');
    foreach ($repository->getIndexableSearchPages() as $search_page) {
      $search_page->getPlugin()->resetIndex();
    }
    return;
  }

  // Remove the item's rows from the dataset table first, then from the word
  // index, optionally restricted to a single language.
  foreach (array('search_dataset', 'search_index') as $table) {
    $delete = db_delete($table)
      ->condition('sid', $sid)
      ->condition('type', $type);
    if (!empty($langcode)) {
      $delete->condition('langcode', $langcode);
    }
    $delete->execute();
  }
}

/**
 * Marks a word as "dirty" (changed), or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words that are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. Omit it (or pass NULL) to retrieve
 *   the list instead.
 *
 * @return
 *   When called without a word: an array whose keys are the dirty words.
 *   When called with a word: nothing.
 */
function search_dirty($word = NULL) {
  $dirty_words = &drupal_static(__FUNCTION__, array());
  if ($word === NULL) {
    return $dirty_words;
  }
  $dirty_words[$word] = TRUE;
}

/**
 * Implements hook_cron().
 *
 * Fires updateIndex() in the plugins for all indexable active search pages,
 * and cleans up dirty words.
 *
 * @see search_dirty()
 */
function search_cron() {
  // Registering the totals update as a shutdown function ensures that
  // search_total is always brought up to date.
  drupal_register_shutdown_function('search_update_totals');

  /** @var $repository \Drupal\search\SearchPageRepositoryInterface */
  $repository = \Drupal::service('search.search_page_repository');
  foreach ($repository->getIndexableSearchPages() as $search_page) {
    $plugin = $search_page->getPlugin();
    $plugin->updateIndex();
  }
}

/**
 * Updates the {search_total} database table.
 *
 * This function is called on shutdown to ensure that {search_total} is always
 * up to date (even if cron times out or otherwise fails).
*/
function search_update_totals() {
  // Update word IDF (Inverse Document Frequency) counts for new/changed words.
  foreach (array_keys(search_dirty()) as $dirty_word) {
    // Total score of this word across the whole index.
    $score_sum = db_query("SELECT SUM(score) FROM {search_index} WHERE word = :word", array(':word' => $dirty_word), array('target' => 'slave'))->fetchField();
    // Apply Zipf's law to equalize the probability distribution.
    $weight = log10(1 + 1/(max(1, $score_sum)));
    db_merge('search_total')
      ->key('word', $dirty_word)
      ->fields(array('count' => $weight))
      ->execute();
  }

  // Words that were deleted from search_index may still linger in
  // search_total. A LEFT JOIN between the two tables, keeping only the rows
  // which fail to join, finds exactly those.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL", array(), array('target' => 'slave'));
  $or = db_or();
  foreach ($orphans as $row) {
    $or->condition('word', $row->realword);
  }
  // Only issue the delete when at least one orphaned word was collected.
  if (count($or) > 0) {
    db_delete('search_total')
      ->condition($or)
      ->execute();
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * @param $text
 *   Text to simplify.
 * @param $langcode
 *   (optional) Language code, handed on to search_invoke_preprocess().
 *
 * @return
 *   Simplified text.
 *
 * @see hook_search_preprocess()
 */
function search_simplify($text, $langcode = NULL) {
  // Decode entities to UTF-8, then lowercase the result.
  $text = drupal_strtolower(decode_entities($text));

  // Call an external processor for word handling; $text is taken by
  // reference and updated in place.
  search_invoke_preprocess($text, $langcode);

  // Simple CJK handling.
  $overlap_cjk = \Drupal::config('search.settings')->get('index.overlap_cjk');
  if ($overlap_cjk) {
    $text = preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, a group of numerical characters separated only by
  // punctuation characters is considered to be one piece. This also means
  // that searching for e.g. '20/03/1984' also returns results with
  // '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);

  // Multiple dot and dash groups are word boundaries and replaced with space.
  // No need to use the unicode modifier here because 0-127 ASCII characters
  // can't match higher UTF-8 characters as the leftmost bit of those are 1.
  $text = preg_replace('/[.-]{2,}/', ' ', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behavior with acronyms and URLs. See unicode note directly above.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, all punctuation, marks, spacers,
  // etc. are considered to be a word boundary.
  $text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);

  // Truncate every word to 50 characters and glue the words back together.
  $words = explode(' ', $text);
  array_walk($words, '_search_index_truncate');
  return implode(' ', $words);
}

/**
 * Splits CJK (Chinese, Japanese, Korean) text into tokens.
 *
 * The Search module matches exact words, where a word is defined to be a
 * sequence of characters delimited by spaces or punctuation. CJK languages are
 * written in long strings of characters, though, not split up into words. So
 * in order to allow search matching, we split up CJK text into tokens
 * consisting of consecutive, overlapping sequences of characters whose length
 * is equal to the 'minimum_word_size' variable. This tokenizing is only done
 * if the 'overlap_cjk' variable is TRUE.
 *
 * @param $matches
 *   This function is a callback for preg_replace_callback(), which is called
 *   from search_simplify(). So, $matches is an array of regular expression
 *   matches, which means that $matches[0] contains the matched text -- a
 *   string of CJK characters to tokenize.
 *
 * @return
 *   Tokenized text, starting and ending with a space character.
*/
function search_expand_cjk($matches) {
  $min = \Drupal::config('search.settings')->get('index.minimum_word_size');
  $str = $matches[0];
  $length = drupal_strlen($str);
  // If the text is not longer than the minimum word size, don't tokenize it.
  if ($length <= $min) {
    return ' ' . $str . ' ';
  }
  $tokens = ' ';
  // Build a FIFO queue of characters.
  $chars = array();
  for ($i = 0; $i < $length; $i++) {
    // Add the next character off the beginning of the string to the queue.
    // The character is taken with the multibyte-aware helper, then removed
    // from the string by its byte length.
    $current = drupal_substr($str, 0, 1);
    $str = substr($str, strlen($current));
    $chars[] = $current;
    if ($i >= $min - 1) {
      // Make a token of $min characters, and add it to the token string.
      $tokens .= implode('', $chars) . ' ';
      // Shift out the first character in the queue.
      array_shift($chars);
    }
  }
  return $tokens;
}

/**
 * Simplifies and splits a string into tokens for indexing.
 *
 * The result of the most recent call is kept in a static cache, so that
 * repeating the same call does not run search_simplify() again.
 *
 * @param $text
 *   The text to simplify and split.
 * @param $langcode
 *   (optional) Language code, handed on to search_simplify().
 *
 * @return
 *   Array of words: the simplified text split on single spaces.
 */
function search_index_split($text, $langcode = NULL) {
  $last = &drupal_static(__FUNCTION__);
  $lastsplit = &drupal_static(__FUNCTION__ . ':lastsplit');

  // The cache is keyed on the raw input text together with the language
  // code, compared strictly. Keying on the simplified text, ignoring the
  // language, or comparing loosely (an empty string equals the initial NULL)
  // would produce false hits or no hits at all.
  $cache_key = array($text, $langcode);
  if ($last === $cache_key) {
    return $lastsplit;
  }

  // Process words.
  $words = explode(' ', search_simplify($text, $langcode));

  // Remember this call for the next one.
  $last = $cache_key;
  $lastsplit = $words;

  return $words;
}

/**
 * Helper function for array_walk() in search_simplify().
 *
 * Strips leading zeros from numeric words and truncates every word to 50
 * characters.
 *
 * @param $text
 *   A single word, modified in place.
 */
function _search_index_truncate(&$text) {
  if (is_numeric($text)) {
    $text = ltrim($text, '0');
  }
  $text = truncate_utf8($text, 50);
}

/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, modified in place: each implementing module
 *   receives the output of the previous one.
 * @param $langcode
 *   (optional) Language code, passed to every implementation.
 */
function search_invoke_preprocess(&$text, $langcode = NULL) {
  $module_handler = \Drupal::moduleHandler();
  foreach ($module_handler->getImplementations('search_preprocess') as $module) {
    $text = $module_handler->invoke($module, 'search_preprocess', array($text, $langcode));
  }
}

/**
 * Updates the full-text search index for a particular item.
 *
 * @param $sid
 *   An ID number identifying this particular item (e.g., node ID).
 * @param $type
 *   The plugin ID or other machine-readable type of this item,
 *   which should be less than 64 bytes.
* @param $text
 *   The content of this item. Must be a piece of HTML or plain text.
 * @param $langcode
 *   Language code for text being indexed.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text, $langcode) {
  $minimum_word_size = \Drupal::config('search.settings')->get('index.minimum_word_size');

  // Multipliers for scores of words inside certain HTML tags. The weights are
  // stored in config so that modules can overwrite the default weights.
  // Note: 'a' must be included for link ranking to work.
  $tags = \Drupal::config('search.settings')->get('index.tag_weights');

  // Strip off all ignored tags to speed up processing, but insert space before
  // and after them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = FALSE; // Odd/even counter. Tag or no tag.
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state
  $scored_words = array(); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag? The tag name can be empty for malformed
      // markup such as "< b>", so test that the first character exists
      // before reading it.
      if (isset($tagname[0]) && $tagname[0] == '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. A tag without a
          // configured weight counts as 0.
          $closed = array_shift($tagstack);
          $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
        }
      }
      else {
        if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score. A tag without a
          // configured weight counts as 0.
          array_unshift($tagstack, $tagname);
          $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        $words = search_index_split($value, $langcode);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word . ' ';
          // Check wordlength
          if (is_numeric($word) || drupal_strlen($word) >= $minimum_word_size) {
            if (!isset($scored_words[$word])) {
              $scored_words[$word] = 0;
            }
            $scored_words[$word] += $score * $focus;
            // Focus is a decaying value in terms of the amount of unique words up to this point.
            // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
            $focus = min(1, .01 + 3.5 / (2 + count($scored_words) * .015));
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  // Drop whatever was previously indexed for this item and language.
  search_reindex($sid, $type, $langcode);

  // Insert cleaned up data into dataset
  db_insert('search_dataset')
    ->fields(array(
      'sid' => $sid,
      'langcode' => $langcode,
      'type' => $type,
      'data' => $accum,
      'reindex' => 0,
    ))
    ->execute();

  // Insert results into search index
  foreach ($scored_words as $word => $score) {
    // If a word already exists in the database, its score gets increased
    // appropriately. If not, we create a new record with the appropriate
    // starting score.
    db_merge('search_index')
      ->keys(array(
        'word' => $word,
        'sid' => $sid,
        'langcode' => $langcode,
        'type' => $type,
      ))
      ->fields(array('score' => $score))
      ->expression('score', 'score + :score', array(':score' => $score))
      ->execute();
    // The word's total in search_total is now outdated.
    search_dirty($word);
  }
}

/**
 * Changes the timestamp on an indexed item to 'now' to force reindexing.
 *
 * @param $type
 *   The plugin ID or other machine-readable type of this item.
 * @param $sid
 *   An ID number identifying this particular item (e.g., node ID).
 */
function search_mark_for_reindex($type, $sid) {
  db_update('search_dataset')
    ->fields(array('reindex' => REQUEST_TIME))
    ->condition('type', $type)
    ->condition('sid', $sid)
    ->execute();
}

/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by the Search module, so this must be
 * enabled for all of the search features to work.
 *
 * To be discovered, the plugins must implement
 * \Drupal\search\Plugin\SearchInterface and be annotated as
 * \Drupal\search\Annotation\SearchPlugin plugins. Defining a plugin will allow
 * administrators to set up one or more search pages using this plugin.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement
 *   hook_node_update_index() and hook_node_search_result(). However, note that
 *   the search system already indexes all visible output of a node; i.e.,
 *   everything displayed normally during node viewing. This is
 *   usually sufficient. You should only use this mechanism if you want
 *   additional, non-visible data to be indexed.
 * - Define a plugin implementing \Drupal\search\Plugin\SearchInterface and
 *   annotated as \Drupal\search\Annotation\SearchPlugin. This will create a
 *   search page type that users can use to set up one or more search pages.
*   Each of these corresponds to a tab on the /search page, which can be
 *   used to perform searches. You will also need to implement the execute()
 *   method from the interface to perform the search. A base class is provided
 *   in \Drupal\search\Plugin\SearchPluginBase.
 *
 * If your module needs to provide a more complicated search form, then you
 * need to implement it yourself. In that case, you may wish to define it as a
 * local task (tab) under the /search page (e.g. /search/mymodule) so that users
 * can easily find it.
 */

/**
 * Returns snippets from a piece of text, with search keywords highlighted.
 *
 * Used for formatting search results.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 * @param string $langcode
 *   (optional) Language code, handed on to
 *   _search_find_match_with_simplify() when matching keywords against $text.
 *
 * @return string
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text, $langcode = NULL) {
  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';

  // Extract positive keywords and phrases.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text by stripping HTML tags and decoding HTML entities.
  $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
  $text = decode_entities($text);

  // Make a list of unique keywords that are actually found in the text,
  // which could be items in $keys or replacements that are equivalent through
  // search_simplify().
  $temp_keys = array();
  foreach ($keys as $key) {
    $key = _search_find_match_with_simplify($key, $text, $boundary, $langcode);
    if (isset($key)) {
      // Quote slashes so they can be used in regular expressions.
      $temp_keys[] = preg_quote($key, '/');
    }
  }
  // Several keywords could have simplified down to the same thing, so pick
  // out the unique ones.
  $keys = array_unique($temp_keys);

  // Extract fragments of about 60 characters around keywords, bounded by word
  // boundary characters. Try to reach 256 characters, using second occurrences
  // if necessary.
  $ranges = array();
  $length = 0;
  $look_start = array();
  $remaining_keys = $keys;

  while ($length < 256 && !empty($remaining_keys)) {
    $found_keys = array();
    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }

      // Remember where we last found $key, in case we are coming through a
      // second time.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // See if we can find $key after where we found it the last time. Since
      // we are requiring a match on a word boundary, make sure $text starts
      // and ends with a space.
      $matches = array();
      if (preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
        $found_position = $matches[0][1];
        $look_start[$key] = $found_position + 1;
        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving about 60
        // characters of context on each end.
        $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
        if ($before !== FALSE && $before <= $found_position) {
          $after = strrpos(' ' . $text . ' ', ' ', min($found_position + 61, strlen($text) + 1));
          if ($after !== FALSE && $after > $found_position) {
            // Account for the spaces we added.
            $before = max($before - 1, 0);
            $after = min($after - 1, strlen($text));
            if ($before < $after) {
              // Save this range.
              $ranges[$before] = $after;
              $length += $after - $before;
            }
          }
        }
      }
    }
    // Next time through this loop, only look for keys we found this time,
    // if any.
    $remaining_keys = $found_keys;
  }

  if (empty($ranges)) {
    // We didn't find any keyword matches, so just return the first part of the
    // text. We also need to re-encode any HTML special characters that we
    // entity-decoded above.
    return check_plain(truncate_utf8($text, 256, TRUE, TRUE));
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Collapse overlapping text ranges into one. The sorting makes it O(n).
  $new_ranges = array();
  $max_end = 0;
  foreach ($ranges as $this_from => $this_to) {
    $max_end = max($max_end, $this_to);
    if (!isset($working_from)) {
      // This is the first time through this loop: initialize.
      $working_from = $this_from;
      $working_to = $this_to;
      continue;
    }
    if ($this_from <= $working_to) {
      // The ranges overlap: combine them.
      $working_to = max($working_to, $this_to);
    }
    else {
      // The ranges do not overlap: save the working range and start a new one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }
  // Save the remaining working range.
  $new_ranges[$working_from] = $working_to;

  // Fetch text within the combined ranges we found.
  $out = array();
  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // Combine the text chunks with "..." separators. The "..." needs to be
  // translated. Let translators have the ... separator text as one chunk.
  $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
  $text = (isset($new_ranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . (($max_end < strlen($text) - 1) ? $dots[2] : '');
  $text = check_plain($text);

  // Highlight keywords by wrapping each match in a <strong> element. Must be
  // done at once to prevent conflicts ('strong' and '<strong>'). Replacing a
  // match with just '\0' would leave the text unchanged and highlight
  // nothing.
  $text = trim(preg_replace('/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
  return $text;
}

/**
 * @} End of "defgroup search".
 */

/**
 * Finds an appropriate keyword in text.
 *
 * @param $key
 *   The keyword to find.
 * @param $text
 *   The text to search for the keyword.
 * @param $boundary
 *   Regular expression for boundary characters between words.
 * @param $langcode
 *   Language code.
*
 * @return
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly, or matches $key when both this text segment and
 *   $key are processed by search_simplify(). If a matching text segment is
 *   not located, NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp == '') {
    return NULL;
  }
  if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) {
    return $key;
  }

  // Run both text and key through search_simplify.
  $simplified_key = trim(search_simplify($key, $langcode));
  $simplified_text = trim(search_simplify($text, $langcode));
  if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  // A limit of -1 means "no limit"; passing NULL for this integer parameter
  // is deprecated as of PHP 8.1.
  $words = preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = array('', strlen($text));
  $num_words = count($words);

  // Find the smallest segment of complete words in $text that we can simplify
  // to match $simplified_key.
  $start_position = 0;
  $word_end = 0;
  for ($word_index = 0; $word_index < $num_words; $word_index++) {
    // See if we can move the starting position out from our previously-saved
    // best position to here and still have a match.
    $trial_position = $words[$word_index][1];
    if ($trial_position < strlen($text)) {
      $candidate = substr($text, $trial_position);
      $test_text = trim(search_simplify($candidate, $langcode));
      if (strpos($test_text, $simplified_key) !== FALSE) {
        $start_position = $trial_position;
        $word_end = $trial_position + strlen($words[$word_index][0]);
        continue;
      }
    }

    // See if we can end at our currently-saved word-ending position and still
    // match, in which case this is the minimal matching string.
    if ($word_end > $start_position) {
      $candidate = substr($text, $start_position, $word_end - $start_position);
      $test_text = trim(search_simplify($candidate, $langcode));
      if (strpos($test_text, $simplified_key) !== FALSE) {
        return $candidate;
      }
    }

    // Save the end position of this word for the next time through this loop.
    $word_end = $trial_position + strlen($words[$word_index][0]);
  }

  // If we get here, we couldn't find a match.
  return NULL;
}

/**
 * Implements hook_module_preinstall().
 */
function search_module_preinstall() {
  // @todo Remove in https://drupal.org/node/2155635.
  \Drupal::service('plugin.manager.search')->clearCachedDefinitions();
}

/**
 * Implements hook_form_FORM_ID_alter() for the search_block_form form.
 *
 * Since the exposed form is a GET form, we don't want it to send the form
 * tokens. However, you cannot make this happen in the form builder function
 * itself, because the tokens are added to the form after the builder function
 * is called. So, we have to do it in a form_alter.
 *
 * @see \Drupal\search\Form\SearchBlockForm
 */
function search_form_search_block_form_alter(&$form, &$form_state) {
  $form['form_build_id']['#access'] = FALSE;
  $form['form_token']['#access'] = FALSE;
  $form['form_id']['#access'] = FALSE;
}