'Fuzzysearch',
    'description' => 'Fuzzysearch settings allow you to index certain node data',
    'page callback' => 'fuzzysearch_admin',
    'access arguments' => array('administer fuzzysearch'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items['fuzzysearch/results'] = array(
    'title' => 'Search',
    'page callback' => 'fuzzysearch_show_results',
    'access arguments' => array('fuzzysearch content'),
    'type' => MENU_DYNAMIC_ITEM,
  );
  $items['admin/reports/fuzzysearch'] = array(
    'title' => 'Top fuzzysearch phrases',
    'description' => 'View most popular fuzzysearch phrases.',
    'page callback' => 'dblog_top',
    'page arguments' => array('fuzzysearch'),
    'access arguments' => array('access site reports'),
    'file' => 'dblog.admin.inc',
    'file path' => drupal_get_path('module', 'dblog'),
  );
  return $items;
}

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The two permission names this module checks: one for the settings page
 *   and one for running searches.
 */
function fuzzysearch_perm() {
  return array('administer fuzzysearch', 'fuzzysearch content');
}

/**
 * Implementation of hook_theme().
 *
 * Registers the theme functions for the search forms and result lists, plus
 * the fuzzysearch-result template used to render a single result.
 *
 * @return
 *   Theme registry entries keyed by theme hook name.
 */
function fuzzysearch_theme() {
  // The registry key for declaring a hook's parameters is 'arguments'; the
  // 'args' key used previously is not read by the theme system.
  return array(
    'fuzzysearch_box_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_show_results' => array(
      'arguments' => array(
        'keys' => NULL,
      ),
    ),
    'fuzzysearch_results_title' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_results' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_result' => array(
      'template' => 'fuzzysearch-result',
      'arguments' => array('node' => NULL, 'teaser' => FALSE, 'page' => FALSE),
    ),
  );
}

/**
 * Implementation of hook_content_build_modes().
*/
function fuzzysearch_content_build_modes() {
  $group_title = t('Fuzzy Search');

  // One build mode for indexing and one for displaying a result; neither is
  // offered as a views style.
  $build_modes = array();
  $build_modes[NODE_BUILD_SEARCH_INDEX] = array(
    'title' => t('Search Index'),
    'views style' => FALSE,
  );
  $build_modes[NODE_BUILD_SEARCH_RESULT] = array(
    'title' => t('Search Result'),
    'views style' => FALSE,
  );

  return array(
    'fuzzysearch' => array(
      'title' => $group_title,
      'build modes' => $build_modes,
    ),
  );
}

/**
 * Preprocess variables for the fuzzysearch-result template.
 *
 * Fills in the taxonomy links, content, date, links, author name, node url,
 * terms, title, submitted line and user picture for one result node, and
 * exposes the node's own fields as template variables.
 *
 * @param $variables
 *   Template variables, modified in place. Reads the 'node' and 'teaser'
 *   entries.
 */
function fuzzysearch_preprocess_fuzzysearch_result(&$variables) {
  $node = $variables['node'];
  $inline_links = array('class' => 'links inline');

  $variables['taxonomy'] = module_exists('taxonomy') ? taxonomy_link('taxonomy terms', $node) : array();

  // Prefer the teaser when one was requested and exists, then the body, then
  // nothing at all.
  $content = '';
  if ($variables['teaser'] && $node->teaser) {
    $content = $node->teaser;
  }
  elseif (isset($node->body)) {
    $content = $node->body;
  }
  $variables['content'] = $content;

  $variables['date'] = format_date($node->created);
  $variables['links'] = empty($node->links) ? '' : theme('links', $node->links, $inline_links);
  $variables['name'] = theme('username', $node);
  $variables['node_url'] = url('node/'. $node->nid);
  $variables['terms'] = theme('links', $variables['taxonomy'], $inline_links);
  $variables['title'] = check_plain($node->title);

  // Expose every node field as a variable; values already set above win over
  // the raw node fields of the same name.
  $variables = array_merge((array)$node, $variables);

  // Author and date information is only shown for node types that have it
  // enabled in the theme settings.
  $show_info = theme_get_setting('toggle_node_info_'. $node->type);
  $variables['submitted'] = $show_info ? theme('node_submitted', $node) : '';
  $variables['picture'] = ($show_info && theme_get_setting('toggle_node_user_picture')) ? theme('user_picture', $node) : '';
}

/**
 * Build the administration settings panel.
*/
function fuzzysearch_admin() {
  $total = db_result(db_query("SELECT COUNT(*) FROM {node}"));
  $remaining = db_result(db_query("SELECT COUNT(*) FROM {fuzzysearch_index_queue}"));
  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are @count items left to index.');
  // Clamp to 0-100: the queue can hold more rows than there are nodes, which
  // would otherwise produce a negative percentage.
  $percentage = ((int) max(0, min(100, 100 * ($total - $remaining) / max(1, $total)))) .'%';
  $status = '

'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'

';
  // Start the page output explicitly rather than appending to an undefined
  // variable.
  $output = $status;
  $output .= drupal_get_form('fuzzysearch_admin_form');
  $output .= drupal_get_form('fuzzysearch_scoring');
  return $output;
}

/**
 * Module Administration (clear index).
 *
 * Form builder for the index, search and display settings.
 *
 * @return
 *   The form array.
 */
function fuzzysearch_admin_form() {
  $form = array();

  $form['index'] = array(
    '#title' => t('Index settings'),
    '#type' => 'fieldset'
  );
  $form['index']['ngram_length'] = array(
    '#type' => 'textfield',
    '#title' => t('Ngram length'),
    '#size' => 1,
    '#maxlength' => 1,
    '#description' => t('Fuzzysearch breaks down words into ngrams, or pieces of text this long, for indexing and searching. Default value is 3. You must reindex for changes to take effect.'),
    '#default_value' => variable_get('fuzzysearch_ngram_length', 3),
  );
  $form['index']['index_cron'] = array(
    '#type' => 'textfield',
    '#title' => t('Nodes to index per cron run'),
    '#size' => 4,
    '#maxlength' => 4,
    '#description' => t('This sets the number of nodes to index per cron run. A high number may cause PHP to timeout.'),
    '#default_value' => variable_get('fuzzysearch_index_cron', 150),
  );
  $form['index']['reindex'] = array(
    '#type' => 'checkbox',
    '#title' => t('Rebuild index'),
    '#description' => t('Check the box and click submit to re-index all nodes on the site. 
Re-indexing will begin with the the next cron run.'),
    '#default_value' => FALSE
  );

  $form['search'] = array(
    '#title' => t('Search settings'),
    '#type' => 'fieldset'
  );
  $form['search']['completeness'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum completeness'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Enter value between 0 and 100 to set the minimum match completeness required in the returned results.'),
    '#default_value' => variable_get('fuzzysearch_min_completeness', 40),
  );

  $form['display'] = array(
    '#title' => t('Display settings'),
    '#type' => 'fieldset'
  );
  $form['display']['debug_score'] = array(
    '#type' => 'checkbox',
    '#title' => t('Display scoring'),
    '#description' => t('If selected, the completeness and score of the results will be shown below each result'),
    '#default_value' => variable_get('fuzzysearch_debug_score', FALSE)
  );
  $form['display']['excerpt'] = array(
    '#type' => 'textfield',
    '#title' => t('Result excerpt length'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Set the length of the displayed text excerpt containing a found search term. Applies per found term.'),
    '#default_value' => variable_get('fuzzysearch_excerpt', 200),
  );
  $form['display']['max_result'] = array(
    '#type' => 'textfield',
    '#title' => t('Maximum result length'),
    '#size' => 4,
    '#maxlength' => 4,
    '#description' => t('Set the maximum length of the displayed result. Set to 0 for unlimited length. Applies per result.'),
    '#default_value' => variable_get('fuzzysearch_max_result', 0),
  );
  $form['display']['spelling'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum spelling score'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Fuzzysearch tries to highlight search terms that may be misspelled. You can set the minimum threshold, which is calculated as a ratio of ngram hits to misses in a term. 0 may cause a misspelling to highlight everything, and 100 will only highlight exact terms. Enter value between 0 and 100. 
Changing this setting does not require reindexing.'),
    '#default_value' => variable_get('fuzzysearch_spelling', 30),
  );
  $options = node_get_types('names');
  $form['display']['nodetypes'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Check any node types you want to exclude from search results'),
    '#description' => t('This does not affect how Fuzzy Search indexes your content types.'),
    '#default_value' => variable_get('fuzzysearch_nodetypes', array('')),
    '#options' => $options,
  );
  $form['display']['not_found'] = array(
    '#type' => 'checkbox',
    '#title' => t('Experimental: Let fuzzysearch handle 404 Page Not Found errors'),
    '#description' => t('If selected, fuzzysearch will interrupt the normal page not found process and show the site user results using everything after http://example.com/ as search tems. Any time drupal core or a contibuted module sets a 404 header and writes the not found to watchdog(), fuzzysearch will take over after the watchdog entry is created. Please Note: This has the potential to be a big performance drain on your site.'),
    '#default_value' => variable_get('fuzzysearch_not_found', FALSE)
  );

  $form['submit'] = array(
    '#type' => 'submit',
    '#value' => t('Submit'),
    '#weight' => 10
  );

  return $form;
}

/**
 * Validate the administration settings form.
 *
 * Every numeric setting must pass is_numeric(). Each error is registered on
 * the element it belongs to, so the offending field is the one highlighted
 * and a later error on another field is not discarded.
 */
function fuzzysearch_admin_form_validate($form, &$form_state) {
  $values = $form_state['values'];

  if (!is_numeric($values['ngram_length'])) {
    form_set_error('ngram_length', t('Ngram length must be an integer.'));
  }
  if (!is_numeric($values['index_cron'])) {
    form_set_error('index_cron', t('Cron index number must be an integer.'));
  }
  if (!is_numeric($values['completeness'])) {
    form_set_error('completeness', t('Minimum match completeness must be an integer.'));
  }
  // These two errors used to be registered on the 'spelling' element.
  if (!is_numeric($values['excerpt'])) {
    form_set_error('excerpt', t('Excerpt length must be an integer.'));
  }
  if (!is_numeric($values['max_result'])) {
    form_set_error('max_result', t('Maximum result length must be an integer.'));
  }
  // Not an error, only a warning: a maximum of 0 means "unlimited".
  if ($values['excerpt'] > $values['max_result'] && $values['max_result'] != 0) {
    drupal_set_message(t('The maximum result length has been set smaller than the excerpt length. Any results will display the result\'s teaser instead of the found excerpt.'), 'warning');
  }
  if (!is_numeric($values['spelling'])) {
    form_set_error('spelling', t('Minimum spelling score must be an integer.'));
  }
}

/**
 * Submit handler for the administration settings form.
 *
 * Optionally queues every node for reindexing, then stores all settings.
 */
function fuzzysearch_admin_form_submit($form, &$form_state) {
  $values = $form_state['values'];

  if ($values['reindex']) {
    // Refresh the index queue.
    db_query("DELETE FROM {fuzzysearch_index_queue}");
    $query = db_query("SELECT nid FROM {node}");
    while ($row = db_fetch_object($query)) {
      fuzzysearch_reindex($row->nid, 'fuzzysearch');
    }
    drupal_set_message(t('Nodes ready for reindexing, please run cron to update the index.'));
  }

  variable_set('fuzzysearch_ngram_length', $values['ngram_length']);
  variable_set('fuzzysearch_index_cron', $values['index_cron']);
  variable_set('fuzzysearch_min_completeness', $values['completeness']);
  variable_set('fuzzysearch_excerpt', $values['excerpt']);
  variable_set('fuzzysearch_max_result', $values['max_result']);
  variable_set('fuzzysearch_spelling', $values['spelling']);
  variable_set('fuzzysearch_debug_score', $values['debug_score']);
  variable_set('fuzzysearch_nodetypes', $values['nodetypes']);
  variable_set('fuzzysearch_not_found', $values['not_found']);
}

/**
 * Implementation of hook_nodeapi().
 *
 * Remove node from index on deletion and queue node for indexing on insert
 * and update.
 */
function fuzzysearch_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
  switch ($op) {
    case 'update':
    case 'insert':
      fuzzysearch_reindex($node->nid, 'fuzzysearch');
      break;

    case 'delete':
      db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $node->nid);
      // Also drop any pending queue entry so cron does not try to index a
      // node that no longer exists.
      db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $node->nid);
      break;
  }
}

/**
 * Set factors for scores returned by modules implementing hook_search_score().
*/
function fuzzysearch_scoring() {
  $form = array();
  $form['scoring'] = array(
    '#title' => t('Scoring adjustment'),
    '#description' => t('Choose a multiplier for each of the score factors. Changing these settings will require all content to be reindexed.'),
    '#type' => 'fieldset',
  );

  // Allow multipliers to range from 10 = max impact on score, to 0 = no
  // impact on score.
  $select_values = drupal_map_assoc(range(10, 0));

  // Collect the score modifiers from hook_fuzzysearch_score(). Each one is
  // expected to be an array with 'id', 'title' and 'description' entries.
  $scores = module_invoke_all('fuzzysearch_score', 'settings', NULL);
  foreach ($scores as $score) {
    $form['scoring'][$score['id']] = array(
      '#title' => $score['title'],
      '#description' => $score['description'],
      '#type' => 'select',
      '#options' => $select_values,
      '#default_value' => variable_get('fuzzysearch_scoring_'. $score['id'], 5),
    );
  }

  $form['scoring']['submit'] = array(
    '#value' => t('Update score factors'),
    '#type' => 'submit',
  );

  return $form;
}

/**
 * Save the score modifiers as set in the administrative form.
 *
 * Every submitted value except the form's own bookkeeping keys is stored as
 * fuzzysearch_scoring_<key>.
 */
function fuzzysearch_scoring_submit($form, &$form_state) {
  // Keys added by the form system rather than by a score modifier. The former
  // "!= a || != b" test was always true, so these were saved as variables too.
  $internal_keys = array('op', 'submit', 'form_token', 'form_id', 'form_build_id');

  foreach ($form_state['values'] as $key => $value) {
    if (!in_array((string) $key, $internal_keys, TRUE)) {
      variable_set('fuzzysearch_scoring_'. $key, $value);
    }
  }
  drupal_set_message(t('Score factor multipliers have been updated'));
}

/**
 * External API function that allows modules to flag a node for reindexing.
 *
 * @param $nid
 *   Nid of the node to be reindexed.
 * @param $module
 *   Name of the module flagging the node.
*/
function fuzzysearch_reindex($nid, $module) {
  // COUNT(*) answers "is it already queued?" without depending on which
  // column of the queue table happens to come first.
  $queued = db_result(db_query("SELECT COUNT(*) FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid));
  if (!$queued) {
    db_query("INSERT INTO {fuzzysearch_index_queue} (nid, module, timestamp) VALUES (%d, '%s', %d)", $nid, $module, time());
  }
}

/**
 * Implementation of hook_cron().
 *
 * Indexes up to 'fuzzysearch_index_cron' (default 150) queued nodes per run.
 */
function fuzzysearch_cron() {
  $query = db_query_range("SELECT nid FROM {fuzzysearch_index_queue}", 0, variable_get('fuzzysearch_index_cron', 150));
  while ($result = db_fetch_object($query)) {
    fuzzysearch_index($result->nid);
  }
}

/**
 * Index the node data in the fuzzy index table.
 *
 * Removes the node's existing index rows, builds the text to index (title,
 * rendered body, text contributed through nodeapi 'update index'), scores
 * each word by the HTML tag it appears in, writes the ngrams and finally
 * removes the node from the index queue.
 *
 * @param $nid
 *   The node id of the node being indexed.
 * @return
 *   Nothing.
 */
function fuzzysearch_index($nid) {
  // First step is removing past index
  db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $nid);

  $node = node_load($nid);
  if (!$node) {
    // The node could not be loaded (for instance it was deleted while still
    // queued). Drop the queue entry so it is not retried on every cron run.
    db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
    return;
  }

  // Let modules alter a node before indexing or prevent it from being indexed.
  // See readme.txt.
  foreach (module_implements('fuzzysearch_index') as $name) {
    $function = $name .'_fuzzysearch_index';
    $node = $function($node);
    if (!$node) {
      // A module vetoed indexing: remove the node from the queue and stop.
      db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
      return;
    }
  }

  // Index node title
  // NOTE(review): the literals around the title contain only newlines. If
  // heading markup was intended here so that the title picks up a tag
  // multiplier below, confirm against the upstream module.
  $text = '

'. $node->title .'

';

  // Build and index the node body.
  $node->build_mode = NODE_BUILD_SEARCH_INDEX;
  $node = node_build_content($node, FALSE, FALSE);
  $node->body = drupal_render($node->content);
  $text .= $node->body;

  // Implementation of nodeapi's update_index op.
  $outside_text = module_invoke_all('nodeapi', $node, 'update index', NULL, NULL);
  if ($outside_text) {
    foreach ($outside_text as $content) {
      $text .= ' '. $content;
    }
  }

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 10, 'h2' => 9, 'h3' => 8, 'h4' => 7, 'h5' => 6, 'h6' => 5, 'u' => 2, 'b' => 2, 'i' => 2, 'strong' => 2, 'em' => 2, 'a' => 5);

  // Strip off all ignored tags to speed up processing
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Hook_fuzzysearch_filter lets modules filter text. This should be used for
  // more complex filtering. Stop words should not use this. Create a stopword
  // file instead. See fuzzysearch/stopwords/README.txt.
  foreach (module_implements('fuzzysearch_filter') as $name) {
    $function = $name .'_fuzzysearch_filter';
    $text = $function('index', $text);
  }

  // Allow other modules to modify the score of the node based on their own
  // calculations. The sum of all the scores added to each node is then
  // multiplied by the score of the word; this allows for faster result
  // queries because all scoring is done at the time of indexing.
  $hook_scores = module_invoke_all('fuzzysearch_score', 'index', $node);

  // Build the final score multiplier for the node from the returned scores,
  // each weighted by its configured factor.
  $hook_score = 0;
  foreach ($hook_scores as $score) {
    $multiplier = variable_get('fuzzysearch_scoring_'. $score['id'], 5);
    $hook_score += $score['score'] * $multiplier;
  }

  // Begin indexing content.
  // Find all words not located within tags (score = 1)
  $content = preg_replace('/<([A-Z][A-Z0-9]*)[^>]*>(.*?)<\/\1>/i', '', $text);
  $content = fuzzysearch_cleanse($content);

  // Remove stopwords.
  $text = fuzzysearch_stopwords($text);

  $index_words = array();
  $index_scores = array();
  $words = preg_split('/\s/', $content, -1, PREG_SPLIT_NO_EMPTY);
  // Build the index array with scores
  foreach ($words as $word) {
    $found = array_search($word, $index_words);
    if ($found === FALSE) {
      $index_words[] = $word;
      $index_scores[] = 1;
    }
    else {
      $index_scores[$found] += 1;
    }
  }

  // Find all words located within tags (score > 1)
  preg_match_all('/<([A-Z][A-Z0-9]*)([^>]*)>(.*?)<\/\1>/i', $text, $tagged);

  // filter through each set of content inbetween tags
  foreach ($tagged[3] as $key => $content) {
    $content = fuzzysearch_cleanse($content);
    $words = preg_split('/\s/', $content, -1, PREG_SPLIT_NO_EMPTY);
    // The regex above is case-insensitive, so normalise the tag name before
    // looking up its multiplier, and fall back to the untagged score of 1.
    $tag = strtolower($tagged[1][$key]);
    $tag_score = isset($tags[$tag]) ? $tags[$tag] : 1;
    foreach ($words as $word) {
      $found = array_search($word, $index_words);
      if ($found === FALSE) {
        $index_words[] = $word;
        $index_scores[] = $tag_score;
      }
      else {
        $index_scores[$found] += $tag_score;
      }
    }
  }

  $word_id = 0;
  foreach ($index_words as $key => $word) {
    // Each word gets a word_id, which comes from the last value in the id
    // column, which is serial. Before the first word, a throwaway row is
    // inserted and deleted so a last insert id exists. We have to do this
    // to avoid a postgresql error.
    if (!$word_id) {
      db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (0, 0, 'xxx', 0, 0)");
      db_query("DELETE FROM {fuzzysearch_index} WHERE (nid = 0 AND word_id = 0 AND ngram = 'xxx' AND completeness = 0 AND score = 0)");
    }
    $word_id = db_last_insert_id('fuzzysearch_index', 'id');
    fuzzysearch_index_insert($word, $word_id, $nid, $index_scores[$key], $hook_score);
  }

  // Indexing is done: remove the node from the index queue.
  db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
}

/**
 * Insert the words into the database as they are indexed.
 *
 * @param $word
 *   Word to insert into the index.
 * @param $word_id
 *   Id shared by every ngram row written for this word.
 * @param $nid
 *   The node id that is to be associated with this word.
 * @param $word_score
 *   Score given to the word based on the tag it is in.
 * @param $node_score
 *   Score modifier given to the node from hook_search_score.
 */
function fuzzysearch_index_insert($word, $word_id, $nid, $word_score, $node_score) {
  $length = drupal_strlen($word);
  $nlength = variable_get('fuzzysearch_ngram_length', 3);
  $insert = "INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (%d, %d, '%s', %f, %f)";

  // Ensure that having all score modifiers set to 0 will not affect our
  // natural scoring
  $score = ($node_score > 0) ? $word_score * $node_score : $word_score;

  if ($length > $nlength) {
    // Each ngram accounts for an equal share of the whole word.
    $ngram_count = $length - $nlength + 1;
    $completeness = 100 / $ngram_count;
    // Create ngrams and index them
    for ($i = 0; $i < $ngram_count; $i++) {
      db_query($insert, $nid, $word_id, drupal_substr($word, $i, $nlength), $completeness, $score);
    }
  }
  else {
    // The word is no longer than one ngram, so it is stored whole and counts
    // as complete.
    db_query($insert, $nid, $word_id, $word, 100, $score);
  }
}

/**
 *
Implementation of hook_comment().
 *
 * Queues the parent node for reindexing whenever one of its comments is
 * added, changed, removed, published or unpublished.
 */
function fuzzysearch_comment($a1, $op) {
  switch ($op) {
    // Reindex the node when comments are added or changed
    case 'insert':
    case 'update':
    case 'delete':
    case 'publish':
    case 'unpublish':
      // The comment arrives as an array for some operations and as an object
      // for others.
      fuzzysearch_reindex(is_array($a1) ? $a1['nid'] : $a1->nid, 'fuzzysearch');
      break;
  }
}

/**
 * Strip tags, lowercase the text and replace every run of characters in
 * PREG_CLASS_SEARCH_EXCLUDE with a single space.
 *
 * @param $text
 *   Text to clean.
 * @return
 *   The cleaned text.
 */
function fuzzysearch_cleanse($text) {
  $text = strip_tags($text);
  $text = drupal_strtolower($text);
  return preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
}

/**
 * Form to search the index
 *
 * Compact form used in the search block; it is handled by
 * fuzzysearch_form_submit().
 */
function fuzzysearch_box_form() {
  $form = array();
  $form['keys'] = array(
    '#type' => 'textfield',
    '#size' => 15,
    '#default_value' => '',
  );
  $form['submit'] = array(
    '#type' => 'submit',
    '#value' => t('Search'),
  );
  $form['#submit'][] = 'fuzzysearch_form_submit';

  return $form;
}

/**
 * Theme the output of the search block
 */
function theme_fuzzysearch_box_form($form) {
  // Assign rather than append: there is no earlier output to append to.
  $output = '
'. drupal_render($form['keys']) . drupal_render($form['submit']) .'
';
  $output .= drupal_render($form);
  return $output;
}

/**
 * Implementation of hook_block().
 *
 * Block 0 is the search form. Block 1 lists results for the search phrase
 * given in the "fuzzysearch" query string parameter.
 */
function fuzzysearch_block($op = 'list', $delta = 0, $edit = array()) {
  switch ($op) {
    case 'list':
      $blocks = array();
      $blocks[0]['info'] = t('Fuzzy search form');
      $blocks[1]['info'] = t('Fuzzy search title query');
      return $blocks;

    case 'view':
      if (user_access('fuzzysearch content')) {
        switch ($delta) {
          case 0:
            $block = array();
            $block['content'] = drupal_get_form('fuzzysearch_box_form');
            $block['subject'] = t('Search');
            return $block;

          case 1:
            $block = array();
            // Only a non-empty string is a usable search phrase; the
            // parameter may be missing or may arrive as an array.
            if (!empty($_GET['fuzzysearch']) && is_string($_GET['fuzzysearch'])) {
              $block['content'] = theme('fuzzysearch_show_results', check_plain($_GET['fuzzysearch']), variable_get('fuzzysearch_block_theme', BLOCK_THEME), variable_get('fuzzysearch_block_limit', 5));
            }
            return $block;
        }
      }
      break;

    case 'configure':
      switch ($delta) {
        case 1:
          $form = array();
          $form["fuzzysearch_block_limit"] = array(
            '#type' => 'select',
            '#title' => t('Number of results to display'),
            '#default_value' => variable_get('fuzzysearch_block_limit', 5),
            '#options' => drupal_map_assoc(range(1, 15))
          );
          $form['fuzzysearch_block_theme'] = array(
            '#type' => 'radios',
            '#title' => t('Display method'),
            // Same fallback as the 'view' operation, so the form shows the
            // mode the block actually uses.
            '#default_value' => variable_get('fuzzysearch_block_theme', BLOCK_THEME),
            '#options' => array(BLOCK_THEME => t('Titles'), NODE_THEME => t('Nodes')),
            '#description' => t('Show titles only or node theme.')
          );
          return $form;
      }
      break;

    case 'save':
      switch ($delta) {
        case 1:
          variable_set('fuzzysearch_block_limit', $edit['fuzzysearch_block_limit']);
          variable_set('fuzzysearch_block_theme', $edit['fuzzysearch_block_theme']);
          break;
      }
      break;
  }
}

/**
 * Form to search the index
 *
 * @param $form_state
 *   The form state.
 * @param $keys
 *   Search phrase used to pre-fill the text field.
 */
function fuzzysearch_form($form_state, $keys = '') {
  $form = array();
  $form['keys'] = array(
    '#title' => t('Enter search phrase'),
    '#type' => 'textfield',
    '#size' => 35,
    '#default_value' => $keys,
  );
  $form['submit'] = array(
    '#type' => 'submit',
    '#value' => t('Search'),
  );

  return $form;
}

/**
 * Redirect to callback with keys so that the search can be linked to.
*/ function fuzzysearch_form_submit($form, &$form_state) { // The search form relies on control of the redirect destination for its // functionality, so we override any static destination set in the request, // for example by drupal_access_denied() or drupal_not_found() // (see http://drupal.org/node/292565). if (isset($_REQUEST['destination'])) { unset($_REQUEST['destination']); } if (isset($_REQUEST['edit']['destination'])) { unset($_REQUEST['edit']['destination']); } $form_state['redirect'] = 'fuzzysearch/results/'. $form_state['values']['keys']; } /** * Output formatting for the search form */ function theme_fuzzysearch_form($form) { $output .= '
'. drupal_render($form['keys']) . drupal_render($form['submit']) .'
'; $output .= drupal_render($form); return $output; } /** * Process the search query */ function fuzzysearch_process($query, $theme = NODE_THEME, $limit = 10) { global $user; global $multibyte; // if no keys were entered do not display anything below the search form if (!$query) { return; } // Sanitize query again because it can be submitted from url as well as form. $query = fuzzysearch_cleanse($query); // Log the search keys: watchdog('fuzzysearch', '%query', array('%query' => $query), WATCHDOG_NOTICE, l(t('results'), 'fuzzysearch/results/'. $query)); // Hook_fuzzysearch_filter lets modules filter text. This should be used for // more complex filtering. Stop words should not use this. Create a stopword // file instead. See fuzzysearch/stopwords/README.txt. foreach (module_implements('fuzzysearch_filter') as $name) { $function = $name .'_fuzzysearch_filter'; $query = $function('search', $query); } // Remove stopwords. $query = fuzzysearch_stopwords($query); $nlength = variable_get('fuzzysearch_ngram_length', 3); $min_spelling = variable_get('fuzzysearch_spelling', 30); $excerpt = variable_get('fuzzysearch_excerpt', 200); $boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))'; $words = explode(' ', $query); // Build the WHERE clause for the ngrams. foreach ($words as $k => $word) { // @todo Change type of query based on boolean operators $length = drupal_strlen($word); if ($length > $nlength) { // Determine lengths which we want to search for if ($length > 3) { // 5 letter word matches down to 4 letter words $comp = 1 / (4 - $nlength + 1); } if ($length > 5) { // 6 and 7 letter words match down to 5 letter words $comp = 1 / (5 - $nlength + 1); } if ($length > 7) { // anything larger matches down 2 letters than its own length $comp = 1 / ($length - 2 - $nlength + 1); } $comp = number_format($comp + .001, 3) * 100; for ($i=0; $i < $length - 2; $i++) { $clause .= " (ngram = '". 
drupal_substr($word, $i, $nlength) ."' AND completeness < ". $comp .") OR"; } } // Words as long as ngrams length are inserted as is. else { $clause .= " ngram = '". $word ."' OR"; } } $clause = preg_replace("/ OR$/", '', $clause); // @todo: Fix the minimum completeness so that a single qgram match doesn't necessarily return a match $min_completeness = check_plain(variable_get('fuzzysearch_min_completeness', 40)); // Get content types to exclude from results. They are still indexed. $types = array_filter(variable_get('fuzzysearch_nodetypes', array(''))); // Build the query args and placeholders. $args[] = $min_completeness; $args += $types; $placeholders = count($types) ? db_placeholders($types, 'text'): '\'\''; // Main query $sql = "SELECT n.nid, MAX(n.moderate) AS moderate, MAX(n.uid) AS uid, MAX(n.type) AS type, MAX(n.status) AS status, SUM(subpercent) AS percent, SUM(subscore) AS score FROM (SELECT DISTINCT word_id, nn.nid, SUM(completeness) AS subpercent, SUM(score) AS subscore FROM {fuzzysearch_index} s LEFT JOIN {node} nn ON (nn.nid = s.nid) WHERE (($clause)) GROUP BY word_id, nn.nid HAVING SUM(completeness) >= %d) AS q LEFT JOIN {node} n on n.nid = q.nid WHERE n.status = 1 AND n.type NOT IN ($placeholders) GROUP BY n.nid ORDER BY percent DESC, score DESC"; // Count query $sql_count = "SELECT COUNT(DISTINCT(n.nid)) FROM (SELECT nn.type, nn.uid, nn.moderate, nn.nid, CEILING(SUM(completeness)) AS completeness, SUM(score) AS score FROM {fuzzysearch_index} AS s LEFT JOIN {node} nn on s.nid = nn.nid WHERE $clause GROUP BY word_id, s.nid, nn.type, nn.uid, nn.moderate, nn.nid HAVING SUM(completeness) >= %d) AS q LEFT JOIN {node} n on n.nid = q.nid WHERE n.status = 1 AND n.type NOT IN ($placeholders)"; $sql = db_rewrite_sql($sql); $sql_count = db_rewrite_sql($sql_count); $block_limit = $theme == BLOCK_THEME ? variable_get('fuzzysearch_block_limit', 5) : 0; if ($block_limit) { $pager_results = db_query($sql. ' LIMIT '. 
$block_limit, $args); } else { $pager_results = pager_query($sql, $limit, 0, $sql_count, $args); } // Load the matched nodes. while ($row = db_fetch_object($pager_results)) { $node = node_load($row->nid); $node->score = $row->score; $node->completeness = $row->percent; // If this is just a title search, we can skip all the processing below. if ($theme == 1) { // Build the node body. This grabs cck field labels and values. Remove // double spaces added for html legibility by cck. $node->build_mode = NODE_BUILD_SEARCH_RESULT ; $node = node_build_content($node, FALSE, FALSE); $node->body = preg_replace("/ +/"," ", drupal_render($node->content)); // Add the comments to the node for highlighting. if (function_exists('comment_render') && $node->comment && user_access('access comments')) { $comments = db_query('SELECT subject, comment FROM {comments} WHERE nid = %d AND status = %d', $node->nid, COMMENT_PUBLISHED); while($comment = db_fetch_object($comments)) { $node->body .= ' '. strip_tags($comment->subject) .' '. strip_tags($comment->comment); } } // Query the matched nodes for the search ngrams. We use this for fuzzy // highlighting of misspelled words. We do this per node to narrow // the possible false ngrams when a misspelled ngram matches a real one. // This could still return some false ngrams, but that's why it's fuzzy. $sql_ngrams = " SELECT s.ngram, s.word_id, s.completeness FROM {fuzzysearch_index} s LEFT JOIN {node} n ON (n.nid = s.nid) WHERE (($clause) AND n.nid = $row->nid AND n.status = 1 AND n.type NOT IN ($placeholders))"; $ngrams = db_query($sql_ngrams, $args); $clean_grams = array(); $i = 0; while ($ngram = db_fetch_array($ngrams)) { $clean_grams[$ngram['ngram']][] = $ngram; $i++; } // Ngrams can occur multiple times, so filter. $clean_grams = fuzzysearch_unique($clean_grams); // This will hold our search terms. $clean_words = explode(' ', $query); // Now we rebuild the words stripping out misspelled ngrams. 
foreach ($clean_words as $key => $clean_word) { // If we have an exact match, let's skip the work to check for misspellings. if (!preg_match('/\b'. $clean_word .'\b/iu', $node->body)) { $pos = array(); $id_count = array(); $bad_positions = array(); $len = drupal_strlen($clean_word); // Ignore search terms under 3 characters. if ($len >= 3) { // Get the position of each good hit. foreach ($clean_grams as $n => $gram) { if ($multibyte == UNICODE_MULTIBYTE) { if (mb_stripos($clean_word, $n) !== FALSE) { $pos[mb_stripos($clean_word, $n)] = $n; // Keep count of our word ids so we can try to guess which word // we are trying to match. foreach ($clean_grams[$n] as $ngram_data) { $id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1; } } } // No multibyte. else { if (stripos($clean_word, $n) !== FALSE) { $pos[stripos($clean_word, $n)] = $n; // Keep count of our word ids so we can try to guess which word // we are trying to match. foreach ($clean_grams[$n] as $ngram_data) { $id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1; } } } } ksort($pos); // This gives us an array with the most common word_id as the first // element. arsort($id_count); $id_count = array_keys($id_count); // Remove any position matches that are not in our likely word (the // word with the highest word_id count). foreach ($pos as $position => $pgram) { $pmatch = FALSE; foreach ($clean_grams[$pgram] as $pid) { if ($pid['word_id'] == $id_count[0]){ $pmatch = TRUE; } } if (!$pmatch) { unset($pos[$position]); } } // Start with a dummy word at the right length, but only if there are // some matching ngram hits. $newword = ''; if (count($pos)) { $newword = str_pad('', $len, '.'); } $hits = $misses = $i = 0; // Check character by character for ngram matches. We don't need to check // beyond the first character of the ngram. for ($i = 0; $i <= $len - $nlength; $i++) { // This is a match, so insert it into our dummy word. 
if (isset($pos[$i])) { $newword = drupal_substr($newword, 0, $i) . $pos[$i] . drupal_substr($newword, $i + $nlength, $len); ++$hits; } // This is a miss, so replace with a wildcard. else { // But don't overwrite a letter, only a '.' . if(drupal_substr($newword, $i, 1) == '.') { $newword = ($i == 0 || $i > $len - $nlength) ? $newword : drupal_substr($newword, 0, $i) . '.+' . drupal_substr($newword, $i + $len); } ++$misses; $bad_positions[] = $i; } } // Only keep our rebuilt word if it meets our minimum spelling match score. if (($hits)/($len - 2) * 100 >= $min_spelling) { // 2 consecutive misses could indicate a missing letter, so find the likely // missing position and replace with the wildcard. if (count($bad_positions) >= 2) { foreach ($bad_positions as $bad_key => $bad_pos) { if ($bad_positions[$bad_key + 1] == ($bad_pos + 1)) { $newword = drupal_substr($newword, 0, $bad_pos) . '.' . drupal_substr($newword, $bad_pos + 1, $len); } } } // Remove consecutive wildcards and add word boundaries. $newword = preg_replace("/\.\./", ".+", $newword); $newword = preg_replace("/\.\+\.\+/", ".+", $newword); $newword = '\b\w*'. trim($newword, '.+') .'.+?\b'; $clean_words[$key] = $newword; } else { unset($clean_words[$key]); } } // Under 3 characters, so unset the word. else { unset($clean_words[$key]); } } } // Build a replacement node body containing sections of text with the found // words, with leading and trailing text. $node->body = strip_tags($node->body); $section = array(); $section_length = array(); foreach ($clean_words as $k => $word) { $location = 0; // If the word is found, add its position to $section. while (preg_match('/'. $word .'/iu', $node->body, $matches, PREG_OFFSET_CAPTURE, $location) && $word != '') { // Make sure we didn't traverse any word breaks by checking for spaces. 
// Pretty sure we don't need mb_stripos() here because we don't actually // care about the position if (!stripos($matches[0][0], ' ')) { $section[] = _fuzzysearch_char_count($node->body, $matches[0][1]); $section_length[$matches[0][1]] = drupal_strlen($word); $clean_words[$k] = $matches[0][0]; } // Increase $location by one so we don't find the previous location. $location = $matches[0][1] + 1; } } // Because we found words one by one, the locations are out of order. Sort // so that the locations are in natural order. asort($section); ksort($section_length); $section = array_values($section); $section_length = array_values($section_length); $p = 0; $found = $newbody = ''; $trail = $lead = $excerpt / 2; $start = $section[0]; while (isset($section[$p])) { // If the current section is within the previous, let's not create a new one // so we don't have any duplicate text. if ($section[$p] + $lead + $section_length[$p] + $trail > $section[$p + 1] && $section[$p + 1]) { $trail = $section[$p+1] + $section_length[$p+1] + $lead - $start; $p++; continue; } // Put an excerpt into our replacement node body, with the // found word in the center. $found = $start - $lead < 0 ? drupal_substr($node->body, 0, $excerpt) : drupal_substr($node->body, $start - $lead, $trail + $lead); if (variable_get('fuzzysearch_max_result', 0) && (strlen($newbody . $found) > variable_get('fuzzysearch_max_result', 0))) { break; } $newbody .= '...'. $found .'... '; $p++; $start = $section[$p]; $trail = $lead; } // Wrap the found words in a tag to highlight them. $newbody = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $clean_words) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '\0', $newbody); // If there are no result excerpts in the body, at least show the teaser. $node->body = $newbody == '' ? 
$node->teaser : $newbody; $results[] = $node; } else { $results[] = $node; } } return $results; }

/**
 * Page callback: gather results from the index and build the result page.
 *
 * @param $keys
 *   The search phrase. It is handed unchanged to the
 *   'fuzzysearch_show_results' theme hook.
 *
 * @return
 *   The output of theme('fuzzysearch_show_results', $keys).
 */
function fuzzysearch_show_results($keys = '') {
  return theme('fuzzysearch_show_results', $keys);
}

/**
 * Theme hook for rendering search results.
 *
 * Adds the module stylesheet, runs the search through fuzzysearch_process()
 * and assembles the page output from the results.
 *
 * @param $keys
 *   The search phrase. Also used (escaped) as the page title when $theme is
 *   NODE_THEME and there are results.
 * @param $theme
 *   When equal to NODE_THEME the search form, the themed results and a pager
 *   are rendered; for any other value only
 *   theme('fuzzysearch_results_title') is rendered.
 * @param $limit
 *   Passed to fuzzysearch_process() and to theme('pager').
 *
 * @return
 *   The assembled output string, or the translated "No matches were found."
 *   message when fuzzysearch_process() returns nothing.
 */
function theme_fuzzysearch_show_results($keys = '', $theme = NODE_THEME, $limit = 10) {
  drupal_add_css(drupal_get_path('module', 'fuzzysearch') .'/fuzzysearch.css', 'module');

  // Start from an empty string so the appends below never operate on an
  // undefined variable.
  $output = '';

  $results = fuzzysearch_process($keys, $theme, $limit);
  if ($results) {
    if ($theme == NODE_THEME) {
      drupal_set_title(check_plain($keys));
      $output .= '
';
      $output .= drupal_get_form('fuzzysearch_form', $keys);
      $output .= '
';
      $output .= theme('fuzzysearch_results', $results);
      $output .= theme('pager', NULL, $limit);
    }
    else {
      $output .= theme('fuzzysearch_results_title', $results);
    }
  }
  else {
    $output = '

'. t('No matches were found.') .'

';
  }

  return $output;
}

/**
 * Theme the search results.
 *
 * Sets the page title to "Search results" and renders every result through
 * theme('fuzzysearch_result'). When the 'fuzzysearch_debug_score' variable is
 * set, the completeness and score of each result are appended after it.
 *
 * @param $results
 *   An array of result objects. Each is passed to theme('fuzzysearch_result');
 *   ->completeness and ->score are read only in debug mode.
 *
 * @return
 *   The rendered results as a string.
 */
function theme_fuzzysearch_results($results) {
  drupal_set_title(t('Search results'));

  // Initialise the accumulator and the row counter explicitly; appending to
  // or incrementing an undefined variable raises a PHP notice.
  $output = '';
  $i = 0;

  $output .= '
';
  $output .= '
';
  foreach ($results as $result) {
    $i++;
    // NOTE(review): $odd is computed but no literal visible here uses it --
    // confirm whether row markup carrying the odd/even class is missing.
    $odd = $i%2 ? 'odd' : 'even';
    $output .= '
';
    $output .= theme('fuzzysearch_result', $result);
    if (variable_get('fuzzysearch_debug_score', FALSE)) {
      $output .= '

'. t('Completeness: ') . number_format($result->completeness) . t(' Score: ') . number_format($result->score) .'

';
    }
    $output .= '
';
  }
  $output .= '
';

  return $output;
}

/**
 * Theme the title-only variant of the search results.
 *
 * @param $results
 *   The search results. Not read by this implementation.
 *
 * @return
 *   The two literal fragments below, concatenated.
 */
function theme_fuzzysearch_results_title($results) {
  // Initialise before appending to avoid an undefined-variable notice.
  $output = '';
  $output .= '
';
  $output .= '
';
  return $output;
}

/**
 * Convert a byte offset within a UTF-8 string into a character index.
 *
 * Uses the counting approach of drupal_substr(): every byte that is not a
 * UTF-8 continuation byte (0x80-0xBF) starts a new character.
 *
 * @param $text
 *   The string the offset refers to.
 * @param $position
 *   Byte offset into $text, e.g. as reported by PREG_OFFSET_CAPTURE.
 *
 * @return
 *   The zero-based index of the character that contains byte $position, or
 *   -1 when $position is negative.
 */
function _fuzzysearch_char_count($text, $position) {
  $length = strlen($text);
  $bytes = -1;
  $chars = -1;
  while ($bytes < $position) {
    $bytes++;
    // Offsets at or past the end of the string are treated as a zero byte
    // instead of being read, which keeps the returned count the same as
    // before while avoiding an out-of-range string offset access.
    $c = $bytes < $length ? ord($text[$bytes]) : 0;
    if ($c < 0x80 || $c >= 0xC0) {
      $chars++;
    }
  }
  return $chars;
}

/**
 * Implementation of hook_watchdog().
 *
 * Experimental!:
 * If a 404 header has been sent, a watchdog entry is being logged and the
 * 'fuzzysearch_not_found' setting is enabled, take over the 404 page and
 * redirect to a search built from the request parameters.
 *
 * @param $log_entry
 *   The watchdog log entry; only its 'message' element is used, in the
 *   message shown to the user.
 */
function fuzzysearch_watchdog($log_entry) {
  if (stristr(drupal_get_headers(), '404 not found') && variable_get('fuzzysearch_not_found', FALSE)) {
    $query = '';
    foreach ($_GET as $term) {
      // Array-valued parameters (e.g. ?a[]=1) cannot be turned into search
      // words; skip them rather than appending the literal "Array".
      if (is_array($term)) {
        continue;
      }
      // Path separators become spaces so each path segment is a search word.
      $query .= str_replace('/', ' ', $term) .' ';
    }
    drupal_set_message(t('The page you requested: "@page," could not be found. A site search for that page found the following results:', array('@page' => $log_entry['message'])));
    drupal_goto('fuzzysearch/results/'. $query);
  }
}

/**
 * Remove stop words from search query and text to be indexed.
 *
 * The stop word list is read once per request from every
 * fuzzysearch_stopwords_*.txt file found under
 * sites/all/libraries/fuzzysearch/stopwords and cached in a static variable.
 * Both the files and $text are split on single spaces.
 *
 * @param $text
 *   The text to be stripped of stop words.
 *
 * @return
 *   $text with every space-separated token that exactly matches a stop word
 *   removed.
 */
function fuzzysearch_stopwords($text) {
  static $stop_words;

  if (!is_array($stop_words)) {
    $stop_words = array();
    $files = file_scan_directory('sites/all/libraries/fuzzysearch/stopwords', 'fuzzysearch_stopwords_.+\.txt', array(), 0, TRUE, 'name' );
    foreach ($files as $file) {
      $contents = file_get_contents($file->filename);
      // An unreadable file yields FALSE; skip it instead of registering an
      // empty string as a stop word.
      if ($contents === FALSE) {
        continue;
      }
      $stop_words = array_merge($stop_words, explode(' ', $contents));
    }
  }

  $text = explode(' ', $text);
  $text = array_diff($text, $stop_words);
  return implode(' ', $text);
}

/**
 * Recursive array_unique().
 *
 * Elements are compared by their serialize() output, so nested arrays can be
 * de-duplicated as well; the same is then applied to every array-valued
 * element that remains.
 *
 * @param $array
 *   The array to de-duplicate.
 *
 * @return
 *   The array without duplicate elements, at every nesting level.
 */
function fuzzysearch_unique($array) {
  $result = array_map("unserialize", array_unique(array_map("serialize", $array)));

  foreach ($result as $key => $value) {
    if ( is_array($value) ) {
      $result[$key] = fuzzysearch_unique($value);
    }
  }

  return $result;
}