'Fuzzysearch',
'description' => 'Fuzzysearch settings allow you to index certain node data',
'page callback' => 'fuzzysearch_admin',
'access arguments' => array('administer fuzzysearch'),
'type' => MENU_NORMAL_ITEM,
);
$items['fuzzysearch/results'] = array(
'title' => 'Search',
'page callback' => 'fuzzysearch_show_results',
'access arguments' => array('fuzzysearch content'),
'type' => MENU_DYNAMIC_ITEM,
);
$items['admin/reports/fuzzysearch'] = array(
'title' => 'Top fuzzysearch phrases',
'description' => 'View most popular fuzzysearch phrases.',
'page callback' => 'dblog_top',
'page arguments' => array('fuzzysearch'),
'access arguments' => array('access site reports'),
'file' => 'dblog.admin.inc',
'file path' => drupal_get_path('module', 'dblog'),
);
return $items;
}
/**
 * Implementation of hook_perm().
 *
 * Declares the two permissions this module checks: one for the settings
 * pages and one for running searches.
 */
function fuzzysearch_perm() {
  $permissions = array();
  $permissions[] = 'administer fuzzysearch';
  $permissions[] = 'fuzzysearch content';
  return $permissions;
}
/**
 * Implementation of hook_theme().
 *
 * Registers the theme hooks used by the search forms and the result listing.
 * Every hook declares its parameters under the 'arguments' key so that all
 * entries use the same registry key as the template-based
 * 'fuzzysearch_result' hook.
 *
 * @return
 *   An array of theme hook definitions keyed by hook name.
 */
function fuzzysearch_theme() {
  return array(
    'fuzzysearch_box_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_form' => array(
      'arguments' => array(
        'form' => NULL,
      ),
    ),
    'fuzzysearch_show_results' => array(
      'arguments' => array(
        'keys' => NULL,
      ),
    ),
    'fuzzysearch_results_title' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_results' => array(
      'arguments' => array(
        'results' => NULL,
      ),
    ),
    'fuzzysearch_result' => array(
      'template' => 'fuzzysearch-result',
      'arguments' => array('node' => NULL, 'teaser' => FALSE, 'page' => FALSE),
    ),
  );
}
/**
 * Implementation of hook_content_build_modes().
 *
 * Declares a "Fuzzy Search" group containing the search index and search
 * result build modes.
 */
function fuzzysearch_content_build_modes() {
  $group = array();
  $group['title'] = t('Fuzzy Search');
  $group['build modes'] = array();

  $group['build modes'][NODE_BUILD_SEARCH_INDEX] = array(
    'title' => t('Search Index'),
    'views style' => FALSE,
  );
  $group['build modes'][NODE_BUILD_SEARCH_RESULT] = array(
    'title' => t('Search Result'),
    'views style' => FALSE,
  );

  return array('fuzzysearch' => $group);
}
/**
 * Prepares the variables for a single search result.
 *
 * Adds the taxonomy links, displayed content, formatted date, links, author
 * name, node URL, terms and escaped title, then merges in every member of
 * the node object (existing variables win on key collisions) and finally
 * sets the submitted line and user picture according to the theme settings.
 *
 * @param $variables
 *   The variables array, modified in place. Reads 'node' and 'teaser'.
 */
function fuzzysearch_preprocess_fuzzysearch_result(&$variables) {
  $node = $variables['node'];
  $inline_links = array('class' => 'links inline');

  $variables['taxonomy'] = module_exists('taxonomy') ? taxonomy_link('taxonomy terms', $node) : array();

  // Prefer the teaser when one was requested and exists, then the body.
  $content = '';
  if ($variables['teaser'] && $node->teaser) {
    $content = $node->teaser;
  }
  elseif (isset($node->body)) {
    $content = $node->body;
  }
  $variables['content'] = $content;

  $variables['date'] = format_date($node->created);
  if (!empty($node->links)) {
    $variables['links'] = theme('links', $node->links, array('class' => 'links inline'));
  }
  else {
    $variables['links'] = '';
  }
  $variables['name'] = theme('username', $node);
  $variables['node_url'] = url('node/'. $node->nid);
  $variables['terms'] = theme('links', $variables['taxonomy'], $inline_links);
  $variables['title'] = check_plain($node->title);

  // Expose the node's own members as variables too; values computed above
  // take precedence over node members of the same name.
  $variables = array_merge((array)$node, $variables);

  // Author information is shown only for node types that have it enabled.
  $show_info = theme_get_setting('toggle_node_info_'. $node->type);
  $variables['submitted'] = $show_info ? theme('node_submitted', $node) : '';
  $variables['picture'] = ($show_info && theme_get_setting('toggle_node_user_picture')) ? theme('user_picture', $node) : '';
}
/**
 * Build the administration settings panel.
 *
 * Shows how much of the site has been indexed (derived from the node count
 * and the number of rows left in the index queue), followed by the settings
 * form and the scoring form.
 *
 * @return
 *   The rendered page output.
 */
function fuzzysearch_admin() {
  $total = db_result(db_query("SELECT COUNT(*) FROM {node}"));
  $remaining = db_result(db_query("SELECT COUNT(*) FROM {fuzzysearch_index_queue}"));
  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are @count items left to index.');
  // max(1, $total) avoids a division by zero on a site without nodes.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) .'%';
  $status = "\n". t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count ."\n";

  // Start the output explicitly instead of appending to an undefined variable.
  $output = $status;
  $output .= drupal_get_form('fuzzysearch_admin_form');
  $output .= drupal_get_form('fuzzysearch_scoring');
  return $output;
}
/**
 * Module administration form (index, search and display settings).
 *
 * Defaults are read from the module's persistent variables. All labels,
 * including the "Rebuild index" checkbox and the submit button, are passed
 * through t() so they can be translated.
 *
 * @return
 *   The form array.
 */
function fuzzysearch_admin_form() {
  $form = array();

  $form['index'] = array(
    '#title' => t('Index settings'),
    '#type' => 'fieldset',
  );
  $form['index']['ngram_length'] = array(
    '#type' => 'textfield',
    '#title' => t('Ngram length'),
    '#size' => 1,
    '#maxlength' => 1,
    '#description' => t('Fuzzysearch breaks down words into ngrams, or pieces of text this long, for indexing and searching. Default value is 3. You must reindex for changes to take effect.'),
    '#default_value' => variable_get('fuzzysearch_ngram_length', 3),
  );
  $form['index']['index_cron'] = array(
    '#type' => 'textfield',
    '#title' => t('Nodes to index per cron run'),
    '#size' => 4,
    '#maxlength' => 4,
    '#description' => t('This sets the number of nodes to index per cron run. A high number may cause PHP to timeout.'),
    '#default_value' => variable_get('fuzzysearch_index_cron', 150),
  );
  $form['index']['reindex'] = array(
    '#type' => 'checkbox',
    '#title' => t('Rebuild index'),
    '#description' => t('Check the box and click submit to re-index all nodes on the site. Re-indexing will begin with the next cron run.'),
    '#default_value' => FALSE,
  );

  $form['search'] = array(
    '#title' => t('Search settings'),
    '#type' => 'fieldset',
  );
  $form['search']['completeness'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum completeness'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Enter value between 0 and 100 to set the minimum match completeness required in the returned results.'),
    '#default_value' => variable_get('fuzzysearch_min_completeness', 40),
  );

  $form['display'] = array(
    '#title' => t('Display settings'),
    '#type' => 'fieldset',
  );
  $form['display']['debug_score'] = array(
    '#type' => 'checkbox',
    '#title' => t('Display scoring'),
    '#description' => t('If selected, the completeness and score of the results will be shown below each result'),
    '#default_value' => variable_get('fuzzysearch_debug_score', FALSE),
  );
  $form['display']['excerpt'] = array(
    '#type' => 'textfield',
    '#title' => t('Result excerpt length'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Set the length of the displayed text excerpt containing a found search term. Applies per found term.'),
    '#default_value' => variable_get('fuzzysearch_excerpt', 200),
  );
  $form['display']['max_result'] = array(
    '#type' => 'textfield',
    '#title' => t('Maximum result length'),
    '#size' => 4,
    '#maxlength' => 4,
    '#description' => t('Set the maximum length of the displayed result. Set to 0 for unlimited length. Applies per result.'),
    '#default_value' => variable_get('fuzzysearch_max_result', 0),
  );
  $form['display']['spelling'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum spelling score'),
    '#size' => 3,
    '#maxlength' => 3,
    '#description' => t('Fuzzysearch tries to highlight search terms that may be misspelled. You can set the minimum threshold, which is calculated as a ratio of ngram hits to misses in a term. 0 may cause a misspelling to highlight everything, and 100 will only highlight exact terms. Enter value between 0 and 100. Changing this setting does not require reindexing.'),
    '#default_value' => variable_get('fuzzysearch_spelling', 30),
  );
  $options = node_get_types('names');
  $form['display']['nodetypes'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Check any node types you want to exclude from search results'),
    '#description' => t('This does not affect how Fuzzy Search indexes your content types.'),
    '#default_value' => variable_get('fuzzysearch_nodetypes', array('')),
    '#options' => $options,
  );
  $form['display']['not_found'] = array(
    '#type' => 'checkbox',
    '#title' => t('Experimental: Let fuzzysearch handle 404 Page Not Found errors'),
    '#description' => t('If selected, fuzzysearch will interrupt the normal page not found process and show the site user results using everything after http://example.com/ as search terms. Any time drupal core or a contributed module sets a 404 header and writes the not found to watchdog(), fuzzysearch will take over after the watchdog entry is created. Please Note: This has the potential to be a big performance drain on your site.'),
    '#default_value' => variable_get('fuzzysearch_not_found', FALSE),
  );

  $form['submit'] = array(
    '#type' => 'submit',
    '#value' => t('Submit'),
    '#weight' => 10,
  );
  return $form;
}
/**
 * Validation handler for fuzzysearch_admin_form().
 *
 * Every text setting must be numeric. Each error is reported against the
 * element that actually failed, so the right field is highlighted and an
 * error on one field cannot hide an error on another.
 */
function fuzzysearch_admin_form_validate($form, &$form_state) {
  $values = $form_state['values'];

  // Element name => message shown when the submitted value is not numeric.
  $numeric_fields = array(
    'ngram_length' => t('Ngram length must be an integer.'),
    'index_cron' => t('Cron index number must be an integer.'),
    'completeness' => t('Minimum match completeness must be an integer.'),
    'excerpt' => t('Excerpt length must be an integer.'),
    'max_result' => t('Maximum result length must be an integer.'),
    'spelling' => t('Minimum spelling score must be an integer.'),
  );
  foreach ($numeric_fields as $name => $message) {
    if (!is_numeric($values[$name])) {
      form_set_error($name, $message);
    }
  }

  // The length comparison only makes sense once both values are numeric;
  // a maximum of 0 means "unlimited" and never triggers the warning.
  if (is_numeric($values['excerpt']) && is_numeric($values['max_result'])
    && $values['max_result'] != 0 && $values['excerpt'] > $values['max_result']) {
    drupal_set_message(t('The maximum result length has been set smaller than the excerpt length. Any results will display the result\'s teaser instead of the found excerpt.'), 'warning');
  }
}
/**
 * Submit handler for fuzzysearch_admin_form().
 *
 * When "Rebuild index" is checked, empties the index queue and queues every
 * node again. The submitted settings are then stored as persistent
 * variables.
 */
function fuzzysearch_admin_form_submit($form, &$form_state) {
  $values = $form_state['values'];

  if ($values['reindex']) {
    // Refresh the index queue.
    db_query("DELETE FROM {fuzzysearch_index_queue}");
    $query = db_query("SELECT nid FROM {node}");
    while ($row = db_fetch_object($query)) {
      fuzzysearch_reindex($row->nid, 'fuzzysearch');
    }
    drupal_set_message(t('Nodes ready for reindexing, please run cron to update the index.'));
  }

  // Persistent variable name => form element holding its value.
  $settings = array(
    'fuzzysearch_ngram_length' => 'ngram_length',
    'fuzzysearch_index_cron' => 'index_cron',
    'fuzzysearch_min_completeness' => 'completeness',
    'fuzzysearch_excerpt' => 'excerpt',
    'fuzzysearch_max_result' => 'max_result',
    'fuzzysearch_spelling' => 'spelling',
    'fuzzysearch_debug_score' => 'debug_score',
    'fuzzysearch_nodetypes' => 'nodetypes',
    'fuzzysearch_not_found' => 'not_found',
  );
  foreach ($settings as $variable => $element) {
    variable_set($variable, $values[$element]);
  }
}
/**
 * Implementation of hook_nodeapi().
 *
 * Queues a node for indexing when it is inserted or updated. When a node is
 * deleted, removes it from the index and also from the index queue, so that
 * a later cron run does not try to index a node that no longer exists.
 */
function fuzzysearch_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
  switch ($op) {
    case 'update':
    case 'insert':
      fuzzysearch_reindex($node->nid, 'fuzzysearch');
      break;

    case 'delete':
      db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $node->nid);
      db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $node->nid);
      break;
  }
}
/**
 * Form for the multipliers applied to scores from hook_fuzzysearch_score().
 *
 * Invokes the 'fuzzysearch_score' hook with the 'settings' operation and
 * adds one select element per returned score modifier. Each modifier is
 * expected to provide 'id', 'title' and 'description'.
 */
function fuzzysearch_scoring() {
  // Multipliers run from 10 (maximum impact on the score) down to 0 (none).
  $multipliers = array_combine(range(10, 0), range(10, 0));

  $fieldset = array(
    '#title' => t('Scoring adjustment'),
    '#description' => t('Choose a multiplier for each of the score factors. Changing these settings will require all content to be reindexed.'),
    '#type' => 'fieldset',
  );

  foreach (module_invoke_all('fuzzysearch_score', 'settings', NULL) as $modifier) {
    $id = $modifier['id'];
    $fieldset[$id] = array(
      '#title' => $modifier['title'],
      '#description' => $modifier['description'],
      '#type' => 'select',
      '#options' => $multipliers,
      '#default_value' => variable_get('fuzzysearch_scoring_'. $id, 5),
    );
  }

  $fieldset['submit'] = array(
    '#value' => t('Update score factors'),
    '#type' => 'submit',
  );

  $form = array();
  $form['scoring'] = $fieldset;
  return $form;
}
/**
 * Save the score modifiers as set in the administrative form.
 *
 * Only the score multipliers are stored. The Form API's own entries in the
 * submitted values (button, token, form ids) are skipped so they are not
 * written as bogus "fuzzysearch_scoring_*" variables.
 */
function fuzzysearch_scoring_submit($form, &$form_state) {
  $internal_keys = array('op', 'submit', 'form_token', 'form_id', 'form_build_id');
  foreach ($form_state['values'] as $key => $value) {
    // Strict comparison: a numeric key must never loosely match a name.
    if (!in_array($key, $internal_keys, TRUE)) {
      variable_set('fuzzysearch_scoring_'. $key, $value);
    }
  }
  drupal_set_message(t('Score factor multipliers have been updated'));
}
/**
 * External API function that allows modules to flag a node for reindexing.
 *
 * Adds the node to the index queue with the current time, unless the queue
 * lookup for that nid already returns a value.
 *
 * @param $nid
 *   Nid of the node to be reindexed.
 * @param $module
 *   Name of the module flagging the node.
 */
function fuzzysearch_reindex($nid, $module) {
  $already_queued = db_result(db_query("SELECT * FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid));
  if ($already_queued) {
    return;
  }
  db_query("INSERT INTO {fuzzysearch_index_queue} (nid, module, timestamp) VALUES (%d, '%s', %d)", $nid, $module, time());
}
/**
 * Implementation of hook_cron().
 *
 * Indexes a batch of queued nodes; the batch size comes from the
 * 'fuzzysearch_index_cron' setting (150 when unset).
 */
function fuzzysearch_cron() {
  $batch_size = variable_get('fuzzysearch_index_cron', 150);
  $queued = db_query_range("SELECT nid FROM {fuzzysearch_index_queue}", 0, $batch_size);
  for ($row = db_fetch_object($queued); $row; $row = db_fetch_object($queued)) {
    fuzzysearch_index($row->nid);
  }
}
/**
 * Index the node data in the fuzzy index table.
 *
 * Removes the node's existing index rows, collects its title, rendered body
 * and any text returned by the nodeapi 'update index' operation, weights
 * each word by the HTML tag it appears in, and writes the words as ngrams
 * through fuzzysearch_index_insert(). The node is removed from the index
 * queue when processing ends, whether or not it was indexed.
 *
 * @param $nid
 *   The node id of the node being indexed.
 * @return
 *   TRUE when the node was indexed, FALSE when the node could not be loaded
 *   or a hook_fuzzysearch_index() implementation returned an empty value.
 */
function fuzzysearch_index($nid) {
  // First step is removing the past index.
  db_query("DELETE FROM {fuzzysearch_index} WHERE nid = %d", $nid);

  $node = node_load($nid);
  if (!$node) {
    // The node no longer exists; drop it from the queue instead of trying to
    // build content for it.
    db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
    return FALSE;
  }

  // Let modules alter a node before indexing or prevent it from being indexed.
  // See readme.txt.
  foreach (module_implements('fuzzysearch_index') as $name) {
    $function = $name .'_fuzzysearch_index';
    $node = $function($node);
    if (!$node) {
      // A module vetoed indexing; the node still leaves the queue.
      db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
      return FALSE;
    }
  }

  // Index node title.
  $text = ' '. $node->title ."\n";

  // Build and index the node body.
  $node->build_mode = NODE_BUILD_SEARCH_INDEX;
  $node = node_build_content($node, FALSE, FALSE);
  $node->body = drupal_render($node->content);
  $text .= $node->body;

  // Implementation of nodeapi's update_index op.
  $outside_text = module_invoke_all('nodeapi', $node, 'update index', NULL, NULL);
  if ($outside_text) {
    foreach ($outside_text as $extra) {
      $text .= ' '. $extra;
    }
  }

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array(
    'h1' => 10,
    'h2' => 9,
    'h3' => 8,
    'h4' => 7,
    'h5' => 6,
    'h6' => 5,
    'u' => 2,
    'b' => 2,
    'i' => 2,
    'strong' => 2,
    'em' => 2,
    'a' => 5,
  );

  // Strip off all ignored tags to speed up processing.
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Hook_fuzzysearch_filter lets modules filter text. This should be used for
  // more complex filtering. Stop words should not use this. Create a stopword
  // file instead. See fuzzysearch/stopwords/README.txt.
  foreach (module_implements('fuzzysearch_filter') as $name) {
    $function = $name .'_fuzzysearch_filter';
    $text = $function('index', $text);
  }

  // Allow other modules to modify the score of the node based on their own
  // calculations. The sum of all scores is later multiplied by the score of
  // each word, so all scoring is done at indexing time.
  $hook_scores = module_invoke_all('fuzzysearch_score', 'index', $node);
  $hook_score = 0;
  foreach ($hook_scores as $score) {
    $multiplier = variable_get('fuzzysearch_scoring_'. $score['id'], 5);
    $hook_score += $score['score'] * $multiplier;
  }

  // Begin indexing content.
  // Find all words not located within tags (score = 1).
  $content = preg_replace('/<([A-Z][A-Z0-9]*)[^>]*>(.*?)<\/\1>/i', '', $text);
  $content = fuzzysearch_cleanse($content);

  // Remove stopwords.
  // NOTE(review): this runs after the untagged words were extracted into
  // $content above, so it appears to affect only the tagged text handled
  // below - confirm whether untagged text should be filtered as well.
  $text = fuzzysearch_stopwords($text);

  // Word => accumulated score, in order of first appearance. Keying by word
  // keeps distinct words apart even when they look numerically equal.
  $word_scores = array();
  foreach (preg_split('/\s/', $content, -1, PREG_SPLIT_NO_EMPTY) as $word) {
    $word_scores[$word] = isset($word_scores[$word]) ? $word_scores[$word] + 1 : 1;
  }

  // Find all words located within tags (score > 1).
  preg_match_all('/<([A-Z][A-Z0-9]*)([^>]*)>(.*?)<\/\1>/i', $text, $tagged);
  foreach ($tagged[3] as $key => $tagged_content) {
    // The pattern is case-insensitive, so normalise the tag name before
    // looking up its weight; tags without a weight count like plain text.
    $tag = strtolower($tagged[1][$key]);
    $tag_score = isset($tags[$tag]) ? $tags[$tag] : 1;
    $tagged_words = preg_split('/\s/', fuzzysearch_cleanse($tagged_content), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($tagged_words as $word) {
      $word_scores[$word] = isset($word_scores[$word]) ? $word_scores[$word] + $tag_score : $tag_score;
    }
  }

  $word_id = NULL;
  foreach ($word_scores as $word => $word_score) {
    // Each word gets a word_id, which comes from the last value in the id
    // column, which is serial. When no id is known yet, a dummy row is
    // inserted and removed first so that a last insert id exists; this
    // avoids a PostgreSQL error.
    if (!$word_id) {
      db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (0, 0, 'xxx', 0, 0)");
      db_query("DELETE FROM {fuzzysearch_index} WHERE (nid = 0 AND word_id = 0 AND ngram = 'xxx' AND completeness = 0 AND score = 0)");
    }
    $word_id = db_last_insert_id('fuzzysearch_index', 'id');
    // Array keys that look like integers are stored as integers; cast back.
    fuzzysearch_index_insert((string) $word, $word_id, $nid, $word_score, $hook_score);
  }

  // The node has been processed; remove it from the queue.
  db_query("DELETE FROM {fuzzysearch_index_queue} WHERE nid = %d", $nid);
  return TRUE;
}
/**
 * Insert a word into the index as one row per ngram.
 *
 * Words longer than the configured ngram length are split into overlapping
 * ngrams that share the word's completeness equally; shorter or equally
 * long words are stored whole with a completeness of 100.
 *
 * @param $word
 *   Word to insert into the index.
 * @param $word_id
 *   Identifier stored with every ngram row of this word.
 * @param $nid
 *   The node id that is to be associated with this word.
 * @param $word_score
 *   Score given to the word based on the tag it is in.
 * @param $node_score
 *   Score modifier given to the node; ignored unless greater than 0.
 */
function fuzzysearch_index_insert($word, $word_id, $nid, $word_score, $node_score) {
  $word_length = drupal_strlen($word);
  $ngram_length = variable_get('fuzzysearch_ngram_length', 3);

  // Having all score modifiers set to 0 must not affect the natural scoring.
  $score = ($node_score > 0) ? $word_score * $node_score : $word_score;

  if (!($word_length > $ngram_length)) {
    // The word fits in a single ngram, so it is complete.
    db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (%d, %d, '%s', %f, %f)",
      $nid, $word_id, $word, 100, $score);
    return;
  }

  // Each ngram carries an equal share of the word's completeness.
  $completeness = 100 / ($word_length - $ngram_length + 1);
  for ($offset = 0; $offset < ($word_length - $ngram_length + 1); $offset++) {
    $ngram = drupal_substr($word, $offset, $ngram_length);
    db_query("INSERT INTO {fuzzysearch_index} (nid, word_id, ngram, completeness, score) VALUES (%d, %d, '%s', %f, %f)",
      $nid, $word_id, $ngram, $completeness, $score);
  }
}
/**
 * Implementation of hook_comment().
 *
 * Queues the comment's node for reindexing whenever a comment is added,
 * changed, deleted, published or unpublished. The comment may arrive as an
 * array or as an object.
 */
function fuzzysearch_comment($a1, $op) {
  $reindex_ops = array('insert', 'update', 'delete', 'publish', 'unpublish');
  if (!in_array($op, $reindex_ops)) {
    return;
  }
  $nid = is_array($a1) ? $a1['nid'] : $a1->nid;
  fuzzysearch_reindex($nid, 'fuzzysearch');
}
/**
 * Normalise text for indexing and searching.
 *
 * Removes HTML tags, lower-cases the text and replaces every run of
 * characters from PREG_CLASS_SEARCH_EXCLUDE with a single space.
 */
function fuzzysearch_cleanse($text) {
  $plain = drupal_strtolower(strip_tags($text));
  $excluded = '/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u';
  return preg_replace($excluded, ' ', $plain);
}
/**
 * Compact search form (keyword field plus button), as used in the block.
 *
 * Submission is handled by fuzzysearch_form_submit().
 */
function fuzzysearch_box_form() {
  $keys = array(
    '#type' => 'textfield',
    '#size' => 15,
    '#default_value' => '',
  );
  $button = array(
    '#type' => 'submit',
    '#value' => t('Search'),
  );
  return array(
    'keys' => $keys,
    'submit' => $button,
    '#submit' => array('fuzzysearch_form_submit'),
  );
}
/**
 * Theme the output of the search block form.
 *
 * Renders the keyword field and the submit button first, then whatever is
 * left in the form.
 *
 * @param $form
 *   The form array to render.
 * @return
 *   The rendered form.
 */
function theme_fuzzysearch_box_form($form) {
  // Assign rather than append: $output does not exist before this line.
  $output = drupal_render($form['keys']) . drupal_render($form['submit']) ."\n";
  $output .= drupal_render($form);
  return $output;
}
/**
 * Implementation of hook_block().
 *
 * Provides two blocks: delta 0 is the search form; delta 1 shows results for
 * the keys passed in the 'fuzzysearch' query string parameter and has a
 * result limit and display method that can be configured.
 *
 * @param $op
 *   The operation: 'list', 'view', 'configure' or 'save'.
 * @param $delta
 *   The block being operated on.
 * @param $edit
 *   The submitted block settings ('save' only).
 */
function fuzzysearch_block($op = 'list', $delta = 0, $edit = array()) {
  switch ($op) {
    case 'list':
      $blocks = array();
      $blocks[0]['info'] = t('Fuzzy search form');
      $blocks[1]['info'] = t('Fuzzy search title query');
      return $blocks;

    case 'view':
      if (user_access('fuzzysearch content')) {
        switch ($delta) {
          case 0:
            $block = array();
            $block['content'] = drupal_get_form('fuzzysearch_box_form');
            $block['subject'] = t('Search');
            return $block;

          case 1:
            // Always return an array; it stays empty when there is no usable
            // query in the URL.
            $block = array();
            if (!empty($_GET['fuzzysearch']) && is_string($_GET['fuzzysearch'])) {
              $block['content'] = theme('fuzzysearch_show_results', check_plain($_GET['fuzzysearch']), variable_get('fuzzysearch_block_theme', BLOCK_THEME), variable_get('fuzzysearch_block_limit', 5));
            }
            return $block;
        }
      }
      break;

    case 'configure':
      switch ($delta) {
        case 1:
          $form = array();
          $form['fuzzysearch_block_limit'] = array(
            '#type' => 'select',
            '#title' => t('Number of results to display'),
            '#default_value' => variable_get('fuzzysearch_block_limit', 5),
            '#options' => drupal_map_assoc(range(1, 15)),
          );
          $form['fuzzysearch_block_theme'] = array(
            '#type' => 'radios',
            '#title' => t('Display method'),
            // Same fallback as the 'view' operation, so the form shows the
            // display method that is actually in effect.
            '#default_value' => variable_get('fuzzysearch_block_theme', BLOCK_THEME),
            '#options' => array(BLOCK_THEME => t('Titles'), NODE_THEME => t('Nodes')),
            '#description' => t('Show titles only or node theme.'),
          );
          return $form;
      }
      break;

    case 'save':
      switch ($delta) {
        case 1:
          variable_set('fuzzysearch_block_limit', $edit['fuzzysearch_block_limit']);
          variable_set('fuzzysearch_block_theme', $edit['fuzzysearch_block_theme']);
          break;
      }
      break;
  }
}
/**
 * Full-size search form shown with the results.
 *
 * @param $form_state
 *   The form state (not used here).
 * @param $keys
 *   Search phrase used to prefill the keyword field.
 */
function fuzzysearch_form($form_state, $keys = '') {
  $phrase_field = array(
    '#title' => t('Enter search phrase'),
    '#type' => 'textfield',
    '#size' => 35,
    '#default_value' => $keys,
  );
  $button = array(
    '#type' => 'submit',
    '#value' => t('Search'),
  );
  return array(
    'keys' => $phrase_field,
    'submit' => $button,
  );
}
/**
 * Redirect to the results callback with the keys in the path, so that a
 * search can be linked to.
 */
function fuzzysearch_form_submit($form, &$form_state) {
  // The search form needs to control where the user lands after submitting.
  // A destination already present in the request (set, for example, by
  // drupal_access_denied() or drupal_not_found()) would take precedence over
  // our redirect, so it is discarded first.
  // See http://drupal.org/node/292565.
  if (isset($_REQUEST['destination'])) {
    unset($_REQUEST['destination']);
  }
  if (isset($_REQUEST['edit']['destination'])) {
    unset($_REQUEST['edit']['destination']);
  }

  $keys = $form_state['values']['keys'];
  $form_state['redirect'] = 'fuzzysearch/results/'. $keys;
}
/**
 * Output formatting for the search form.
 *
 * Renders the keyword field and the submit button first, then whatever is
 * left in the form.
 *
 * @param $form
 *   The form array to render.
 * @return
 *   The rendered form.
 */
function theme_fuzzysearch_form($form) {
  // Assign rather than append: $output does not exist before this line.
  $output = drupal_render($form['keys']) . drupal_render($form['submit']) ."\n";
  $output .= drupal_render($form);
  return $output;
}
/**
* Process the search query
*/
function fuzzysearch_process($query, $theme = NODE_THEME, $limit = 10) {
global $user;
global $multibyte;
// if no keys were entered do not display anything below the search form
if (!$query) {
return;
}
// Sanitize query again because it can be submitted from url as well as form.
$query = fuzzysearch_cleanse($query);
// Log the search keys:
watchdog('fuzzysearch', '%query', array('%query' => $query), WATCHDOG_NOTICE, l(t('results'), 'fuzzysearch/results/'. $query));
// Hook_fuzzysearch_filter lets modules filter text. This should be used for
// more complex filtering. Stop words should not use this. Create a stopword
// file instead. See fuzzysearch/stopwords/README.txt.
foreach (module_implements('fuzzysearch_filter') as $name) {
$function = $name .'_fuzzysearch_filter';
$query = $function('search', $query);
}
// Remove stopwords.
$query = fuzzysearch_stopwords($query);
$nlength = variable_get('fuzzysearch_ngram_length', 3);
$min_spelling = variable_get('fuzzysearch_spelling', 30);
$excerpt = variable_get('fuzzysearch_excerpt', 200);
$boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))';
$words = explode(' ', $query);
// Build the WHERE clause for the ngrams.
foreach ($words as $k => $word) {
// @todo Change type of query based on boolean operators
$length = drupal_strlen($word);
if ($length > $nlength) {
// Determine lengths which we want to search for
if ($length > 3) {
// 5 letter word matches down to 4 letter words
$comp = 1 / (4 - $nlength + 1);
}
if ($length > 5) {
// 6 and 7 letter words match down to 5 letter words
$comp = 1 / (5 - $nlength + 1);
}
if ($length > 7) {
// anything larger matches down 2 letters than its own length
$comp = 1 / ($length - 2 - $nlength + 1);
}
$comp = number_format($comp + .001, 3) * 100;
for ($i=0; $i < $length - 2; $i++) {
$clause .= " (ngram = '". drupal_substr($word, $i, $nlength) ."' AND completeness < ". $comp .") OR";
}
}
// Words as long as ngrams length are inserted as is.
else {
$clause .= " ngram = '". $word ."' OR";
}
}
$clause = preg_replace("/ OR$/", '', $clause);
// @todo: Fix the minimum completeness so that a single qgram match doesn't necessarily return a match
$min_completeness = check_plain(variable_get('fuzzysearch_min_completeness', 40));
// Get content types to exclude from results. They are still indexed.
$types = array_filter(variable_get('fuzzysearch_nodetypes', array('')));
// Build the query args and placeholders.
$args[] = $min_completeness;
$args += $types;
$placeholders = count($types) ? db_placeholders($types, 'text'): '\'\'';
// Main query
$sql = "SELECT n.nid, MAX(n.moderate) AS moderate, MAX(n.uid) AS uid, MAX(n.type) AS type, MAX(n.status) AS status, SUM(subpercent) AS percent, SUM(subscore) AS score
FROM (SELECT DISTINCT word_id, nn.nid, SUM(completeness) AS subpercent, SUM(score) AS subscore
FROM {fuzzysearch_index} s
LEFT JOIN {node} nn ON (nn.nid = s.nid)
WHERE (($clause))
GROUP BY word_id, nn.nid HAVING SUM(completeness) >= %d) AS q
LEFT JOIN {node} n on n.nid = q.nid
WHERE n.status = 1
AND n.type NOT IN ($placeholders)
GROUP BY n.nid ORDER BY percent DESC, score DESC";
// Count query
$sql_count = "SELECT COUNT(DISTINCT(n.nid))
FROM (SELECT nn.type, nn.uid, nn.moderate, nn.nid, CEILING(SUM(completeness)) AS completeness, SUM(score) AS score
FROM {fuzzysearch_index} AS s
LEFT JOIN {node} nn on s.nid = nn.nid
WHERE $clause
GROUP BY word_id, s.nid, nn.type, nn.uid, nn.moderate, nn.nid
HAVING SUM(completeness) >= %d) AS q
LEFT JOIN {node} n on n.nid = q.nid
WHERE n.status = 1
AND n.type NOT IN ($placeholders)";
$sql = db_rewrite_sql($sql);
$sql_count = db_rewrite_sql($sql_count);
$block_limit = $theme == BLOCK_THEME ? variable_get('fuzzysearch_block_limit', 5) : 0;
if ($block_limit) {
$pager_results = db_query($sql. ' LIMIT '. $block_limit, $args);
}
else {
$pager_results = pager_query($sql, $limit, 0, $sql_count, $args);
}
// Load the matched nodes.
while ($row = db_fetch_object($pager_results)) {
$node = node_load($row->nid);
$node->score = $row->score;
$node->completeness = $row->percent;
// If this is just a title search, we can skip all the processing below.
if ($theme == 1) {
// Build the node body. This grabs cck field labels and values. Remove
// double spaces added for html legibility by cck.
$node->build_mode = NODE_BUILD_SEARCH_RESULT ;
$node = node_build_content($node, FALSE, FALSE);
$node->body = preg_replace("/ +/"," ", drupal_render($node->content));
// Add the comments to the node for highlighting.
if (function_exists('comment_render') && $node->comment && user_access('access comments')) {
$comments = db_query('SELECT subject, comment FROM {comments} WHERE nid = %d AND status = %d', $node->nid, COMMENT_PUBLISHED);
while($comment = db_fetch_object($comments)) {
$node->body .= ' '. strip_tags($comment->subject) .' '. strip_tags($comment->comment);
}
}
// Query the matched nodes for the search ngrams. We use this for fuzzy
// highlighting of misspelled words. We do this per node to narrow
// the possible false ngrams when a misspelled ngram matches a real one.
// This could still return some false ngrams, but that's why it's fuzzy.
$sql_ngrams = "
SELECT s.ngram, s.word_id, s.completeness
FROM {fuzzysearch_index} s
LEFT JOIN {node} n ON (n.nid = s.nid)
WHERE (($clause) AND n.nid = $row->nid AND n.status = 1
AND n.type NOT IN ($placeholders))";
$ngrams = db_query($sql_ngrams, $args);
$clean_grams = array();
$i = 0;
while ($ngram = db_fetch_array($ngrams)) {
$clean_grams[$ngram['ngram']][] = $ngram;
$i++;
}
// Ngrams can occur multiple times, so filter.
$clean_grams = fuzzysearch_unique($clean_grams);
// This will hold our search terms.
$clean_words = explode(' ', $query);
// Now we rebuild the words stripping out misspelled ngrams.
foreach ($clean_words as $key => $clean_word) {
// If we have an exact match, let's skip the work to check for misspellings.
if (!preg_match('/\b'. $clean_word .'\b/iu', $node->body)) {
$pos = array();
$id_count = array();
$bad_positions = array();
$len = drupal_strlen($clean_word);
// Ignore search terms under 3 characters.
if ($len >= 3) {
// Get the position of each good hit.
foreach ($clean_grams as $n => $gram) {
if ($multibyte == UNICODE_MULTIBYTE) {
if (mb_stripos($clean_word, $n) !== FALSE) {
$pos[mb_stripos($clean_word, $n)] = $n;
// Keep count of our word ids so we can try to guess which word
// we are trying to match.
foreach ($clean_grams[$n] as $ngram_data) {
$id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1;
}
}
}
// No multibyte.
else {
if (stripos($clean_word, $n) !== FALSE) {
$pos[stripos($clean_word, $n)] = $n;
// Keep count of our word ids so we can try to guess which word
// we are trying to match.
foreach ($clean_grams[$n] as $ngram_data) {
$id_count[$ngram_data['word_id']] = $id_count[$ngram_data['word_id']] + 1;
}
}
}
}
ksort($pos);
// This gives us an array with the most common word_id as the first
// element.
arsort($id_count);
$id_count = array_keys($id_count);
// Remove any position matches that are not in our likely word (the
// word with the highest word_id count).
foreach ($pos as $position => $pgram) {
$pmatch = FALSE;
foreach ($clean_grams[$pgram] as $pid) {
if ($pid['word_id'] == $id_count[0]){
$pmatch = TRUE;
}
}
if (!$pmatch) {
unset($pos[$position]);
}
}
// Start with a dummy word at the right length, but only if there are
// some matching ngram hits.
$newword = '';
if (count($pos)) {
$newword = str_pad('', $len, '.');
}
$hits = $misses = $i = 0;
// Check character by character for ngram matches. We don't need to check
// beyond the first character of the ngram.
for ($i = 0; $i <= $len - $nlength; $i++) {
// This is a match, so insert it into our dummy word.
if (isset($pos[$i])) {
$newword = drupal_substr($newword, 0, $i) . $pos[$i] . drupal_substr($newword, $i + $nlength, $len);
++$hits;
}
// This is a miss, so replace with a wildcard.
else {
// But don't overwrite a letter, only a '.' .
if(drupal_substr($newword, $i, 1) == '.') {
$newword = ($i == 0 || $i > $len - $nlength) ? $newword : drupal_substr($newword, 0, $i) . '.+' . drupal_substr($newword, $i + $len);
}
++$misses;
$bad_positions[] = $i;
}
}
// Only keep our rebuilt word if it meets our minimum spelling match score.
if (($hits)/($len - 2) * 100 >= $min_spelling) {
// 2 consecutive misses could indicate a missing letter, so find the likely
// missing position and replace with the wildcard.
if (count($bad_positions) >= 2) {
foreach ($bad_positions as $bad_key => $bad_pos) {
if ($bad_positions[$bad_key + 1] == ($bad_pos + 1)) {
$newword = drupal_substr($newword, 0, $bad_pos) . '.' . drupal_substr($newword, $bad_pos + 1, $len);
}
}
}
// Remove consecutive wildcards and add word boundaries.
$newword = preg_replace("/\.\./", ".+", $newword);
$newword = preg_replace("/\.\+\.\+/", ".+", $newword);
$newword = '\b\w*'. trim($newword, '.+') .'.+?\b';
$clean_words[$key] = $newword;
}
else {
unset($clean_words[$key]);
}
}
// Under 3 characters, so unset the word.
else {
unset($clean_words[$key]);
}
}
}
// Build a replacement node body containing sections of text with the found
// words, with leading and trailing text.
$node->body = strip_tags($node->body);
$section = array();
$section_length = array();
foreach ($clean_words as $k => $word) {
$location = 0;
// If the word is found, add its position to $section.
while (preg_match('/'. $word .'/iu', $node->body, $matches, PREG_OFFSET_CAPTURE, $location) && $word != '') {
// Make sure we didn't traverse any word breaks by checking for spaces.
// Pretty sure we don't need mb_stripos() here because we don't actually
// care about the position
if (!stripos($matches[0][0], ' ')) {
$section[] = _fuzzysearch_char_count($node->body, $matches[0][1]);
$section_length[$matches[0][1]] = drupal_strlen($word);
$clean_words[$k] = $matches[0][0];
}
// Increase $location by one so we don't find the previous location.
$location = $matches[0][1] + 1;
}
}
// Because we found words one by one, the locations are out of order. Sort
// so that the locations are in natural order.
asort($section);
ksort($section_length);
$section = array_values($section);
$section_length = array_values($section_length);
$p = 0;
$found = $newbody = '';
$trail = $lead = $excerpt / 2;
$start = $section[0];
while (isset($section[$p])) {
// If the current section is within the previous, let's not create a new one
// so we don't have any duplicate text.
if ($section[$p] + $lead + $section_length[$p] + $trail > $section[$p + 1] && $section[$p + 1]) {
$trail = $section[$p+1] + $section_length[$p+1] + $lead - $start;
$p++;
continue;
}
// Put an excerpt into our replacement node body, with the
// found word in the center.
$found = $start - $lead < 0 ? drupal_substr($node->body, 0, $excerpt) : drupal_substr($node->body, $start - $lead, $trail + $lead);
if (variable_get('fuzzysearch_max_result', 0) && (strlen($newbody . $found) > variable_get('fuzzysearch_max_result', 0))) {
break;
}
$newbody .= '...'. $found .'... ';
$p++;
$start = $section[$p];
$trail = $lead;
}
// Wrap the found words in a tag to highlight them.
$newbody = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $clean_words) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '\0', $newbody);
// If there are no result excerpts in the body, at least show the teaser.
$node->body = $newbody == '' ? $node->teaser : $newbody;
$results[] = $node;
}
else {
$results[] = $node;
}
}
return $results;
}
/**
 * Page callback: renders the search results for the given keys.
 *
 * All of the work is delegated to the 'fuzzysearch_show_results' theme hook.
 *
 * @param $keys
 *   The search keys. Defaults to an empty string.
 *
 * @return
 *   Whatever the 'fuzzysearch_show_results' theme hook returns for $keys.
 */
function fuzzysearch_show_results($keys = '') {
  $rendered = theme('fuzzysearch_show_results', $keys);
  return $rendered;
}
/**
 * Theme hook for rendering search results.
 *
 * Adds the module stylesheet, runs the search through fuzzysearch_process()
 * and builds the page output from the results.
 *
 * @param $keys
 *   The search keys. Passed to fuzzysearch_process() and to the search form.
 * @param $theme
 *   Output mode. When equal to NODE_THEME the page title is set to the escaped
 *   keys and the search form, the themed results and a pager are returned;
 *   any other value returns only the 'fuzzysearch_results_title' output.
 * @param $limit
 *   Passed to fuzzysearch_process() and to the pager (presumably the number
 *   of results per page -- confirm against fuzzysearch_process()).
 *
 * @return
 *   The rendered output, or a "No matches were found." message when
 *   fuzzysearch_process() returns nothing.
 */
function theme_fuzzysearch_show_results($keys = '', $theme = NODE_THEME, $limit = 10) {
  drupal_add_css(drupal_get_path('module', 'fuzzysearch') .'/fuzzysearch.css', 'module');
  $results = fuzzysearch_process($keys, $theme, $limit);
  // Start from an explicit empty string: appending to an undefined variable
  // raises a PHP notice (a warning on PHP 8) in both result branches.
  $output = '';
  if ($results) {
    if ($theme == NODE_THEME) {
      drupal_set_title(check_plain($keys));
      $output .= drupal_get_form('fuzzysearch_form', $keys);
      $output .= '
';
      $output .= theme('fuzzysearch_results', $results);
      $output .= theme('pager', NULL, $limit);
    }
    else {
      $output .= theme('fuzzysearch_results_title', $results);
    }
  }
  else {
    $output = ''. t('No matches were found.') .'
';
  }
  return $output;
}
/**
 * Theme the search results.
 *
 * Sets the page title to "Search results" and renders every result through
 * the 'fuzzysearch_result' theme hook. When the 'fuzzysearch_debug_score'
 * variable is enabled, each result is followed by its completeness and score.
 *
 * @param $results
 *   An array of result objects. Each object is handed to the
 *   'fuzzysearch_result' theme hook; ->completeness and ->score are read only
 *   when debug scoring is enabled.
 *
 * @return
 *   The concatenated output for all results.
 */
function theme_fuzzysearch_results($results) {
  drupal_set_title(t('Search results'));
  // Initialise explicitly: the original appended to and incremented undefined
  // variables, which raises PHP notices (warnings on PHP 8).
  $output = '';
  $i = 0;
  $output .= '
';
  foreach ($results as $result) {
    $i++;
    // NOTE(review): $odd is computed but not used by the visible output --
    // presumably intended for odd/even row striping; confirm.
    $odd = $i%2 ? 'odd' : 'even';
    $output .= '
';
    $output .= theme('fuzzysearch_result', $result);
    if (variable_get('fuzzysearch_debug_score', FALSE)) {
      $output .= '
'. t('Completeness: ') . number_format($result->completeness) . t(' Score: ') . number_format($result->score) .'
';
    }
    $output .= '
';
  }
  $output .= '
';
  return $output;
}
/**
 * Theme the search results as a list of linked titles.
 *
 * @param $results
 *   An array of result objects. ->title is used as the link text and ->nid to
 *   build the 'node/NID' path, which is run through drupal_get_path_alias().
 *
 * @return
 *   One "- LINK" line per result, wrapped in a leading and trailing line.
 */
function theme_fuzzysearch_results_title($results) {
  // Initialise explicitly: the original appended to and incremented undefined
  // variables, which raises PHP notices (warnings on PHP 8).
  $output = '';
  $i = 0;
  $output .= '
';
  foreach ($results as $result) {
    $i++;
    // NOTE(review): $odd is computed but not used by the visible output --
    // presumably intended for odd/even row striping; confirm.
    $odd = $i%2 ? 'odd' : 'even';
    $output .= '- '. l($result->title, drupal_get_path_alias('node/'. $result->nid)) .'
';
  }
  $output .= '
';
  return $output;
}
/**
 * Converts a byte offset in a UTF-8 string into a character offset.
 *
 * Borrows the byte-walking approach of drupal_substr(): every byte from the
 * start of $text up to and including $position is inspected, and each byte
 * that is not a UTF-8 continuation byte counts as the start of a character.
 *
 * @param $text
 *   The string to inspect.
 * @param $position
 *   Zero-based byte offset into $text.
 *
 * @return
 *   The zero-based index of the character that contains byte $position, or
 *   -1 when $position is negative.
 */
function _fuzzysearch_char_count($text, $position) {
  $char_index = -1;
  for ($offset = 0; $offset <= $position; $offset++) {
    // Continuation bytes have the bit pattern 10xxxxxx (0x80-0xBF); any other
    // byte begins a new character.
    if ((ord($text[$offset]) & 0xC0) != 0x80) {
      $char_index++;
    }
  }
  return $char_index;
}
/**
 * Implementation of hook_watchdog().
 *
 * Experimental: when the response headers contain a 404 and the
 * 'fuzzysearch_not_found' variable is enabled, take over the 404 page by
 * redirecting to the fuzzysearch results for the requested path.
 *
 * @param $log_entry
 *   The watchdog log entry. Only $log_entry['message'] is read; it is shown
 *   to the user as the page that could not be found.
 */
function fuzzysearch_watchdog($log_entry) {
  if (stristr(drupal_get_headers(), '404 not found') && variable_get('fuzzysearch_not_found', FALSE)) {
    // Initialise explicitly; appending to an undefined variable raises a
    // PHP notice.
    $query = '';
    foreach ($_GET as $term) {
      // Array-valued parameters (e.g. ?a[]=1) cannot be turned into search
      // terms and would otherwise be concatenated as the string "Array".
      if (is_array($term)) {
        continue;
      }
      // Slashes separate path components; search for each component as a word.
      $query .= str_replace('/', ' ', $term) .' ';
    }
    drupal_set_message(t('The page you requested: "@page," could not be found. A site search for that page found the following results:', array('@page' => $log_entry['message'])));
    drupal_goto('fuzzysearch/results/'. $query);
  }
}
/**
 * Remove stop words from search query and text to be indexed.
 *
 * The stop word list is built once per request from every file matching
 * 'fuzzysearch_stopwords_.+\.txt' under
 * sites/all/libraries/fuzzysearch/stopwords and is cached in a static
 * variable. Both the files and $text are split on single spaces.
 *
 * @param $text
 *   The text to be stripped of stop words.
 *
 * @return
 *   $text with every space-separated token that is a stop word removed.
 */
function fuzzysearch_stopwords($text) {
  static $stop_words;
  if (!is_array($stop_words)) {
    $stop_words = array();
    $found = file_scan_directory('sites/all/libraries/fuzzysearch/stopwords', 'fuzzysearch_stopwords_.+\.txt', array(), 0, TRUE, 'name' );
    foreach ($found as $entry) {
      $contents = file_get_contents($entry->filename);
      // Append this file's words to the running list.
      foreach (explode(' ', $contents) as $stop_word) {
        $stop_words[] = $stop_word;
      }
    }
  }
  $kept = array_diff(explode(' ', $text), $stop_words);
  return implode(' ', $kept);
}
/**
 * Recursive array_unique().
 *
 * Elements are compared by their serialized form, so nested arrays can be
 * de-duplicated as well. The key of the first occurrence of each value is
 * kept, and every surviving element that is itself an array is de-duplicated
 * the same way.
 *
 * @param $array
 *   The array to de-duplicate.
 *
 * @return
 *   The array without duplicate elements, at every nesting level.
 */
function fuzzysearch_unique($array) {
  $serialized = array_map('serialize', $array);
  $distinct = array_unique($serialized);
  $result = array();
  foreach ($distinct as $key => $blob) {
    $item = unserialize($blob);
    $result[$key] = is_array($item) ? fuzzysearch_unique($item) : $item;
  }
  return $result;
}