' . t('This module provides an aid to finding broken links on your site. It periodically checks contents of all public nodes, tries to find any html links and check for their validity. It reports broken links through the admin interface. For more information about status codes see Status Code Definitions.', array('@rfc' => 'http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html')) . '
'; } }

/**
 * Implementation of hook_menu().
 *
 * Registers the settings form, the site-wide and per-user broken links
 * reports, and the form for editing the settings of a single link.
 */
function linkchecker_menu() {
  return array(
    'admin/settings/linkchecker' => array(
      'title' => 'Link checker',
      'description' => 'Configure the content types that should be checked for broken links and how the hypertext links will be checked and reported and repaired.',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('linkchecker_admin_settings_form'),
      'access arguments' => array('administer linkchecker'),
      'file' => 'includes/linkchecker.admin.inc',
    ),
    'admin/reports/linkchecker' => array(
      'title' => 'Broken links',
      'description' => 'Shows a list of broken links in content.',
      'page callback' => 'linkchecker_admin_report_page',
      'type' => MENU_NORMAL_ITEM,
      'access arguments' => array('access broken links report'),
      'file' => 'includes/linkchecker.pages.inc',
    ),
    // Add the user menu item after node/edit tab.
    'user/%user/linkchecker' => array(
      'title' => 'Broken links',
      'description' => 'Shows a list of broken links in content.',
      'page callback' => 'linkchecker_user_report_page',
      'page arguments' => array(1),
      'type' => MENU_LOCAL_TASK,
      'access callback' => '_linkchecker_user_access_account_broken_links_report',
      'access arguments' => array(1),
      'file' => 'includes/linkchecker.pages.inc',
      'weight' => 3,
    ),
    'linkchecker/%linkchecker_link/edit' => array(
      'title' => 'Edit link settings',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('linkchecker_link_edit_form', 1),
      'access callback' => '_linkchecker_user_access_edit_link_settings',
      'access arguments' => array(1),
      'file' => 'includes/linkchecker.pages.inc',
      'type' => MENU_CALLBACK,
    ),
  );
}

/**
 * Access callback for user/%user/linkchecker.
 *
 * @param $account
 *   The user account object whose report is requested.
 * @return
 *   TRUE if the current user may view the report of $account, FALSE otherwise.
 */
function _linkchecker_user_access_account_broken_links_report($account) {
  global $user;

  // Reports only exist for authenticated accounts.
  if (!$account->uid) {
    return FALSE;
  }

  // Users with 'access own broken links report' permission can only view their
  // own report.
  $is_own_report = ($user->uid == $account->uid);
  if ($is_own_report && user_access('access own broken links report')) {
    return TRUE;
  }

  // Users with the 'access broken links report' permission can view the report
  // for any authenticated user.
  return (bool) user_access('access broken links report');
}

/**
 * Access callback for linkchecker/%linkchecker_link/edit.
 *
 * @param $link
 *   The link loaded from the menu path.
 * @return
 *   TRUE if the current user may edit link settings and may see this link.
 */
function _linkchecker_user_access_edit_link_settings($link) {
  if (!user_access('edit link settings')) {
    return FALSE;
  }
  return (bool) _linkchecker_link_access($link);
}

/**
 * Determines if the current user has access to view a link.
 *
 * Link URLs can contain private information (for example, usernames and
 * passwords). So this module should only display links to a user if the link
 * already appears in at least one place on the site where the user would
 * otherwise have access to see it.
 *
 * @param $link
 *   The link as an object or array; it is cast to an object before use.
 * @return
 *   TRUE as soon as one node, comment or block containing the link is found
 *   for the current user, FALSE otherwise.
 */
function _linkchecker_link_access($link) {
  $link = (object) $link;

  // The lookups run in this order and stop at the first non-empty result.
  if (_linkchecker_link_node_ids($link)) {
    return TRUE;
  }
  if (_linkchecker_link_comment_ids($link)) {
    return TRUE;
  }
  if (_linkchecker_link_block_ids($link)) {
    return TRUE;
  }
  return FALSE;
}

/**
 * Returns IDs of nodes that contain a link which the current user may be allowed to view.
 *
 * Important note: For performance reasons, this function is not always
 * guaranteed to return the exact list of node IDs that the current user is
 * allowed to view. It will, however, always return an empty array if the user
 * does not have access to view *any* such nodes, thereby meeting the security
 * goals of _linkchecker_link_access() and other places that call it.
 *
 * In the case where a user has access to some of the nodes that contain the
 * link, this function may return some node IDs that the user does not have
 * access to. Therefore, use caution with its results.
 *
 * @param $link
 *   An object representing the link to check.
 * @param $node_author_account
 *   (optional) If a user account object is provided, the returned nodes will
 *   additionally be restricted to only those owned by this account. Otherwise,
 *   nodes owned by any user account may be returned.
 * @return
 *   An array of node IDs that contain the provided link and that the current
 *   user may be allowed to view.
*/
function _linkchecker_link_node_ids($link, $node_author_account = NULL) {
  static $fields_with_node_links = array();

  // Exit if all node types are disabled or if the user cannot access content,
  // there is no need to check further.
  $linkchecker_scan_nodetypes = array_filter(variable_get('linkchecker_scan_nodetypes', array()));
  if (empty($linkchecker_scan_nodetypes) || !user_access('access content')) {
    return array();
  }

  // Disable language negotiation temporarily, re-enable it later.
  $has_i18n = module_exists('i18n');
  if ($has_i18n) {
    i18n_selection_mode('off');
  }

  // Get a list of nodes containing the link, using db_rewrite_sql() to allow
  // node access modules to exclude nodes that the current user does not have
  // access to view.
  if (!empty($node_author_account)) {
    $nodes = db_query(db_rewrite_sql('SELECT n.nid FROM {node} n INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid INNER JOIN {node_revisions} r ON r.vid = n.vid WHERE ln.lid = %d AND (n.uid = %d OR r.uid = %d)'), $link->lid, $node_author_account->uid, $node_author_account->uid);
  }
  else {
    $nodes = db_query(db_rewrite_sql('SELECT n.nid FROM {node} n INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid WHERE ln.lid = %d'), $link->lid);
  }

  // Re-enable language negotiation.
  if ($has_i18n) {
    i18n_selection_mode('reset');
  }

  // Check if the current user has access to view the link in each node.
  // However, for performance reasons, as soon as we find one node where that
  // is the case, stop checking and return the remainder of the list.
  $nids = array();
  $access_allowed = FALSE;
  while ($row = db_fetch_object($nodes)) {
    if ($access_allowed) {
      $nids[] = $row->nid;
      continue;
    }

    // The node may have disappeared since the query ran; a failed load can
    // never grant access, so skip it instead of dereferencing a non-object.
    $node = node_load($row->nid);
    if (!is_object($node)) {
      continue;
    }

    // We must check whether the link is currently part of the node; if not, we
    // do not want to return it (and it is not safe to, since we cannot know if
    // it contained access restrictions for the current user at the point which
    // it was originally extracted by the Link checker module).
    if (!isset($fields_with_node_links[$node->nid])) {
      $fields_with_node_links[$node->nid] = _linkchecker_extract_node_links($node, TRUE);
    }
    if (empty($fields_with_node_links[$node->nid][$link->url])) {
      continue;
    }

    // If the link only appears in CCK fields and a field access module is
    // being used, we must check that the current user has access to view at
    // least one field that contains the link; if they don't, we should not
    // return the node.
    $fields = $fields_with_node_links[$node->nid][$link->url];
    if (!in_array('node', $fields) && module_exists('content') && module_implements('field_access')) {
      $fields_with_access = array();
      foreach (content_fields(NULL, $node->type) as $field) {
        // Only check link and text fields, since those are the only types we
        // extract links from.
        if (($field['type'] == 'link' || $field['type'] == 'text') && content_access('view', $field, NULL, $node)) {
          $fields_with_access[] = $field['field_name'];
        }
      }
      if (!array_intersect($fields, $fields_with_access)) {
        continue;
      }
    }

    $nids[] = $node->nid;
    $access_allowed = TRUE;
  }

  return $nids;
}

/**
 * Returns IDs of comments that contain a link which the current user is allowed to view.
 *
 * @param $link
 *   An object representing the link to check.
 * @param $comment_author_account
 *   (optional) If a user account object is provided, the returned comments
 *   will additionally be restricted to only those owned by this account.
 *   Otherwise, comments owned by any user account may be returned.
 * @return
 *   An array of comment IDs that contain the provided link and that the
 *   current user is allowed to view.
 */
function _linkchecker_link_comment_ids($link, $comment_author_account = NULL) {
  // Exit if comments are disabled or if the user cannot access comments, there
  // is no need to check further.
  if (!module_exists('comment') || !variable_get('linkchecker_scan_comments', 0) || !user_access('access comments')) {
    return array();
  }

  // Get a list of comments containing the link, using db_rewrite_sql() to
  // allow comment access modules to exclude comments that the current user
  // does not have access to view.
  if (!empty($comment_author_account)) {
    $comments = db_query(db_rewrite_sql('SELECT c.cid FROM {comments} c INNER JOIN {linkchecker_comments} lc ON lc.cid = c.cid WHERE lc.lid = %d AND c.uid = %d', 'c', 'cid'), $link->lid, $comment_author_account->uid);
  }
  else {
    $comments = db_query(db_rewrite_sql('SELECT c.cid FROM {comments} c INNER JOIN {linkchecker_comments} lc ON lc.cid = c.cid WHERE lc.lid = %d', 'c', 'cid'), $link->lid);
  }

  // Return the array of comment IDs.
  $cids = array();
  while ($comment = db_fetch_object($comments)) {
    $cids[] = $comment->cid;
  }
  return $cids;
}

/**
 * Returns IDs of blocks that contain a link which the current user is allowed to view.
 *
 * @param $link
 *   An object representing the link to check.
 * @return
 *   An array of custom block IDs that contain the provided link and that the
 *   current user is allowed to view.
 */
function _linkchecker_link_block_ids($link) {
  // Exit if blocks are disabled.
  if (!variable_get('linkchecker_scan_blocks', 0)) {
    return array();
  }

  // Get the initial list of block IDs.
  $boxes = db_query('SELECT bid FROM {linkchecker_boxes} WHERE lid = %d', $link->lid);
  $bids = array();
  while ($box = db_fetch_object($boxes)) {
    $bids[] = $box->bid;
  }

  // No block contains the link: the result is empty whatever the user's
  // permissions are, so skip the permission check and the role query.
  if (empty($bids)) {
    return array();
  }

  // If the user can administer blocks, they're able to see all block content.
  if (user_access('administer blocks')) {
    return $bids;
  }

  // Otherwise, only return blocks that this user (or anonymous users) have
  // access to.
  global $user;
  $rids = array_keys($user->roles);
  $rids[] = DRUPAL_ANONYMOUS_RID;

  $allowed_boxes = db_query("SELECT DISTINCT b.delta FROM {blocks} b LEFT JOIN {blocks_roles} r ON b.module = r.module AND b.delta = r.delta WHERE b.module = 'block' AND (r.rid IN (". db_placeholders($rids) .") OR r.rid IS NULL)", $rids);
  $allowed_bids = array();
  while ($allowed_box = db_fetch_object($allowed_boxes)) {
    $allowed_bids[] = $allowed_box->delta;
  }

  return array_intersect($bids, $allowed_bids);
}

/**
 * Implementation of hook_cron().
 *
 * Cleans up unused links at most once per day, then runs the link checks,
 * either through an HTTPRL background callback or directly in this process.
 */
function linkchecker_cron() {
  // Remove outdated links no longer in use once per day.
  if (time() - variable_get('linkchecker_cleanup_links_last', 0) >= 86400) {
    _linkchecker_cleanup_links();
    variable_set('linkchecker_cleanup_links_last', time());
  }

  // Run link checker in a new process, independent of cron.
  if (module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl') {
    // Setup callback options array; call _linkchecker_check_links() in the
    // background.
    $callback_options = array(array('function' => '_linkchecker_check_links'));

    // Queue up the request.
    httprl_queue_background_callback($callback_options);

    // Execute request.
    httprl_send_request();

    // Exit here so we don't call _linkchecker_check_links() in this process.
    return;
  }

  // Run the link checks the normal way.
  _linkchecker_check_links();
}

/**
 * Run link checks.
 */
function _linkchecker_check_links() {
  // Get max_execution_time from configuration, override 0 with 240 seconds.
  $max_execution_time = ini_get('max_execution_time') == 0 ? 240 : ini_get('max_execution_time');

  // Make sure we have enough time to validate all of the links.
  linkchecker_set_time_limit($max_execution_time);

  // Make sure this is the only process trying to run this function.
if (!lock_acquire(__FUNCTION__, $max_execution_time)) {
    watchdog('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
    return FALSE;
  }

  $has_httprl = (module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl');

  // Do not confuse admins with a setting of maximum checkable links per cron
  // run and guess that 2 links can be checked per second with 1 thread, what is
  // nevertheless uncommon. The max_execution_time can be used to calculate
  // a useful value that is higher, but not totally out of scope and limits the
  // query resultset to a reasonable size.
  $linkchecker_check_connections_max = variable_get('linkchecker_check_connections_max', 8);
  $check_links_max_per_cron_run = ($has_httprl) ? ($linkchecker_check_connections_max * $max_execution_time) : $max_execution_time;
  $linkchecker_check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
  $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');

  // Connection limit can be overridden via settings.php. Two connections is the
  // limit defined in RFC http://www.ietf.org/rfc/rfc2616.txt. Modern browsers
  // are typically using 6-8 connections and no more. Never use more and keep
  // in mind that you can overload other people servers.
  $linkchecker_check_domain_connections = variable_get('linkchecker_check_domain_connections', 2);

  // Get URLs for checking.
  $result = db_query_range("SELECT * FROM {linkchecker_links} WHERE last_checked < %d AND status = %d ORDER BY last_checked, lid ASC", time() - $linkchecker_check_links_interval, 1, 0, $check_links_max_per_cron_run);

  // Buffer the rows so they can be counted on every database backend. The
  // result handle only exposes a num_rows property with mysqli; elsewhere the
  // counter would be NULL, "NULL == 0" would hold on every iteration, and the
  // queued HTTPRL requests would be flushed one by one instead of in parallel.
  // The range query above bounds the size of this buffer.
  $links = array();
  while ($row = db_fetch_object($result)) {
    $links[] = $row;
  }
  $links_remaining = count($links);

  foreach ($links as $link) {
    $headers = array();
    $headers['User-Agent'] = $linkchecker_check_useragent;

    $uri = @parse_url($link->url);

    // URL contains a fragment.
    if (in_array($link->method, array('HEAD', 'GET')) && !empty($uri['fragment'])) {
      // We need the full content and not only the HEAD.
      $link->method = 'GET';
      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->method == 'GET') {
      // Range: Only request the first 1024 bytes from remote server. This is
      // required to prevent timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers.
    $options = array(
      'headers' => $headers,
      'method' => $link->method,
      'max_redirects' => 0,
    );

    if ($has_httprl) {
      // Define the callback and add the $link object to it.
      // Notes:
      // - 'global_timeout' does not require a timer_read('page'), as this job
      //   runs in a new process, independent of cron.
      $options += array(
        'global_connections' => $linkchecker_check_connections_max,
        'global_timeout' => $max_execution_time - 30,
        'domain_connections' => $linkchecker_check_domain_connections,
        'callback' => array(
          array(
            'function' => '_linkchecker_status_handling',
          ),
          // The link needs to be passed here or it is not sent back to
          // _linkchecker_status_handling().
          $link,
        ),
      );

      // Queue up the requests.
      httprl_request($link->url, $options);
      $links_remaining--;

      // After all links are queued, run the url checks.
      if ($links_remaining == 0) {
        httprl_send_request();
      }
    }
    else {
      // Drupal core
      $response = drupal_http_request($link->url, $options['headers'], $options['method'], NULL, $options['max_redirects']);

      // Add 'redirect_code' property to core response object for consistency
      // with HTTPRL object.
      if ($response->code == 301 && !isset($response->redirect_code)) {
        $response->redirect_code = $response->code;
      }

      // Add 'uri' property to core response object for 'fragment' check and
      // consistency with HTTPRL object.
      $response->uri = $uri;

      _linkchecker_status_handling($response, $link);

      if ((timer_read('page') / 1000) > ($max_execution_time / 2)) {
        break; // Stop once we have used over half of the maximum execution time.
      }
    }
  }

  // Release the lock.
  lock_release(__FUNCTION__);
  watchdog('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);

  // Peak memory usage is only available in PHP >= 5.2.
  if (version_compare(phpversion(), '5.2.0', '>=')) {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array('@memory_get_peak_usage' => format_size(memory_get_peak_usage()), '@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
  }
  else {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage.', array('@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
  }

  return TRUE;
}

/**
 * Status code handling.
 *
 * @param object $response
 *   An object containing the HTTP request headers, response code, headers,
 *   data and redirect status.
 * @param object $link
 *   An object containing the url, lid and fail_count.
 */
function _linkchecker_status_handling(&$response, $link) {
  $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));

  // - Prevent E_ALL warnings in DB updates for non-existing $response->error.
  // - @todo drupal_http_request() may not provide an UTF8 encoded error message
  //   what results in a database UPDATE failure. For more information, see
  //   http://drupal.org/node/371495.
  //   Workaround: ISO-8859-1 as source encoding may be wrong, but WFM.
if (!isset($response->error)) {
    $response->error = '';
  }
  if (!isset($response->status_message)) {
    $response->status_message = '';
  }
  $response->error = trim(drupal_convert_to_utf8($response->error, 'ISO-8859-1'));
  $response->status_message = trim(drupal_convert_to_utf8($response->status_message, 'ISO-8859-1'));

  // Destination anchors in HTML documents may be specified either by the A
  // element (naming it with the name attribute), or by any other element
  // (naming with the id attribute).
  // See http://www.w3.org/TR/html401/struct/links.html
  if ($response->code == 200
    && !empty($response->data)
    && !empty($response->headers['content-type'])
    && !empty($response->uri['fragment'])
    && in_array($response->headers['content-type'], array('text/html', 'application/xhtml+xml', 'application/xml'))
    && !preg_match('/(\s[^>]*(name|id)(\s+)?=(\s+)?["\'])(' . preg_quote($response->uri['fragment'], '/') . ')(["\'][^>]*>)/i', $response->data)
  ) {
    // Override status code 200 with status code 404 so it can be handled with
    // default status code 404 logic and custom error text.
    $response->code = 404;
    $response->status_message = $response->error = 'URL fragment identifier not found in content';
  }

  switch ($response->code) {
    case -4: // HTTPRL: httprl_send_request timed out.
      // Skip these and try them again next cron run.
      break;

    case -2: // HTTPRL: maximum allowed redirects exhausted.
    case 301:
      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->redirect_code, $response->status_message, time(), $link->lid);

      // A HTTP status code of 301 tells us an existing link have changed to
      // a new link. The remote site owner was so kind to provide us the new
      // link and if we trust this change we are able to replace the old link
      // with the new one without any hand work.
      $auto_repair_301 = variable_get('linkchecker_action_status_code_301', 0);
      // A response without a redirect target cannot be repaired; the isset()
      // check avoids reading an undefined property before validating it.
      if ($auto_repair_301 && $auto_repair_301 <= ($link->fail_count+1) && isset($response->redirect_url) && valid_url($response->redirect_url, TRUE)) {

        // Switch anonymous user to an admin.
        linkchecker_impersonate_user(user_load(array('name' => variable_get('linkchecker_impersonate_user', ''))));

        // NODES: Autorepair all nodes having this outdated link.
        $res = db_query("SELECT * FROM {linkchecker_nodes} WHERE lid = %d", $link->lid);
        while ($row = db_fetch_object($res)) {
          $node = node_load(array('nid' => $row->nid));

          // Has the node object loaded successfully?
          if (is_object($node)) {
            $node_original = drupal_clone($node);

            // Create array of node fields to scan (for e.g. $node->title, $node->links_weblink_url).
            $text_items = array('title', 'body', 'teaser');

            // Update 'weblink' nodes from 'links' module package.
            if (module_exists('links_weblink') && $node->type == 'weblink' && isset($node->links_weblink_url)) {
              $text_items[] = 'links_weblink_url';
            }

            // Update 'weblinks' nodes from 'weblinks' module.
            if (module_exists('weblinks') && $node->type == 'weblinks' && isset($node->url)) {
              $text_items[] = 'url';
            }

            // Now replace the outdated link with the permanently moved one in all node fields.
            foreach ($text_items as $text_item) {
              _linkchecker_link_replace($node->$text_item, $link->url, $response->redirect_url);
            }

            // Search for CCK-fields of types 'link' and 'text'.
            if (module_exists('content')) {
              $fields = content_fields(NULL, $node->type);
              foreach ($fields as $field) {
                // Resolve the property name first: "$node->$field['field_name']"
                // is parsed as "($node->$field)['field_name']" on PHP 7+.
                $field_name = $field['field_name'];
                if (isset($node->$field_name)) {
                  if (module_exists('link') && $field['type'] == 'link') {
                    foreach ($node->$field_name as $delta => $item) {
                      _linkchecker_link_replace($node->{$field_name}[$delta]['url'], $link->url, $response->redirect_url);
                    }
                  }
                  elseif (module_exists('text') && $field['type'] == 'text') {
                    foreach ($node->$field_name as $delta => $item) {
                      _linkchecker_link_replace($node->{$field_name}[$delta]['value'], $link->url, $response->redirect_url);
                    }
                  }
                }
              }
            }

            if ($node_original != $node) {
              // Always use the default revision setting. See node_object_prepare().
              $node_options = variable_get('node_options_'. $node->type, array('status', 'promote'));
              $node->revision = in_array('revision', $node_options);

              // Generate a log message for the node_revisions table, visible on the node's revisions tab.
              $node->log = t('Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $row->nid), '%src' => $link->url, '%dst' => $response->redirect_url));

              // Save changed node and update the node link list.
              node_save($node);
              watchdog('linkchecker', 'Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $row->nid), '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              watchdog('linkchecker', 'Link update in node failed. Permanently moved link %src not found in node %node. Manual fix required.', array('%node' => url('node/' . $row->nid), '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            watchdog('linkchecker', 'Loading node %node for update failed. Manual fix required.', array('%node' => $row->nid), WATCHDOG_ERROR);
          }
        }

        // COMMENTS: Autorepair all comments having this outdated link.
        $res = db_query("SELECT * FROM {linkchecker_comments} WHERE lid = %d", $link->lid);
        while ($row = db_fetch_object($res)) {
          $comment = _linkchecker_comment_load($row->cid);

          // Has the custom comment array loaded successfully?
          if (!empty($comment)) {
            $comment_original = $comment;

            // Create array of comment fields to scan (for e.g. $comment->subject, $comment->comment).
            $text_items = array('subject', 'comment');

            // Now replace the outdated link with the permanently moved one in all comment fields.
            foreach ($text_items as $text_item) {
              _linkchecker_link_replace($comment[$text_item], $link->url, $response->redirect_url);
            }

            // Save changed comment and update the comment link list.
            $comment_diff = array_diff($comment, $comment_original);
            if (!empty($comment_diff)) {
              comment_save($comment);
              watchdog('linkchecker', 'Changed permanently moved link in comment %comment from %src to %dst.', array('%comment' => $comment['cid'], '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              watchdog('linkchecker', 'Link update in comment failed. Permanently moved link %src not found in comment %comment. Manual fix required.', array('%comment' => $comment['cid'], '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            // $comment is empty in this branch, so the ID to report has to
            // come from the query row.
            watchdog('linkchecker', 'Loading comment %comment for update failed. Manual fix required.', array('%comment' => $row->cid), WATCHDOG_ERROR);
          }
        }

        // BOXES: Autorepair all boxes having this outdated link.
        $res = db_query("SELECT * FROM {linkchecker_boxes} WHERE lid = %d", $link->lid);
        while ($row = db_fetch_object($res)) {
          $box = block_box_get($row->bid);

          // Has the custom block array loaded successfully?
          if (!empty($box)) {
            $box_original = $box;

            // Create array of box fields to scan.
            $text_items = array('info', 'body');

            // Now replace the outdated link with the permanently moved one in all
            // box fields.
            foreach ($text_items as $text_item) {
              _linkchecker_link_replace($box[$text_item], $link->url, $response->redirect_url);
            }

            $box_diff = array_diff($box, $box_original);
            if (!empty($box_diff)) {
              // Save changed box and update the box link list.
              block_box_save($box, $row->bid);
              // There is no hook that fires on block_box_save(), therefore do link
              // extraction programmatically.
              _linkchecker_add_box_links($box, $row->bid);
              watchdog('linkchecker', 'Changed permanently moved link in box %bid from %src to %dst.', array('%bid' => $row->bid, '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
            }
            else {
              watchdog('linkchecker', 'Link update in block failed. Permanently moved link %src not found in block %bid. Manual fix required.', array('%bid' => $row->bid, '%src' => $link->url), WATCHDOG_WARNING);
            }
          }
          else {
            watchdog('linkchecker', 'Loading block %bid for update failed. Manual fix required.', array('%bid' => $row->bid), WATCHDOG_ERROR);
          }
        }

        // Revert user back to anonymous.
        linkchecker_revert_user();
      }
      else {
        watchdog('linkchecker', 'Link %link has changed and needs to be updated.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      break;

    case 404:
      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
      watchdog('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));

      // If unpublishing limit is reached, unpublish all nodes having this link.
      $linkchecker_action_status_code_404 = variable_get('linkchecker_action_status_code_404', 0);
      if ($linkchecker_action_status_code_404 && $linkchecker_action_status_code_404 <= ($link->fail_count+1)) {
        // Switch anonymous user to an admin.
        linkchecker_impersonate_user(user_load(array('name' => variable_get('linkchecker_impersonate_user', ''))));
        _linkchecker_unpublish_nodes($link->lid);
        linkchecker_revert_user();
      }
      break;

    case 405:
      // - 405: Special error handling if method is not allowed. Switch link
      //   checking to GET method and try again.
      db_query("UPDATE {linkchecker_links} SET method = '%s', code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", 'GET', $response->code, $response->error, time(), $link->lid);
      watchdog('linkchecker', 'Method HEAD is not allowed for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
      break;

    case 500:
      // - 500: Like WGET, try with GET on "500 Internal server error".
      // - If GET also fails with status code 500, than the link is broken.
      if ($link->method == 'GET') {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
        watchdog('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      else {
        db_query("UPDATE {linkchecker_links} SET method = '%s', code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", 'GET', $response->code, $response->error, time(), $link->lid);
        watchdog('linkchecker', 'Internal server error for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      break;

    default:
      // Don't treat ignored response codes as errors.
      if (in_array($response->code, $ignore_response_codes)) {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = %d, last_checked = %d WHERE lid = %d", $response->code, $response->error, 0, time(), $link->lid);
        //watchdog('linkchecker', 'Unhandled link error %link has been found.', array('%link' => $link->url), WATCHDOG_ERROR, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
      else {
        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
        //watchdog('linkchecker', 'Unhandled link error %link has been found.', array('%link' => $link->url), WATCHDOG_ERROR, l(t('Broken links'), 'admin/reports/linkchecker'));
      }
  }

  // Free Memory.
  $response = new stdClass();
}

/**
 * Implementation of hook_nodeapi().
 *
 * Keeps the stored link list in sync when nodes are inserted, updated or
 * deleted, and shows broken link warnings on the node edit page.
 */
function linkchecker_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
  switch ($op) {
    case 'insert':
      // The node is going to be published.
      if (_linkchecker_scan_nodetype($node->type) && $node->status) {
        _linkchecker_add_node_links($node);
      }
      break;

    case 'update':
      // The node is going to be published.
      if (_linkchecker_scan_nodetype($node->type) && $node->status) {
        _linkchecker_add_node_links($node);
      }
      else {
        // The node is going to be unpublished.
        _linkchecker_delete_node_links($node->nid);
      }
      break;

    case 'delete':
      _linkchecker_delete_node_links($node->nid);
      break;

    case 'prepare':
      // Node edit tab is viewed.
      if (arg(0) == 'node' && is_numeric(arg(1)) && arg(2) == 'edit' && isset($node->nid)) {
        // Show a message on node edit page if a link check failed once or more.
        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
        $links = db_query("SELECT ll.* FROM {linkchecker_nodes} ln INNER JOIN {linkchecker_links} ll ON ln.lid = ll.lid WHERE ln.nid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array($node->nid, 0, 1), $ignore_response_codes));
        while ($link = db_fetch_object($links)) {
          if (_linkchecker_link_access($link)) {
            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
          }
        }
      }
      break;
  }
}

/**
 * Implementation of hook_comment().
 *
 * Adds the links of a comment when it is published and removes them when the
 * comment is unpublished or deleted.
 */
function linkchecker_comment($comment, $op) {
  // Convert $comment object (admin/content/comment) to array (comment/edit/[cid]).
  $comment = (array) $comment;

  switch ($op) {
    case 'publish':
      $node_type = db_result(db_query("SELECT type FROM {node} WHERE nid = %d", $comment['nid']));
      if (_linkchecker_scan_nodetype($node_type) && variable_get('linkchecker_scan_comments', 0)) {
        _linkchecker_add_comment_links($comment);
      }
      break;

    case 'unpublish':
    case 'delete':
      _linkchecker_delete_comment_links($comment['cid']);
      break;
  }
}

/**
 * Implementation of hook_form_alter().
 *
 * Attaches the link extraction submit handlers to the block forms and shows
 * broken link warnings on the block and comment edit forms.
 */
function linkchecker_form_alter(&$form, $form_state, $form_id) {
  switch ($form_id) {
    // Catch the block add/configure form and add custom submit handler.
    case 'block_add_block_form':
      // Add custom submit handler to block add form.
      $form['#submit'][] = 'linkchecker_block_add_form_submit';
      break;

    case 'block_admin_configure':
      // When displaying the form, show the broken links warning.
      if (empty($form_state['post']) && is_numeric(arg(5))) {
        // Show a message on block edit page if a link check failed once or more.
        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
        $links = db_query("SELECT ll.* FROM {linkchecker_boxes} lb INNER JOIN {linkchecker_links} ll ON lb.lid = ll.lid WHERE lb.bid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array(arg(5), 0, 1), $ignore_response_codes));
        while ($link = db_fetch_object($links)) {
          if (_linkchecker_link_access($link)) {
            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
          }
        }
      }

      // Add custom submit handler to block configuration form.
      $form['#submit'][] = 'linkchecker_block_configure_form_submit';
      break;

    case 'block_box_delete':
      $form['#submit'][] = 'linkchecker_block_box_delete_form_submit';
      break;

    case 'comment_form':
      // When displaying the form as 'view' or 'preview', show the broken links warning.
      if ((empty($form_state['post']) || isset($form_state['post']['op']) && $form_state['post']['op'] == t('Preview')) && arg(0) == 'comment' && arg(1) == 'edit' && is_numeric(arg(2))) {
        // Show a message on comment edit page if a link check failed once or more.
        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
        $links = db_query("SELECT ll.* FROM {linkchecker_comments} lc INNER JOIN {linkchecker_links} ll ON lc.lid = ll.lid WHERE lc.cid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array(arg(2), 0, 1), $ignore_response_codes));
        while ($link = db_fetch_object($links)) {
          if (_linkchecker_link_access($link)) {
            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
          }
        }
      }
      break;
  }
}

/**
 * Custom submit handler for block add page.
*/
function linkchecker_block_add_form_submit($form, &$form_state) {
  if (variable_get('linkchecker_scan_blocks', 0)) {
    // The submitted values do not carry the id of the box that was just
    // created, so the highest bid in {boxes} is taken as the new box.
    // NOTE(review): this can pick the wrong box if two boxes are added
    // concurrently - confirm whether that matters here.
    $bid = db_result(db_query("SELECT MAX(bid) FROM {boxes}"));
    _linkchecker_add_box_links($form_state['values'], $bid);
  }
}

/**
 * Custom submit handler for block configure page.
 *
 * Re-scans the submitted block content for links if block scanning is
 * enabled. The block delta is passed on as the box id.
 */
function linkchecker_block_configure_form_submit($form, &$form_state) {
  if (variable_get('linkchecker_scan_blocks', 0)) {
    _linkchecker_add_box_links($form_state['values'], $form_state['values']['delta']);
  }
}

/**
 * Custom submit handler for block delete page.
 *
 * Removes all link references of the deleted box.
 */
function linkchecker_block_box_delete_form_submit($form, &$form_state) {
  _linkchecker_delete_box_links($form_state['values']['bid']);
}

/**
 * Extracts links from a node.
 *
 * @param $node
 *   The fully populated node object.
 * @param $return_field_names
 *   If set to TRUE, the returned array will contain the link URLs as keys, and
 *   each element will be an array containing all field names in which the URL
 *   is found (the special field name "node" is used to represent all scanned
 *   node content that is not a CCK field). Otherwise, a simple array of URLs
 *   will be returned.
 * @return
 *   An array whose keys are fully qualified and unique URLs found in the node
 *   (as returned by _linkchecker_extract_links()), or a more complex
 *   structured array (see above) if $return_field_names is TRUE.
 */
function _linkchecker_extract_node_links($node, $return_field_names = FALSE) {
  // Get current node language options for url() functions.
  $languages = language_list();
  $url_options = empty($node->language) ? array('absolute' => TRUE) : array('language' => $languages[$node->language], 'absolute' => TRUE);

  // Create array of node fields to scan. Every text is collected twice: in a
  // flat list used for the link extraction and grouped by field name for the
  // optional field name lookup below.
  $text_items = array();
  $text_items_by_field = array();
  $text_items[] = $text_items_by_field['node'][] = _filter_url($node->title, $node->format);
  $text_items[] = $text_items_by_field['node'][] = _linkchecker_check_markup($node->body, $node->format, FALSE);
  $text_items[] = $text_items_by_field['node'][] = _linkchecker_check_markup($node->teaser, $node->format, FALSE);

  // Search for links in 'weblink' nodes from 'links' module package.
  if (module_exists('links_weblink') && $node->type == 'weblink' && !empty($node->links_weblink_url)) {
    $text_items[] = $text_items_by_field['node'][] = _filter_url(url($node->links_weblink_url, $url_options), $node->format);
  }

  // Search for links in 'weblinks' nodes from 'weblinks' module.
  if (module_exists('weblinks') && $node->type == 'weblinks' && !empty($node->url)) {
    $text_items[] = $text_items_by_field['node'][] = _filter_url(url($node->url, $url_options), $node->format);
  }

  // Search for CCK-fields of types 'link' and 'text'.
  if (module_exists('content')) {
    $fields = content_fields(NULL, $node->type);
    foreach ($fields as $field) {
      // Resolve the field name first and use braced property access. The
      // unbraced form $node->$field['field_name'] is evaluated as
      // ($node->$field)['field_name'] since PHP 7 and no longer reads the
      // CCK field.
      $field_name = $field['field_name'];
      if (!empty($node->{$field_name})) {
        if (module_exists('link') && $field['type'] == 'link') {
          foreach ($node->{$field_name} as $delta => $item) {
            if (!empty($item['url'])) {
              // Make non-absolute urls absolute or they are not found by _filter_url().
              $text_items[] = $text_items_by_field[$field_name][] = _filter_url(url($item['url'], $url_options), $node->format);
            }
          }
        }
        elseif (module_exists('text') && $field['type'] == 'text') {
          foreach ($node->{$field_name} as $delta => $item) {
            $text_items[] = $text_items_by_field[$field_name][] = _filter_url($item['value'], $node->format);
          }
        }
      }
    }
  }

  // Get the absolute node path for extraction of relative links.
  $path = url('node/'. $node->nid, $url_options);

  // Extract all links in a node.
  $links = _linkchecker_extract_links(implode(' ', $text_items), $path);

  // Return either the array of links, or an array of field names containing
  // each link, depending on what was requested.
  if (!$return_field_names) {
    return $links;
  }
  else {
    $field_names = array();
    foreach ($text_items_by_field as $field_name => $items) {
      foreach ($items as $item) {
        foreach ($links as $uri => $link) {
          // We only need to do a quick check here to see if the URL appears
          // anywhere in the text; if so, that means users with access to this
          // field will be able to see the URL (and any private data such as
          // passwords contained in it). This is sufficient for the purposes of
          // _linkchecker_link_node_ids(), where this information is used.
          foreach ($link as $original_link) {
            if (strpos($item, $original_link) !== FALSE) {
              $field_names[$uri][$field_name] = $field_name;
            }
          }
        }
      }
    }
    return $field_names;
  }
}

/**
 * Add node links to database.
 *
 * @param $node
 *   The fully populated node object.
 * @param $skip_missing_links_detection
 *   To prevent endless batch loops the value needs to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
 */
function _linkchecker_add_node_links($node, $skip_missing_links_detection = FALSE) {
  $links = array_keys(_linkchecker_extract_node_links($node));

  // Node has links.
  if (!empty($links)) {
    // Remove all links from the links array already in the database
    // and only add missing links to database.
    $missing_links = _linkchecker_node_links_missing($node->nid, $links);

    // Only add links to database that do not exist.
    $i = 0;
    foreach ($missing_links as $url) {
      // Links are shared between contents and looked up by the md5 hash of
      // their URL; a new link record is only written if none exists yet.
      $urlhash = md5($url);
      $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash));
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_links', $link);
      }
      db_query("INSERT INTO {linkchecker_nodes} (nid, lid) VALUES (%d, %d)", $node->nid, $link->lid);

      // Break processing if max links limit per run has been reached.
      $i++;
      if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The first chunk of links not yet found in the {linkchecker_links} table
    // have now been imported by the above code. If the number of missing links
    // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // the content needs to be re-scanned until all links have been collected
    // and saved in {linkchecker_links} table.
    //
    // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links and these need to be subtracted from the number of missing links
    // to calculate the correct number of re-scan rounds.
    //
    // To prevent endless loops the $skip_missing_links_detection needs to be
    // TRUE. This value will be set by the calling batch process that already
    // knows that it is running a batch job and the number of required re-scan
    // rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_node($node->nid, $missing_links_count));

      // If batches were set in the submit handlers, we process them now,
      // possibly ending execution. We make sure we do not react to the batch
      // that is already being processed (if a batch operation performs a
      // drupal_execute).
      if ($batch = &batch_get() && !isset($batch['current_set'])) {
        batch_process('node/' . $node->nid);
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_node_references($node->nid, $links);
}

/**
 * Add comment links to database.
 *
 * @param $comment
 *   The fully populated comment array.
 * @param $skip_missing_links_detection
 *   To prevent endless batch loops the value needs to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
 */
function _linkchecker_add_comment_links($comment, $skip_missing_links_detection = FALSE) {
  // Create array of comment fields to scan.
  $text_items = array();
  $text_items[] = _filter_url($comment['subject'], $comment['format']);
  $text_items[] = _linkchecker_check_markup($comment['comment'], $comment['format'], FALSE);

  // Get the absolute node path for extraction of relative links. The language
  // option is only passed if the parent node has a known language, the same
  // way _linkchecker_extract_node_links() does it. Language neutral nodes
  // have an empty language and would otherwise trigger an undefined index
  // notice on the language list.
  $languages = language_list();
  $node_language = db_result(db_query("SELECT language FROM {node} WHERE nid = %d", $comment['nid']));
  $url_options = array('absolute' => TRUE);
  if (!empty($node_language) && isset($languages[$node_language])) {
    $url_options['language'] = $languages[$node_language];
  }
  $path = url('node/'. $comment['nid'], $url_options);

  // Extract all links in a comment.
  $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items), $path));

  // Comment has links.
  if (!empty($links)) {
    // Remove all links from the links array already in the database
    // and only add missing links to database.
    $missing_links = _linkchecker_comment_links_missing($comment['cid'], $links);

    // Only add unique links to database that do not exist.
    $i = 0;
    foreach ($missing_links as $url) {
      // Links are shared between contents and looked up by the md5 hash of
      // their URL; a new link record is only written if none exists yet.
      $urlhash = md5($url);
      $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash));
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_links', $link);
      }
      db_query("INSERT INTO {linkchecker_comments} (cid, lid) VALUES (%d, %d)", $comment['cid'], $link->lid);

      // Break processing if max links limit per run has been reached.
      $i++;
      if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The first chunk of links not yet found in the {linkchecker_links} table
    // have now been imported by the above code. If the number of missing links
    // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // the content needs to be re-scanned until all links have been collected
    // and saved in {linkchecker_links} table.
    //
    // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links and these need to be subtracted from the number of missing links
    // to calculate the correct number of re-scan rounds.
    //
    // To prevent endless loops the $skip_missing_links_detection needs to be
    // TRUE. This value will be set by the calling batch process that already
    // knows that it is running a batch job and the number of required re-scan
    // rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_comment($comment['cid'], $missing_links_count));

      // If batches were set in the submit handlers, we process them now,
      // possibly ending execution. We make sure we do not react to the batch
      // that is already being processed (if a batch operation performs a
      // drupal_execute).
      if ($batch = &batch_get() && !isset($batch['current_set'])) {
        batch_process('node/' . $comment['nid']);
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_comment_references($comment['cid'], $links);
}

/**
 * Add block links to database.
 *
 * @param array $box
 *   The fully populated block array.
 * @param integer $bid
 *   Block id from table {boxes}.bid.
 * @param $skip_missing_links_detection
 *   To prevent endless batch loops the value need to be TRUE. With FALSE
 *   the need for content re-scans is detected by the number of missing links.
*/
function _linkchecker_add_box_links($box, $bid, $skip_missing_links_detection = FALSE) {
  // The block configure submit handler runs for the blocks of every module
  // and passes the block delta as $bid. Only blocks of the 'block' module are
  // boxes; for any other module the values have no 'info' or 'body' and the
  // cleanup at the end would delete the link references of an unrelated box
  // that happens to have the same id. Arrays without a 'module' key are
  // still processed as before.
  if (isset($box['module']) && $box['module'] != 'block') {
    return;
  }

  // Create array of box fields to scan.
  $text_items = array();
  $text_items[] = _filter_url($box['info'], $box['format']);
  $text_items[] = _linkchecker_check_markup($box['body'], $box['format'], FALSE);

  // Extract all links in a box. No content path is passed, so relative links
  // are not resolved here.
  $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items)));

  // Box has links.
  if (!empty($links)) {
    // Remove all links from the links array already in the database
    // and only add missing links to database.
    $missing_links = _linkchecker_box_links_missing($bid, $links);

    // Only add unique links to database that do not exist.
    $i = 0;
    foreach ($missing_links as $url) {
      // Links are shared between contents and looked up by the md5 hash of
      // their URL; a new link record is only written if none exists yet.
      $urlhash = md5($url);
      $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash));
      if (!$link) {
        $link = new stdClass();
        $link->urlhash = $urlhash;
        $link->url = $url;
        $link->status = _linkchecker_link_check_status_filter($url);
        drupal_write_record('linkchecker_links', $link);
      }
      db_query("INSERT INTO {linkchecker_boxes} (bid, lid) VALUES (%d, %d)", $bid, $link->lid);

      // Break processing if max links limit per run has been reached.
      $i++;
      if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) {
        break;
      }
    }

    // The first chunk of links not yet found in the {linkchecker_links} table
    // have now been imported by the above code. If the number of missing links
    // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // the content needs to be re-scanned until all links have been collected
    // and saved in {linkchecker_links} table.
    //
    // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN
    // links and these need to be subtracted from the number of missing links
    // to calculate the correct number of re-scan rounds.
    //
    // To prevent endless loops the $skip_missing_links_detection needs to be
    // TRUE. This value will be set by the calling batch process that already
    // knows that it is running a batch job and the number of required re-scan
    // rounds.
    $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN;
    if (!$skip_missing_links_detection && $missing_links_count > 0) {
      module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch');
      batch_set(_linkchecker_batch_import_single_box($bid, $missing_links_count));

      // If batches were set in the submit handlers, we process them now,
      // possibly ending execution. We make sure we do not react to the batch
      // that is already being processed (if a batch operation performs a
      // drupal_execute).
      if ($batch = &batch_get() && !isset($batch['current_set'])) {
        batch_process('admin/build/block');
      }
    }
  }

  // Remove dead link references for cleanup reasons as very last step.
  _linkchecker_cleanup_box_references($bid, $links);
}

/**
 * Remove all node references to links in the linkchecker_nodes table.
 *
 * @param $nid
 *   The node id whose references are deleted.
 * @return
 *   The result of the delete query.
 */
function _linkchecker_delete_node_links($nid) {
  return db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid);
}

/**
 * Remove all comment references to links in the linkchecker_comments table.
 *
 * @param $cid
 *   The comment id whose references are deleted.
 * @return
 *   The result of the delete query.
 */
function _linkchecker_delete_comment_links($cid) {
  return db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid);
}

/**
 * Remove all box references to links in the linkchecker_boxes table.
 *
 * @param $bid
 *   The box id whose references are deleted.
 * @return
 *   The result of the delete query.
 */
function _linkchecker_delete_box_links($bid) {
  return db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid);
}

/**
 * Cleanup no longer used node references to links in the linkchecker_nodes table.
 *
 * @param $nid
 *   The node id to clean up.
 * @param $links
 *   Array of URLs currently found in the node. References to all other links
 *   are removed.
 */
function _linkchecker_cleanup_node_references($nid = 0, $links = array()) {
  if (empty($links)) {
    // Node does not have links. Delete all references if they exist.
    db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid);
  }
  else {
    // The node still has at least one link, but other links may have been
    // removed and links no longer in the content need to be deleted from the
    // linkchecker_nodes reference table.
    db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($nid), array_map('md5', $links)));
  }
}

/**
 * Cleanup no longer used comment references to links in the linkchecker_comments table.
 *
 * @param $cid
 *   The comment id to clean up.
 * @param $links
 *   Array of URLs currently found in the comment. References to all other
 *   links are removed.
 */
function _linkchecker_cleanup_comment_references($cid = 0, $links = array()) {
  if (empty($links)) {
    // Comment does not have links. Delete all references if they exist.
    db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid);
  }
  else {
    // The comment still has at least one link, but other links may have been
    // removed and links no longer in the content need to be deleted from the
    // linkchecker_comments reference table.
    db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($cid), array_map('md5', $links)));
  }
}

/**
 * Cleanup no longer used box references to links in the linkchecker_boxes table.
 *
 * @param $bid
 *   The box id to clean up.
 * @param $links
 *   Array of URLs currently found in the box. References to all other links
 *   are removed.
 */
function _linkchecker_cleanup_box_references($bid = 0, $links = array()) {
  if (empty($links)) {
    // Block does not have links. Delete all references if they exist.
    db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid);
  }
  else {
    // The block still has at least one link, but other links may have been
    // removed and links no longer in the content need to be deleted from the
    // linkchecker_boxes reference table.
    db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($bid), array_map('md5', $links)));
  }
}

/**
 * Returns an array of node references missing in the linkchecker_nodes table.
 *
 * @param $nid
 *   The node id to check.
 * @param $links
 *   Array of URLs found in the node. The callers in this file only pass a
 *   non-empty array.
 * @return
 *   The URLs from $links that are not yet referenced by the node.
 */
function _linkchecker_node_links_missing($nid, $links) {
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_nodes} ln ON ll.lid = ln.lid WHERE ln.nid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($nid), array_map('md5', $links)));
  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }
  return array_diff($links, $links_in_database);
}

/**
 * Returns an array of comment references missing in the linkchecker_comments table.
 *
 * @param $cid
 *   The comment id to check.
 * @param $links
 *   Array of URLs found in the comment. The callers in this file only pass a
 *   non-empty array.
 * @return
 *   The URLs from $links that are not yet referenced by the comment.
 */
function _linkchecker_comment_links_missing($cid, $links) {
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_comments} lc ON ll.lid = lc.lid WHERE lc.cid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($cid), array_map('md5', $links)));
  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }
  return array_diff($links, $links_in_database);
}

/**
 * Returns an array of box references missing in the linkchecker_boxes table.
 *
 * @param $bid
 *   The box id to check.
 * @param $links
 *   Array of URLs found in the box. The callers in this file only pass a
 *   non-empty array.
 * @return
 *   The URLs from $links that are not yet referenced by the box.
 */
function _linkchecker_box_links_missing($bid, $links) {
  $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_boxes} lb ON ll.lid = lb.lid WHERE lb.bid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($bid), array_map('md5', $links)));
  $links_in_database = array();
  while ($row = db_fetch_object($res)) {
    $links_in_database[] = $row->url;
  }
  return array_diff($links, $links_in_database);
}

/**
 * Run periodically via cron and delete all links without a reference.
 *
 * For speed reasons and check results we keep the links for some time
 * as they may be reused by other new content.
 */
function _linkchecker_cleanup_links() {
  // Remove node references of node types that are no longer enabled for
  // scanning.
  $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array())));
  if (!empty($node_types)) {
    db_query('DELETE FROM {linkchecker_nodes} WHERE nid IN (SELECT nid FROM {node} n WHERE n.type NOT IN (' . db_placeholders($node_types, 'varchar') . '))', $node_types);
    // FIXME: Remove comments
    //db_query('DELETE FROM {linkchecker_comments} WHERE cid IN (SELECT nid FROM {node} n WHERE n.type NOT IN (' . db_placeholders($node_types, 'varchar') . '))', $node_types);
  }
  else {
    // No node type is enabled at all, so every node reference is obsolete.
    db_query('TRUNCATE TABLE {linkchecker_nodes}');
    // FIXME: Remove comments
  }

  // Remove comment link references if comment scanning is disabled.
  // TODO: Remove comments of unpublished nodes.
  if (variable_get('linkchecker_scan_comments', 0) == 0) {
    db_query('TRUNCATE TABLE {linkchecker_comments}');
  }

  // Remove block link references if block scanning is disabled.
  if (variable_get('linkchecker_scan_blocks', 0) == 0) {
    db_query('TRUNCATE TABLE {linkchecker_boxes}');
  }

  // Remove dead links without references.
  db_query('DELETE FROM {linkchecker_links} WHERE lid NOT IN ( SELECT DISTINCT lid FROM {linkchecker_boxes} UNION SELECT DISTINCT lid FROM {linkchecker_comments} UNION SELECT DISTINCT lid FROM {linkchecker_nodes} )');
}

/**
 * Extract links from content.
 *
 * @param string $text
 *   The text to be scanned for links.
 * @param string $content_path
 *   Path to the content that is currently scanned for links. This value is
 *   required to build full qualified links from relative links. Relative links
 *   are not extracted from content, if path is not provided.
 * @return array
 *   Array whose keys are fully qualified and unique URLs found in the
 *   content, and whose values are arrays of actual text (raw URLs or paths)
 *   corresponding to each fully qualified URL.
 */
function _linkchecker_extract_links($text = '', $content_path = NULL) {
  global $base_root;

  // Finds all hyperlinks in the content.
$matches_a = array(1 => NULL); if (variable_get('linkchecker_extract_from_a', 1) == 1) { // Extract all chars in the href value, except double and single quotes. $pattern_a = '/<(?:a|area)\s[^>]*href=["\']([^"\']*)["\'][^>]*>/i'; preg_match_all($pattern_a, $text, $matches_a); } // Finds all audio links in the content. $matches_audio = array(1 => NULL); if (variable_get('linkchecker_extract_from_audio', 1) == 1) { $pattern_audio = '/