2024-04-29 13:12:44 +05:45

1344 lines
35 KiB
PHP

<?php
/**
* The crawler class
*
* @since 1.1.0
*/
namespace LiteSpeed;
defined('WPINC') || exit();
class Crawler extends Root
{
const LOG_TAG = '🕸️';
const TYPE_REFRESH_MAP = 'refresh_map';
const TYPE_EMPTY = 'empty';
const TYPE_BLACKLIST_EMPTY = 'blacklist_empty';
const TYPE_BLACKLIST_DEL = 'blacklist_del';
const TYPE_BLACKLIST_ADD = 'blacklist_add';
const TYPE_START = 'start';
const TYPE_RESET = 'reset';
const USER_AGENT = 'lscache_walker';
const FAST_USER_AGENT = 'lscache_runner';
const CHUNKS = 10000;
private $_sitemeta = 'meta.data';
private $_resetfile;
private $_end_reason;
private $_ncpu = 1;
private $_crawler_conf = array(
'cookies' => array(),
'headers' => array(),
'ua' => '',
);
private $_crawlers = array();
private $_cur_threads = -1;
private $_max_run_time;
private $_cur_thread_time;
private $_map_status_list = array(
'H' => array(),
'M' => array(),
'B' => array(),
'N' => array(),
);
protected $_summary;
/**
* Initialize crawler, assign sitemap path
*
* @since 1.1.0
*/
public function __construct()
{
if (is_multisite()) {
$this->_sitemeta = 'meta' . get_current_blog_id() . '.data';
}
$this->_resetfile = LITESPEED_STATIC_DIR . '/crawler/' . $this->_sitemeta . '.reset';
$this->_summary = self::get_summary();
$this->_ncpu = $this->_get_server_cpu();
self::debug('Init w/ CPU cores=' . $this->_ncpu);
}
/**
* Try get server CPUs
* @since 5.2
*/
private function _get_server_cpu()
{
$cpuinfo_file = '/proc/cpuinfo';
$setting_open_dir = ini_get('open_basedir');
if ($setting_open_dir) {
return 1;
} // Server has limit
try {
if (!@is_file($cpuinfo_file)) {
return 1;
}
} catch (\Exception $e) {
return 1;
}
$cpuinfo = file_get_contents($cpuinfo_file);
preg_match_all('/^processor/m', $cpuinfo, $matches);
return count($matches[0]) ?: 1;
}
/**
* Check whether the current crawler is active/runable/useable/enabled/want it to work or not
*
* @since 4.3
*/
public function is_active($curr)
{
$bypass_list = self::get_option('bypass_list', array());
return !in_array($curr, $bypass_list);
}
/**
* Toggle the current crawler's activeness state, i.e., runable/useable/enabled/want it to work or not, and return the updated state
*
* @since 4.3
*/
public function toggle_activeness($curr)
{
// param type: int
$bypass_list = self::get_option('bypass_list', array());
if (in_array($curr, $bypass_list)) {
// when the ith opt was off / in the bypassed list, turn it on / remove it from the list
unset($bypass_list[array_search($curr, $bypass_list)]);
$bypass_list = array_values($bypass_list);
self::update_option('bypass_list', $bypass_list);
return true;
} else {
// when the ith opt was on / not in the bypassed list, turn it off / add it to the list
$bypass_list[] = (int) $curr;
self::update_option('bypass_list', $bypass_list);
return false;
}
}
/**
* Clear bypassed list
*
* @since 4.3
* @access public
*/
public function clear_disabled_list()
{
self::update_option('bypass_list', array());
$msg = __('Crawler disabled list is cleared! All crawlers are set to active! ', 'litespeed-cache');
Admin_Display::note($msg);
self::debug('All crawlers are set to active...... ');
}
/**
* Overwride get_summary to init elements
*
* @since 3.0
* @access public
*/
public static function get_summary($field = false)
{
$_default = array(
'list_size' => 0,
'last_update_time' => 0,
'curr_crawler' => 0,
'curr_crawler_beginning_time' => 0,
'last_pos' => 0,
'last_count' => 0,
'last_crawled' => 0,
'last_start_time' => 0,
'last_status' => '',
'is_running' => 0,
'end_reason' => '',
'meta_save_time' => 0,
'pos_reset_check' => 0,
'done' => 0,
'this_full_beginning_time' => 0,
'last_full_time_cost' => 0,
'last_crawler_total_cost' => 0,
'crawler_stats' => array(), // this will store all crawlers hit/miss crawl status
);
wp_cache_delete('alloptions', 'options'); // ensure the summary is current
$summary = parent::get_summary();
$summary = array_merge($_default, $summary);
if (!$field) {
return $summary;
}
if (array_key_exists($field, $summary)) {
return $summary[$field];
}
return null;
}
/**
* Overwride save_summary
*
* @since 3.0
* @access public
*/
public static function save_summary($data = false, $reload = false, $overwrite = false)
{
$instance = self::cls();
$instance->_summary['meta_save_time'] = time();
if (!$data) {
$data = $instance->_summary;
}
parent::save_summary($data, $reload, $overwrite);
File::save(LITESPEED_STATIC_DIR . '/crawler/' . $instance->_sitemeta, json_encode($data), true);
}
/**
* Cron start async crawling
*
* @since 5.5
*/
public static function start_async_cron()
{
Task::async_call('crawler');
}
/**
* Manually start async crawling
*
* @since 5.5
*/
public static function start_async()
{
Task::async_call('crawler_force');
$msg = __('Started async crawling', 'litespeed-cache');
Admin_Display::success($msg);
}
/**
* Ajax crawl handler
*
* @since 5.5
*/
public static function async_handler($manually_run = false)
{
self::debug('------------async-------------start_async_handler');
// self::debug('-------------async------------ check_ajax_referer');
// add_action('check_ajax_referer', function ($a, $b) {
// \LiteSpeed\Crawler::debug('---------------' . $a . $b);
// });
// check_ajax_referer('async_crawler', 'nonce');
// self::debug('--------------async----------- start async crawling');
self::start($manually_run);
}
/**
* Proceed crawling
*
* @since 1.1.0
* @access public
*/
public static function start($manually_run = false)
{
if (!Router::can_crawl()) {
self::debug('......crawler is NOT allowed by the server admin......');
return false;
}
if ($manually_run) {
self::debug('......crawler manually ran......');
}
// $i = 0;
// while ($i < 100) {
// self::debug('......sleep ' . ($i++) . '......' . time());
// sleep(1);
// }
// return;
self::cls()->_crawl_data($manually_run);
}
/**
* Crawling start
*
* @since 1.1.0
* @access private
*/
private function _crawl_data($manually_run)
{
if (!defined('LITESPEED_LANE_HASH')) {
define('LITESPEED_LANE_HASH', Str::rrand(8));
}
if ($this->_check_valid_lane()) {
$this->_take_over_lane();
} else {
self::debug('⚠️ lane in use');
return;
// if ($manually_run) {
// self::debug('......crawler started (manually_rund)......');
// // Log pid to prevent from multi running
// if (defined('LITESPEED_CLI')) {
// // Take over lane
// self::debug('⚠️⚠️⚠️ Forced take over lane (CLI)');
// $this->_take_over_lane();
// }
// }
}
self::debug('......crawler started......');
// for the first time running
if (!$this->_summary || !Data::cls()->tb_exist('crawler') || !Data::cls()->tb_exist('crawler_blacklist')) {
$this->cls('Crawler_Map')->gen();
}
// if finished last time, regenerate sitemap
if ($this->_summary['done'] === 'touchedEnd') {
// check whole crawling interval
$last_fnished_at = $this->_summary['last_full_time_cost'] + $this->_summary['this_full_beginning_time'];
if (!$manually_run && time() - $last_fnished_at < $this->conf(Base::O_CRAWLER_CRAWL_INTERVAL)) {
self::debug('Cron abort: cache warmed already.');
// if not reach whole crawling interval, exit
$this->Release_lane();
return;
}
self::debug('TouchedEnd. regenerate sitemap....');
$this->cls('Crawler_Map')->gen();
}
$this->list_crawlers();
// Skip the crawlers that in bypassed list
while (!$this->is_active($this->_summary['curr_crawler']) && $this->_summary['curr_crawler'] < count($this->_crawlers)) {
self::debug('Skipped the Crawler #' . $this->_summary['curr_crawler'] . ' ......');
$this->_summary['curr_crawler']++;
}
if ($this->_summary['curr_crawler'] >= count($this->_crawlers)) {
$this->_end_reason = 'end';
$this->_terminate_running();
$this->Release_lane();
return;
}
// In case crawlers are all done but not reload, reload it
if (empty($this->_summary['curr_crawler']) || empty($this->_crawlers[$this->_summary['curr_crawler']])) {
$this->_summary['curr_crawler'] = 0;
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']] = array();
}
$this->load_conf();
try {
$this->_engine_start();
$this->Release_lane();
} catch (\Exception $e) {
self::debug('🛑 ' . $e->getMessage());
}
}
/**
* Load conf before running crawler
*
* @since 3.0
* @access private
*/
private function load_conf()
{
$this->_crawler_conf['base'] = home_url();
$current_crawler = $this->_crawlers[$this->_summary['curr_crawler']];
/**
* Set role simulation
* @since 1.9.1
*/
if (!empty($current_crawler['uid'])) {
// Get role simulation vary name
$vary_name = $this->cls('Vary')->get_vary_name();
$vary_val = $this->cls('Vary')->finalize_default_vary($current_crawler['uid']);
$this->_crawler_conf['cookies'][$vary_name] = $vary_val;
$this->_crawler_conf['cookies']['litespeed_role'] = $current_crawler['uid'];
}
/**
* Check cookie crawler
* @since 2.8
*/
foreach ($current_crawler as $k => $v) {
if (strpos($k, 'cookie:') !== 0) {
continue;
}
if ($v == '_null') {
continue;
}
$this->_crawler_conf['cookies'][substr($k, 7)] = $v;
}
/**
* Set WebP simulation
* @since 1.9.1
*/
if (!empty($current_crawler['webp'])) {
$this->_crawler_conf['headers'][] = 'Accept: image/webp,*/*';
}
/**
* Set mobile crawler
* @since 2.8
*/
if (!empty($current_crawler['mobile'])) {
$this->_crawler_conf['ua'] = 'Mobile iPhone';
}
/**
* Limit delay to use server setting
* @since 1.8.3
*/
$this->_crawler_conf['run_delay'] = $this->conf(Base::O_CRAWLER_USLEEP); // microseconds
if (!empty($_SERVER[Base::ENV_CRAWLER_USLEEP]) && $_SERVER[Base::ENV_CRAWLER_USLEEP] > $this->_crawler_conf['run_delay']) {
$this->_crawler_conf['run_delay'] = $_SERVER[Base::ENV_CRAWLER_USLEEP];
}
$this->_crawler_conf['run_duration'] = $this->conf(Base::O_CRAWLER_RUN_DURATION);
$this->_crawler_conf['load_limit'] = $this->conf(Base::O_CRAWLER_LOAD_LIMIT);
if (!empty($_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE])) {
$this->_crawler_conf['load_limit'] = $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE];
} elseif (!empty($_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT]) && $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT] < $this->_crawler_conf['load_limit']) {
$this->_crawler_conf['load_limit'] = $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT];
}
}
/**
* Start crawler
*
* @since 1.1.0
* @access private
*/
private function _engine_start()
{
// check if is running
// if ($this->_summary['is_running'] && time() - $this->_summary['is_running'] < $this->_crawler_conf['run_duration']) {
// $this->_end_reason = 'stopped';
// self::debug('The crawler is running.');
// return;
// }
// check current load
$this->_adjust_current_threads();
if ($this->_cur_threads == 0) {
$this->_end_reason = 'stopped_highload';
self::debug('Stopped due to heavy load.');
return;
}
// log started time
self::save_summary(array('last_start_time' => time()));
// set time limit
$maxTime = (int) ini_get('max_execution_time');
self::debug('ini_get max_execution_time=' . $maxTime);
if ($maxTime == 0) {
$maxTime = 300; // hardlimit
} else {
$maxTime -= 5;
}
if ($maxTime >= $this->_crawler_conf['run_duration']) {
$maxTime = $this->_crawler_conf['run_duration'];
self::debug('Use run_duration setting as max_execution_time=' . $maxTime);
} elseif (ini_set('max_execution_time', $this->_crawler_conf['run_duration'] + 15) !== false) {
$maxTime = $this->_crawler_conf['run_duration'];
self::debug('ini_set max_execution_time=' . $maxTime);
}
self::debug('final max_execution_time=' . $maxTime);
$this->_max_run_time = $maxTime + time();
// mark running
$this->_prepare_running();
// run cralwer
$this->_do_running();
$this->_terminate_running();
}
/**
* Get server load
*
* @since 5.5
*/
public function get_server_load()
{
/**
* If server is windows, exit
* @see https://wordpress.org/support/topic/crawler-keeps-causing-crashes/
*/
if (!function_exists('sys_getloadavg')) {
return -1;
}
$curload = sys_getloadavg();
$curload = $curload[0];
self::debug('Server load: ' . $curload);
return $curload;
}
/**
* Adjust threads dynamically
*
* @since 1.1.0
* @access private
*/
private function _adjust_current_threads()
{
$curload = $this->get_server_load();
if ($curload == -1) {
self::debug('set threads=0 due to func sys_getloadavg not exist!');
$this->_cur_threads = 0;
return;
}
$curload /= $this->_ncpu;
// $curload = 1;
if ($this->_cur_threads == -1) {
// init
if ($curload > $this->_crawler_conf['load_limit']) {
$curthreads = 0;
} elseif ($curload >= $this->_crawler_conf['load_limit'] - 1) {
$curthreads = 1;
} else {
$curthreads = intval($this->_crawler_conf['load_limit'] - $curload);
if ($curthreads > $this->conf(Base::O_CRAWLER_THREADS)) {
$curthreads = $this->conf(Base::O_CRAWLER_THREADS);
}
}
} else {
// adjust
$curthreads = $this->_cur_threads;
if ($curload >= $this->_crawler_conf['load_limit'] + 1) {
sleep(5); // sleep 5 secs
if ($curthreads >= 1) {
$curthreads--;
}
} elseif ($curload >= $this->_crawler_conf['load_limit']) {
// if ( $curthreads > 1 ) {// if already 1, keep
$curthreads--;
// }
} elseif ($curload + 1 < $this->_crawler_conf['load_limit']) {
if ($curthreads < $this->conf(Base::O_CRAWLER_THREADS)) {
$curthreads++;
}
}
}
// $log = 'set current threads = ' . $curthreads . ' previous=' . $this->_cur_threads
// . ' max_allowed=' . $this->conf( Base::O_CRAWLER_THREADS ) . ' load_limit=' . $this->_crawler_conf[ 'load_limit' ] . ' current_load=' . $curload;
$this->_cur_threads = $curthreads;
$this->_cur_thread_time = time();
}
/**
* Mark running status
*
* @since 1.1.0
* @access private
*/
private function _prepare_running()
{
$this->_summary['is_running'] = time();
$this->_summary['done'] = 0; // reset done status
$this->_summary['last_status'] = 'prepare running';
$this->_summary['last_crawled'] = 0;
// Current crawler starttime mark
if ($this->_summary['last_pos'] == 0) {
$this->_summary['curr_crawler_beginning_time'] = time();
}
if ($this->_summary['curr_crawler'] == 0 && $this->_summary['last_pos'] == 0) {
$this->_summary['this_full_beginning_time'] = time();
$this->_summary['list_size'] = $this->cls('Crawler_Map')->count_map();
}
if ($this->_summary['end_reason'] == 'end' && $this->_summary['last_pos'] == 0) {
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']] = array();
}
self::save_summary();
}
/**
* Take over lane
* @since 6.1
*/
private function _take_over_lane()
{
self::debug('Take over lane as lane is free: ' . $this->json_local_path() . '.pid');
file::save($this->json_local_path() . '.pid', LITESPEED_LANE_HASH);
}
/**
* Update lane file
* @since 6.1
*/
private function _touch_lane()
{
touch($this->json_local_path() . '.pid');
}
/**
* Release lane file
* @since 6.1
*/
public function Release_lane()
{
$lane_file = $this->json_local_path() . '.pid';
if (!file_exists($lane_file)) {
return;
}
self::debug('Release lane');
unlink($lane_file);
}
/**
* Check if lane is used by other crawlers
* @since 6.1
*/
private function _check_valid_lane($strict_mode = false)
{
// Check lane hash
$lane_file = $this->json_local_path() . '.pid';
if ($strict_mode) {
if (!file_exists($lane_file)) {
self::debug("lane file not existed, strict mode is false [file] $lane_file");
return false;
}
}
$pid = file::read($lane_file);
if ($pid && LITESPEED_LANE_HASH != $pid) {
// If lane file is older than 1h, ignore
if (time() - filemtime($lane_file) > 3600) {
self::debug('Lane file is older than 1h, releasing lane');
$this->Release_lane();
return true;
}
return false;
}
return true;
}
/**
* Run crawler
*
* @since 1.1.0
* @access private
*/
private function _do_running()
{
$options = $this->_get_curl_options(true);
while ($urlChunks = $this->cls('Crawler_Map')->list_map(self::CHUNKS, $this->_summary['last_pos'])) {
// self::debug('$urlChunks=' . count($urlChunks) . ' $this->_cur_threads=' . $this->_cur_threads);
// start crawling
$urlChunks = array_chunk($urlChunks, $this->_cur_threads);
// self::debug('$urlChunks after array_chunk: ' . count($urlChunks));
foreach ($urlChunks as $rows) {
if (!$this->_check_valid_lane(true)) {
$this->_end_reason = 'lane_invalid';
self::debug('🛑 The crawler lane is used by newer crawler.');
throw new \Exception('invalid crawler lane');
}
// Update time
$this->_touch_lane();
// self::debug('chunk fetching count($rows)= ' . count($rows));
// multi curl
$rets = $this->_multi_request($rows, $options);
// check result headers
foreach ($rows as $row) {
// self::debug('chunk fetching 553');
if (empty($rets[$row['id']])) {
// If already in blacklist, no curl happened, no corresponding record
continue;
}
// self::debug('chunk fetching 557');
// check response
if ($rets[$row['id']]['code'] == 428) {
// HTTP/1.1 428 Precondition Required (need to test)
$this->_end_reason = 'crawler_disabled';
self::debug('crawler_disabled');
return;
}
$status = $this->_status_parse($rets[$row['id']]['header'], $rets[$row['id']]['code'], $row['url']); // B or H or M or N(nocache)
self::debug('[status] ' . $this->_status2title($status) . "\t\t [url] " . $row['url']);
$this->_map_status_list[$status][$row['id']] = array(
'url' => $row['url'],
'code' => $rets[$row['id']]['code'], // 201 or 200 or 404
);
if (empty($this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status])) {
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status] = 0;
}
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status]++;
}
// update offset position
$_time = time();
$this->_summary['last_count'] = count($rows);
$this->_summary['last_pos'] += $this->_summary['last_count'];
$this->_summary['last_crawled'] += $this->_summary['last_count'];
$this->_summary['last_update_time'] = $_time;
$this->_summary['last_status'] = 'updated position';
// self::debug("chunk fetching 604 last_pos:{$this->_summary['last_pos']} last_count:{$this->_summary['last_count']} last_crawled:{$this->_summary['last_crawled']}");
// check duration
if ($this->_summary['last_update_time'] > $this->_max_run_time) {
$this->_end_reason = 'stopped_maxtime';
self::debug('Terminated due to maxtime');
return;
// return __('Stopped due to exceeding defined Maximum Run Time', 'litespeed-cache');
}
// make sure at least each 10s save meta & map status once
if ($_time - $this->_summary['meta_save_time'] > 10) {
$this->_map_status_list = $this->cls('Crawler_Map')->save_map_status($this->_map_status_list, $this->_summary['curr_crawler']);
self::save_summary();
}
// self::debug('chunk fetching 597');
// check if need to reset pos each 5s
if ($_time > $this->_summary['pos_reset_check']) {
$this->_summary['pos_reset_check'] = $_time + 5;
if (file_exists($this->_resetfile) && unlink($this->_resetfile)) {
self::debug('Terminated due to reset file');
$this->_summary['last_pos'] = 0;
$this->_summary['curr_crawler'] = 0;
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']] = array();
// reset done status
$this->_summary['done'] = 0;
$this->_summary['this_full_beginning_time'] = 0;
$this->_end_reason = 'stopped_reset';
return;
// return __('Stopped due to reset meta position', 'litespeed-cache');
}
}
// self::debug('chunk fetching 615');
// check loads
if ($this->_summary['last_update_time'] - $this->_cur_thread_time > 60) {
$this->_adjust_current_threads();
if ($this->_cur_threads == 0) {
$this->_end_reason = 'stopped_highload';
self::debug('🛑 Terminated due to highload');
return;
// return __('Stopped due to load over limit', 'litespeed-cache');
}
}
$this->_summary['last_status'] = 'sleeping ' . $this->_crawler_conf['run_delay'] . 'ms';
usleep($this->_crawler_conf['run_delay']);
}
// self::debug('chunk fetching done');
}
// All URLs are done for current crawler
$this->_end_reason = 'end';
$this->_summary['crawler_stats'][$this->_summary['curr_crawler']]['W'] = 0;
self::debug('Crawler #' . $this->_summary['curr_crawler'] . ' touched end');
}
/**
* Send multi curl requests
* If res=B, bypass request and won't return
*
* @since 1.1.0
* @access private
*/
private function _multi_request($rows, $options)
{
if (!function_exists('curl_multi_init')) {
exit('curl_multi_init disabled');
}
$mh = curl_multi_init();
$curls = array();
foreach ($rows as $row) {
if (substr($row['res'], $this->_summary['curr_crawler'], 1) == 'B') {
continue;
}
if (substr($row['res'], $this->_summary['curr_crawler'], 1) == 'N') {
continue;
}
if (!function_exists('curl_init')) {
exit('curl_init disabled');
}
$curls[$row['id']] = curl_init();
// Append URL
$url = $row['url'];
if ($this->conf(Base::O_CRAWLER_DROP_DOMAIN)) {
$url = $this->_crawler_conf['base'] . $row['url'];
}
curl_setopt($curls[$row['id']], CURLOPT_URL, $url);
self::debug('Crawling [url] ' . $url . ($url == $row['url'] ? '' : ' [ori] ' . $row['url']));
curl_setopt_array($curls[$row['id']], $options);
curl_multi_add_handle($mh, $curls[$row['id']]);
}
// self::debug('-----debug1');
// execute curl
if ($curls) {
do {
$status = curl_multi_exec($mh, $active);
if ($active) {
curl_multi_select($mh);
}
} while ($active && $status == CURLM_OK);
}
// self::debug('-----debug2');
// curl done
$ret = array();
foreach ($rows as $row) {
if (substr($row['res'], $this->_summary['curr_crawler'], 1) == 'B') {
continue;
}
if (substr($row['res'], $this->_summary['curr_crawler'], 1) == 'N') {
continue;
}
// self::debug('-----debug3');
$ch = $curls[$row['id']];
// Parse header
$header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
$content = curl_multi_getcontent($ch);
$header = substr($content, 0, $header_size);
$ret[$row['id']] = array(
'header' => $header,
'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
);
// self::debug('-----debug4');
curl_multi_remove_handle($mh, $ch);
curl_close($ch);
}
// self::debug('-----debug5');
curl_multi_close($mh);
// self::debug('-----debug6');
return $ret;
}
/**
* Translate the status to title
* @since 6.0
*/
private function _status2title($status)
{
if ($status == 'H') {
return '✅ Hit';
}
if ($status == 'M') {
return '😊 Miss';
}
if ($status == 'B') {
return '😅 Blacklisted';
}
if ($status == 'N') {
return '😅 Blacklisted';
}
return '🛸 Unknown';
}
/**
* Check returned curl header to find if cached or not
*
* @since 2.0
* @access private
*/
private function _status_parse($header, $code, $url)
{
if ($code == 201) {
return 'H';
}
if (stripos($header, 'X-Litespeed-Cache-Control: no-cache') !== false) {
// If is from DIVI, taken as miss
if (defined('LITESPEED_CRAWLER_IGNORE_NONCACHEABLE') && LITESPEED_CRAWLER_IGNORE_NONCACHEABLE) {
return 'M';
}
// If blacklist is disabled
if (
(defined('LITESPEED_CRAWLER_DISABLE_BLOCKLIST') && LITESPEED_CRAWLER_DISABLE_BLOCKLIST) ||
apply_filters('litespeed_crawler_disable_blocklist', '__return_false', $url)
) {
return 'M';
}
return 'N'; // Blacklist
}
$_cache_headers = array('x-litespeed-cache', 'x-lsadc-cache', 'x-qc-cache');
foreach ($_cache_headers as $_header) {
if (stripos($header, $_header) !== false) {
if (stripos($header, $_header . ': miss') !== false) {
return 'M'; // Miss
}
return 'H'; // Hit
}
}
// If blacklist is disabled
if (
(defined('LITESPEED_CRAWLER_DISABLE_BLOCKLIST') && LITESPEED_CRAWLER_DISABLE_BLOCKLIST) ||
apply_filters('litespeed_crawler_disable_blocklist', '__return_false', $url)
) {
return 'M';
}
return 'B'; // Blacklist
}
/**
* Get curl_options
*
* @since 1.1.0
* @access private
*/
private function _get_curl_options($crawler_only = false)
{
$options = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HEADER => true,
CURLOPT_CUSTOMREQUEST => 'GET',
CURLOPT_FOLLOWLOCATION => false,
CURLOPT_ENCODING => 'gzip',
CURLOPT_CONNECTTIMEOUT => 10,
CURLOPT_TIMEOUT => $this->conf(Base::O_CRAWLER_TIMEOUT), // Larger timeout to avoid incorrect blacklist addition #900171
CURLOPT_SSL_VERIFYHOST => 0,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_NOBODY => false,
CURLOPT_HTTPHEADER => $this->_crawler_conf['headers'],
);
$options[CURLOPT_HTTPHEADER][] = 'Cache-Control: max-age=0';
/**
* Try to enable http2 connection (only available since PHP7+)
* @since 1.9.1
* @since 2.2.7 Commented due to cause no-cache issue
* @since 2.9.1+ Fixed wrongly usage of CURL_HTTP_VERSION_1_1 const
*/
$options[CURLOPT_HTTP_VERSION] = CURL_HTTP_VERSION_1_1;
// $options[ CURL_HTTP_VERSION_2 ] = 1;
// IP resolve
if ($this->conf(Base::O_SERVER_IP)) {
Utility::compatibility();
if (($this->conf(Base::O_CRAWLER_DROP_DOMAIN) || !$crawler_only) && $this->_crawler_conf['base']) {
// Resolve URL to IP
$parsed_url = parse_url($this->_crawler_conf['base']);
if (!empty($parsed_url['host'])) {
$dom = $parsed_url['host'];
$port = $parsed_url['scheme'] == 'https' ? '443' : '80';
$url = $dom . ':' . $port . ':' . $this->conf(Base::O_SERVER_IP);
$options[CURLOPT_RESOLVE] = array($url);
$options[CURLOPT_DNS_USE_GLOBAL_CACHE] = false;
}
}
}
// if is walker
// $options[ CURLOPT_FRESH_CONNECT ] = true;
// Referer
if (isset($_SERVER['HTTP_HOST']) && isset($_SERVER['REQUEST_URI'])) {
$options[CURLOPT_REFERER] = 'http://' . $_SERVER['HTTP_HOST'] . $_SERVER['REQUEST_URI'];
}
// User Agent
if ($crawler_only) {
if (strpos($this->_crawler_conf['ua'], Crawler::FAST_USER_AGENT) !== 0) {
$this->_crawler_conf['ua'] = Crawler::FAST_USER_AGENT . ' ' . $this->_crawler_conf['ua'];
}
}
$options[CURLOPT_USERAGENT] = $this->_crawler_conf['ua'];
/**
* Append hash to cookie for validation
* @since 1.9.1
*/
if ($crawler_only) {
$this->_crawler_conf['cookies']['litespeed_hash'] = Router::get_hash();
}
// Cookies
$cookies = array();
foreach ($this->_crawler_conf['cookies'] as $k => $v) {
if (!$v) {
continue;
}
$cookies[] = $k . '=' . urlencode($v);
}
if ($cookies) {
$options[CURLOPT_COOKIE] = implode('; ', $cookies);
}
return $options;
}
/**
* Self curl to get HTML content
*
* @since 3.3
*/
public function self_curl($url, $ua, $uid = false, $accept = false)
{
// $accept not in use yet
$this->_crawler_conf['base'] = home_url();
$this->_crawler_conf['ua'] = $ua;
if ($accept) {
$this->_crawler_conf['headers'] = array('Accept: ' . $accept);
}
if ($uid) {
$this->_crawler_conf['cookies']['litespeed_role'] = $uid;
$this->_crawler_conf['cookies']['litespeed_hash'] = Router::get_hash();
}
$options = $this->_get_curl_options();
$options[CURLOPT_HEADER] = false;
$options[CURLOPT_FOLLOWLOCATION] = true;
$ch = curl_init();
curl_setopt_array($ch, $options);
curl_setopt($ch, CURLOPT_URL, $url);
$result = curl_exec($ch);
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
if ($code != 200) {
self::debug('❌ Response code is not 200 in self_curl() [code] ' . var_export($code, true));
return false;
}
return $result;
}
/**
* Terminate crawling
*
* @since 1.1.0
* @access private
*/
private function _terminate_running()
{
$this->_map_status_list = $this->cls('Crawler_Map')->save_map_status($this->_map_status_list, $this->_summary['curr_crawler']);
if ($this->_end_reason == 'end') {
// Current crawler is fully done
// $end_reason = sprintf( __( 'Crawler %s reached end of sitemap file.', 'litespeed-cache' ), '#' . ( $this->_summary['curr_crawler'] + 1 ) );
$this->_summary['curr_crawler']++; // Jump to next cralwer
// $this->_summary[ 'crawler_stats' ][ $this->_summary[ 'curr_crawler' ] ] = array(); // reset this at next crawl time
$this->_summary['last_pos'] = 0; // reset last position
$this->_summary['last_crawler_total_cost'] = time() - $this->_summary['curr_crawler_beginning_time'];
$count_crawlers = count($this->list_crawlers());
if ($this->_summary['curr_crawler'] >= $count_crawlers) {
self::debug('_terminate_running Touched end, whole crawled. Reload crawler!');
$this->_summary['curr_crawler'] = 0;
// $this->_summary[ 'crawler_stats' ][ $this->_summary[ 'curr_crawler' ] ] = array();
$this->_summary['done'] = 'touchedEnd'; // log done status
$this->_summary['last_full_time_cost'] = time() - $this->_summary['this_full_beginning_time'];
}
}
$this->_summary['last_status'] = 'stopped';
$this->_summary['is_running'] = 0;
$this->_summary['end_reason'] = $this->_end_reason;
self::save_summary();
}
/**
* List all crawlers ( tagA => [ valueA => titleA, ... ] ...)
*
* @since 1.9.1
* @access public
*/
public function list_crawlers()
{
if ($this->_crawlers) {
return $this->_crawlers;
}
$crawler_factors = array();
// Add default Guest crawler
$crawler_factors['uid'] = array(0 => __('Guest', 'litespeed-cache'));
// WebP on/off
if (($this->conf(Base::O_GUEST) && $this->conf(Base::O_GUEST_OPTM)) || $this->conf(Base::O_IMG_OPTM_WEBP)) {
$crawler_factors['webp'] = array(1 => 'WebP', 0 => '');
}
// Guest Mode on/off
if ($this->conf(Base::O_GUEST)) {
$vary_name = $this->cls('Vary')->get_vary_name();
$vary_val = 'guest_mode:1';
if (!defined('LSCWP_LOG')) {
$vary_val = md5($this->conf(Base::HASH) . $vary_val);
}
$crawler_factors['cookie:' . $vary_name] = array($vary_val => '', '_null' => '<font data-balloon-pos="up" aria-label="Guest Mode">👒</font>');
}
// Mobile crawler
if ($this->conf(Base::O_CACHE_MOBILE)) {
$crawler_factors['mobile'] = array(1 => '<font data-balloon-pos="up" aria-label="Mobile">📱</font>', 0 => '');
}
// Get roles set
// List all roles
foreach ($this->conf(Base::O_CRAWLER_ROLES) as $v) {
$role_title = '';
$udata = get_userdata($v);
if (isset($udata->roles) && is_array($udata->roles)) {
$tmp = array_values($udata->roles);
$role_title = array_shift($tmp);
}
if (!$role_title) {
continue;
}
$crawler_factors['uid'][$v] = ucfirst($role_title);
}
// Cookie crawler
foreach ($this->conf(Base::O_CRAWLER_COOKIES) as $v) {
if (empty($v['name'])) {
continue;
}
$this_cookie_key = 'cookie:' . $v['name'];
$crawler_factors[$this_cookie_key] = array();
foreach ($v['vals'] as $v2) {
$crawler_factors[$this_cookie_key][$v2] =
$v2 == '_null' ? '' : '<font data-balloon-pos="up" aria-label="Cookie">🍪</font>' . esc_html($v['name']) . '=' . esc_html($v2);
}
}
// Crossing generate the crawler list
$this->_crawlers = $this->_recursive_build_crawler($crawler_factors);
return $this->_crawlers;
}
/**
* Build a crawler list recursively
*
* @since 2.8
* @access private
*/
private function _recursive_build_crawler($crawler_factors, $group = array(), $i = 0)
{
$current_factor = array_keys($crawler_factors);
$current_factor = $current_factor[$i];
$if_touch_end = $i + 1 >= count($crawler_factors);
$final_list = array();
foreach ($crawler_factors[$current_factor] as $k => $v) {
// Don't alter $group bcos of loop usage
$item = $group;
$item['title'] = !empty($group['title']) ? $group['title'] : '';
if ($v) {
if ($item['title']) {
$item['title'] .= ' - ';
}
$item['title'] .= $v;
}
$item[$current_factor] = $k;
if ($if_touch_end) {
$final_list[] = $item;
} else {
// Inception: next layer
$final_list = array_merge($final_list, $this->_recursive_build_crawler($crawler_factors, $item, $i + 1));
}
}
return $final_list;
}
/**
* Return crawler meta file local path
*
* @since 6.1
* @access public
*/
public function json_local_path()
{
if (!file_exists(LITESPEED_STATIC_DIR . '/crawler/' . $this->_sitemeta)) {
return false;
}
return LITESPEED_STATIC_DIR . '/crawler/' . $this->_sitemeta;
}
/**
* Return crawler meta file
*
* @since 1.1.0
* @access public
*/
public function json_path()
{
if (!file_exists(LITESPEED_STATIC_DIR . '/crawler/' . $this->_sitemeta)) {
return false;
}
return LITESPEED_STATIC_URL . '/crawler/' . $this->_sitemeta;
}
/**
* Create reset pos file
*
* @since 1.1.0
* @access public
*/
public function reset_pos()
{
File::save($this->_resetfile, time(), true);
self::save_summary(array('is_running' => 0));
}
/**
* Display status based by matching crawlers order
*
* @since 3.0
* @access public
*/
public function display_status($status_row, $reason_set)
{
if (!$status_row) {
return '';
}
$_status_list = array(
'-' => 'default',
'M' => 'primary',
'H' => 'success',
'B' => 'danger',
'N' => 'warning',
);
$reason_set = explode(',', $reason_set);
$status = '';
foreach (str_split($status_row) as $k => $v) {
$reason = $reason_set[$k];
if ($reason == 'Man') {
$reason = __('Manually added to blocklist', 'litespeed-cache');
}
if ($reason == 'Existed') {
$reason = __('Previously existed in blocklist', 'litespeed-cache');
}
if ($reason) {
$reason = 'data-balloon-pos="up" aria-label="' . $reason . '"';
}
$status .= '<i class="litespeed-dot litespeed-bg-' . $_status_list[$v] . '" ' . $reason . '>' . ($k + 1) . '</i>';
}
return $status;
}
/**
* Output info and exit
*
* @since 1.1.0
* @access protected
* @param string $error Error info
*/
protected function output($msg)
{
if (defined('DOING_CRON')) {
echo $msg;
// exit();
} else {
echo "<script>alert('" . htmlspecialchars($msg) . "');</script>";
// exit;
}
}
/**
* Handle all request actions from main cls
*
* @since 3.0
* @access public
*/
public function handler()
{
$type = Router::verify_type();
switch ($type) {
case self::TYPE_REFRESH_MAP:
$this->cls('Crawler_Map')->gen();
break;
case self::TYPE_EMPTY:
$this->cls('Crawler_Map')->empty_map();
break;
case self::TYPE_BLACKLIST_EMPTY:
$this->cls('Crawler_Map')->blacklist_empty();
break;
case self::TYPE_BLACKLIST_DEL:
if (!empty($_GET['id'])) {
$this->cls('Crawler_Map')->blacklist_del($_GET['id']);
}
break;
case self::TYPE_BLACKLIST_ADD:
if (!empty($_GET['id'])) {
$this->cls('Crawler_Map')->blacklist_add($_GET['id']);
}
break;
// Handle the ajax request to proceed crawler manually by admin
case self::TYPE_START:
self::start_async();
break;
case self::TYPE_RESET:
$this->reset_pos();
break;
default:
break;
}
Admin::redirect();
}
}