1061 lines
30 KiB
PHP
1061 lines
30 KiB
PHP
<?php
|
|
/**
|
|
* The crawler class
|
|
*
|
|
* @since 1.1.0
|
|
*/
|
|
namespace LiteSpeed;
|
|
defined( 'WPINC' ) || exit;
|
|
|
|
class Crawler extends Root {
|
|
// Request types dispatched by handler().
const TYPE_REFRESH_MAP     = 'refresh_map';
const TYPE_EMPTY           = 'empty';
const TYPE_BLACKLIST_EMPTY = 'blacklist_empty';
const TYPE_BLACKLIST_DEL   = 'blacklist_del';
const TYPE_BLACKLIST_ADD   = 'blacklist_add';
const TYPE_START           = 'start';
const TYPE_RESET           = 'reset';

// User agent markers. FAST_USER_AGENT is prefixed onto the UA of crawler-only requests.
const USER_AGENT      = 'lscache_walker';
const FAST_USER_AGENT = 'lscache_runner';

// Number of map rows requested from Crawler_Map::list_map() per batch.
const CHUNKS = 10000;

// Meta file name inside LITESPEED_STATIC_DIR/crawler/ (changed per blog on multisite).
private $_sitemeta = 'meta.data';
// Path of the flag file that makes a running crawl reset its position.
private $_resetfile;
// Why the current run stopped; copied into the summary by _terminate_running().
private $_end_reason;

// Request settings of the crawler being run; load_conf() adds base/run_delay/run_duration/load_limit.
private $_crawler_conf = array(
	'cookies' => array(),
	'headers' => array(),
	'ua'      => '',
);
// Cached result of list_crawlers().
private $_crawlers = array();
// Parallel request count; -1 means "not initialised yet".
private $_cur_threads = -1;
// Unix timestamp after which the current run must stop.
private $_max_run_time;
// Unix timestamp of the last thread adjustment.
private $_cur_thread_time;
// Crawl results not yet flushed to the map, grouped by status letter then map row id.
private $_map_status_list = array(
	'H' => array(),
	'M' => array(),
	'B' => array(),
	'N' => array(),
);
// Crawler summary as returned by get_summary().
protected $_summary;
|
|
|
|
/**
 * Set up the crawler: pick the meta file name, derive the reset file path
 * and load the stored summary.
 *
 * @since 1.1.0
 */
public function __construct() {
	// Each blog of a multisite network gets its own meta file.
	$this->_sitemeta = is_multisite() ? 'meta' . get_current_blog_id() . '.data' : $this->_sitemeta;

	$crawler_dir      = LITESPEED_STATIC_DIR . '/crawler/';
	$this->_resetfile = $crawler_dir . $this->_sitemeta . '.reset';

	$this->_summary = self::get_summary();

	Debug2::debug( '🐞 Init' );
}
|
|
|
|
/**
 * Tell whether a crawler is enabled, i.e. not present in the stored bypass list.
 *
 * @since 4.3
 *
 * @param int $curr Crawler index.
 * @return bool True when the crawler is not bypassed.
 */
public function is_active( $curr ) {
	$is_bypassed = in_array( $curr, self::get_option( 'bypass_list', array() ) );

	return ! $is_bypassed;
}
|
|
|
|
/**
 * Flip a crawler between enabled and bypassed, persist the bypass list
 * and report the new state.
 *
 * @since 4.3
 *
 * @param int $curr Crawler index.
 * @return bool True when the crawler is now active, false when it is now bypassed.
 */
public function toggle_activeness( $curr ) {
	$bypass_list = self::get_option( 'bypass_list', array() );
	$found_at    = array_search( $curr, $bypass_list );

	if ( $found_at !== false ) {
		// Was bypassed: drop it from the list and reindex.
		unset( $bypass_list[ $found_at ] );
		$bypass_list = array_values( $bypass_list );
		$now_active  = true;
	} else {
		// Was active: add it to the list.
		$bypass_list[] = (int) $curr;
		$now_active    = false;
	}

	self::update_option( 'bypass_list', $bypass_list );

	return $now_active;
}
|
|
|
|
/**
 * Empty the bypass list so every crawler counts as active again,
 * then show an admin note and log the change.
 *
 * @since 4.3
 * @access public
 */
public function clear_disabled_list() {
	self::update_option( 'bypass_list', array() );

	Admin_Display::note( __( 'Crawler disabled list is cleared! All crawlers are set to active! ', 'litespeed-cache' ) );

	Debug2::debug( '🐞 All crawlers are set to active...... ' );
}
|
|
|
|
/**
 * Override get_summary so that every expected element has a default value.
 *
 * @since 3.0
 * @access public
 *
 * @param string|false $field Optional single element to return.
 * @return mixed The whole summary when $field is falsy, the element's value
 *               when it exists, otherwise null.
 */
public static function get_summary( $field = false ) {
	$defaults = array(
		'list_size'                   => 0,
		'last_update_time'            => 0,
		'curr_crawler'                => 0,
		'curr_crawler_beginning_time' => 0,
		'last_pos'                    => 0,
		'last_count'                  => 0,
		'last_crawled'                => 0,
		'last_start_time'             => 0,
		'last_status'                 => '',
		'is_running'                  => 0,
		'end_reason'                  => '',
		'meta_save_time'              => 0,
		'pos_reset_check'             => 0,
		'done'                        => 0,
		'this_full_beginning_time'    => 0,
		'last_full_time_cost'         => 0,
		'last_crawler_total_cost'     => 0,
		'crawler_stats'               => array(), // per-crawler hit/miss crawl counters
	);

	// Stored values win over the defaults.
	$summary = array_merge( $defaults, parent::get_summary() );

	if ( ! $field ) {
		return $summary;
	}

	return array_key_exists( $field, $summary ) ? $summary[ $field ] : null;
}
|
|
|
|
/**
 * Override save_summary: stamp the save time, store the summary through the
 * parent and mirror it as JSON into the crawler meta file.
 *
 * @since 3.0
 * @access public
 *
 * @param array|null $data Data to store; the instance summary is used when null.
 */
public static function save_summary( $data = null ) {
	$crawler = self::cls();

	// Stamp first so the default payload below already carries the new time.
	$crawler->_summary['meta_save_time'] = time();

	$payload = $data === null ? $crawler->_summary : $data;

	parent::save_summary( $payload );

	$meta_file = LITESPEED_STATIC_DIR . '/crawler/' . $crawler->_sitemeta;
	File::save( $meta_file, json_encode( $payload ), true );
}
|
|
|
|
/**
 * Entry point for a crawl run.
 *
 * @since 1.1.0
 * @access public
 *
 * @param bool $force True for a manual run; passed on to _crawl_data().
 * @return false|void False when Router::can_crawl() denies crawling.
 */
public static function start( $force = false ) {
	$allowed = Router::can_crawl();
	if ( ! $allowed ) {
		Debug2::debug( '🐞 ......crawler is NOT allowed by the server admin......' );
		return false;
	}

	if ( $force ) {
		Debug2::debug( '🐞 ......crawler manually ran......' );
	}

	$crawler = self::cls();
	$crawler->_crawl_data( $force );
}
|
|
|
|
/**
 * Prepare a crawl run: make sure the map exists, honour the full-crawl
 * interval, pick the crawler to run, then load its config and start the engine.
 *
 * @since 1.1.0
 * @access private
 *
 * @param bool $force When true the crawl interval check is skipped.
 */
private function _crawl_data( $force ) {
	Debug2::debug( '🐞 ......crawler started......' );

	// First run: no summary yet or the crawler tables are missing.
	$needs_map = ! $this->_summary || ! Data::cls()->tb_exist( 'crawler' ) || ! Data::cls()->tb_exist( 'crawler_blacklist' );
	if ( $needs_map ) {
		$this->cls( 'Crawler_Map' )->gen();
	}

	// The previous full crawl finished: wait for the interval, then rebuild the map.
	if ( $this->_summary['done'] === 'touchedEnd' ) {
		$last_finished_at = $this->_summary['last_full_time_cost'] + $this->_summary['this_full_beginning_time'];
		if ( ! $force && time() - $last_finished_at < $this->conf( Base::O_CRAWLER_CRAWL_INTERVAL ) ) {
			// Interval not reached yet.
			Debug2::debug( '🐞 Cron abort: cache warmed already.' );
			return;
		}
		Debug2::debug( '🐞 TouchedEnd. regenerate sitemap....' );
		$this->cls( 'Crawler_Map' )->gen();
	}

	$this->list_crawlers();
	$total_crawlers = count( $this->_crawlers );

	// Step over crawlers that are in the bypass list.
	while ( ! $this->is_active( $this->_summary['curr_crawler'] ) && $this->_summary['curr_crawler'] < $total_crawlers ) {
		Debug2::debug( '🐞 Skipped the Crawler #' . $this->_summary['curr_crawler'] . ' ......' );
		$this->_summary['curr_crawler']++;
	}

	// Ran past the last crawler: close the round.
	if ( $this->_summary['curr_crawler'] >= $total_crawlers ) {
		$this->_end_reason = 'end';
		$this->_terminate_running();
		return;
	}

	// Index is empty or points at a missing crawler: restart from the first one.
	$index = $this->_summary['curr_crawler'];
	if ( empty( $index ) || empty( $this->_crawlers[ $index ] ) ) {
		$this->_summary['curr_crawler']       = 0;
		$this->_summary['crawler_stats'][ 0 ] = array();
	}

	$this->load_conf();

	$this->_engine_start();
}
|
|
|
|
/**
 * Fill $_crawler_conf for the crawler selected in the summary: base URL,
 * cookies, headers, user agent, delay, run duration and load limit.
 *
 * @since 3.0
 * @access private
 */
private function load_conf() {
	$this->_crawler_conf['base'] = home_url();

	$crawler = $this->_crawlers[ $this->_summary['curr_crawler'] ];

	// Role simulation (since 1.9.1): send the vary cookie plus the role id.
	if ( ! empty( $crawler['uid'] ) ) {
		$vary_name = $this->cls( 'Vary' )->get_vary_name();
		$vary_val  = $this->cls( 'Vary' )->finalize_default_vary( $crawler['uid'] );

		$this->_crawler_conf['cookies'][ $vary_name ]      = $vary_val;
		$this->_crawler_conf['cookies']['litespeed_role'] = $crawler['uid'];
	}

	// Cookie crawler (since 2.8): every "cookie:<name>" factor becomes a cookie unless its value is "_null".
	foreach ( $crawler as $factor => $value ) {
		$is_cookie_factor = strpos( $factor, 'cookie:' ) === 0;
		if ( $is_cookie_factor && $value != '_null' ) {
			$this->_crawler_conf['cookies'][ substr( $factor, 7 ) ] = $value;
		}
	}

	// WebP simulation (since 1.9.1).
	if ( ! empty( $crawler['webp'] ) ) {
		$this->_crawler_conf['headers'][] = 'Accept: image/webp,*/*';
	}

	// Mobile crawler (since 2.8).
	if ( ! empty( $crawler['mobile'] ) ) {
		$this->_crawler_conf['ua'] = 'Mobile iPhone';
	}

	// Delay between batches in microseconds (since 1.8.3); a larger server value wins.
	$run_delay = $this->conf( Base::O_CRAWLER_USLEEP );
	if ( ! empty( $_SERVER[ Base::ENV_CRAWLER_USLEEP ] ) && $_SERVER[ Base::ENV_CRAWLER_USLEEP ] > $run_delay ) {
		$run_delay = $_SERVER[ Base::ENV_CRAWLER_USLEEP ];
	}
	$this->_crawler_conf['run_delay'] = $run_delay;

	$this->_crawler_conf['run_duration'] = $this->conf( Base::O_CRAWLER_RUN_DURATION );

	// Load limit: an enforced server value always wins, a plain server value only when it is lower.
	$load_limit = $this->conf( Base::O_CRAWLER_LOAD_LIMIT );
	if ( ! empty( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ] ) ) {
		$load_limit = $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ];
	} elseif ( ! empty( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] ) && $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] < $load_limit ) {
		$load_limit = $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ];
	}
	$this->_crawler_conf['load_limit'] = $load_limit;
}
|
|
|
|
/**
 * Start the crawler engine: refuse when a run is still marked as running or
 * no thread is allowed, work out the time budget, then run and terminate.
 *
 * @since 1.1.0
 * @access private
 */
private function _engine_start() {
	$run_duration = $this->_crawler_conf['run_duration'];

	// A run mark younger than the run duration means a crawl is still in progress.
	$running_since = $this->_summary['is_running'];
	if ( $running_since && time() - $running_since < $run_duration ) {
		$this->_end_reason = 'stopped';
		Debug2::debug( '🐞 The crawler is running.' );
		return;
	}

	// Check the current load.
	$this->_adjust_current_threads();
	if ( $this->_cur_threads == 0 ) {
		$this->_end_reason = 'stopped_highload';
		Debug2::debug( '🐞 Stopped due to heavy load.' );
		return;
	}

	// Log the start time.
	$this->_summary['last_start_time'] = time();
	self::save_summary();

	// Time budget: unlimited PHP time is capped at 300s, otherwise keep a 5s margin.
	$maxTime = (int) ini_get( 'max_execution_time' );
	Debug2::debug( '🐞 ini_get max_execution_time=' . $maxTime );
	$maxTime = $maxTime == 0 ? 300 : $maxTime - 5;

	if ( $maxTime >= $run_duration ) {
		$maxTime = $run_duration;
		Debug2::debug( '🐞 Use run_duration setting as max_execution_time=' . $maxTime );
	} elseif ( ini_set( 'max_execution_time', $run_duration + 15 ) !== false ) {
		// PHP accepted a longer limit, so the full run duration can be used.
		$maxTime = $run_duration;
		Debug2::debug( '🐞 ini_set max_execution_time=' . $maxTime );
	}
	Debug2::debug( '🐞 final max_execution_time=' . $maxTime );
	$this->_max_run_time = $maxTime + time();

	// Mark running, crawl, then record the outcome.
	$this->_prepare_running();
	$this->_do_running();
	$this->_terminate_running();
}
|
|
|
|
/**
 * Adjust the number of parallel requests according to the server load.
 *
 * On the first call ($_cur_threads == -1) the thread count is derived from
 * the gap between the load limit and the current load, capped by the
 * O_CRAWLER_THREADS setting. On later calls it is moved one step up or down.
 * A result of 0 tells the callers to stop crawling.
 *
 * The current load is the 1-minute average from sys_getloadavg(). Earlier the
 * value was read but ignored in favour of a constant 1, so the load limit never
 * reacted to real load. When the reading is unavailable the constant 1 is kept
 * as a fallback so such hosts behave as before.
 *
 * @since 1.1.0
 * @access private
 */
private function _adjust_current_threads() {
	/**
	 * If server is windows, exit
	 * @see https://wordpress.org/support/topic/crawler-keeps-causing-crashes/
	 */
	if ( ! function_exists( 'sys_getloadavg' ) ) {
		Debug2::debug( '🐞 set threads=0 due to func sys_getloadavg not exist!' );
		$this->_cur_threads = 0;
		return;
	}

	$load = sys_getloadavg();
	if ( is_array( $load ) && isset( $load[0] ) ) {
		$curload = $load[0];
	} else {
		// sys_getloadavg() returns false on failure.
		Debug2::debug( '🐞 sys_getloadavg failed, assume load=1' );
		$curload = 1;
	}

	$load_limit  = $this->_crawler_conf['load_limit'];
	$max_threads = $this->conf( Base::O_CRAWLER_THREADS );

	if ( $this->_cur_threads == -1 ) {
		// init
		if ( $curload > $load_limit ) {
			$curthreads = 0;
		} elseif ( $curload >= ( $load_limit - 1 ) ) {
			$curthreads = 1;
		} else {
			$curthreads = intval( $load_limit - $curload );
			if ( $curthreads > $max_threads ) {
				$curthreads = $max_threads;
			}
		}
	} else {
		// adjust
		$curthreads = $this->_cur_threads;
		if ( $curload >= $load_limit + 1 ) {
			// Clearly over the limit: pause, then drop one thread (may reach 0).
			sleep( 5 );
			if ( $curthreads >= 1 ) {
				$curthreads--;
			}
		} elseif ( $curload >= $load_limit ) {
			// At the limit: drop one thread but keep at least one.
			if ( $curthreads > 1 ) {
				$curthreads--;
			}
		} elseif ( ( $curload + 1 ) < $load_limit ) {
			// Comfortable headroom: add one thread up to the configured maximum.
			if ( $curthreads < $max_threads ) {
				$curthreads++;
			}
		}
	}

	$this->_cur_threads     = $curthreads;
	$this->_cur_thread_time = time();
}
|
|
|
|
/**
 * Mark the crawler as running and reset the per-run counters before crawling.
 *
 * @since 1.1.0
 * @access private
 */
private function _prepare_running() {
	$this->_summary['is_running']   = time();
	$this->_summary['done']         = 0; // reset done status
	$this->_summary['last_status']  = 'prepare running';
	$this->_summary['last_crawled'] = 0;

	// Everything below only applies when the current crawler starts from the top of the list.
	if ( $this->_summary['last_pos'] == 0 ) {
		$this->_summary['curr_crawler_beginning_time'] = time();

		// First crawler at position 0: a new full round begins.
		if ( $this->_summary['curr_crawler'] == 0 ) {
			$this->_summary['this_full_beginning_time'] = time();
			$this->_summary['list_size']                = $this->cls( 'Crawler_Map' )->count_map();
		}

		// The previous run ended normally: clear this crawler's counters.
		if ( $this->_summary['end_reason'] == 'end' ) {
			$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = array();
		}
	}

	self::save_summary();
}
|
|
|
|
/**
 * Run the current crawler over the map.
 *
 * Map rows are fetched in batches of self::CHUNKS starting at the stored
 * position, split into groups of $_cur_threads and requested in parallel.
 * After each group the position counters are advanced by the number of rows
 * actually processed. The thread count is not used for this, because the last
 * group of a batch can be smaller and the thread count can change while the
 * groups of a batch were already built; counting threads would skip URLs.
 *
 * The method returns early with $_end_reason set when the crawler is disabled
 * (HTTP 428), the time budget is exceeded, a reset file is found or the load
 * check leaves no thread. Otherwise $_end_reason becomes 'end'.
 *
 * @since 1.1.0
 * @access private
 */
private function _do_running() {
	$options = $this->_get_curl_options( true );

	while ( $urlChunks = $this->cls( 'Crawler_Map' )->list_map( self::CHUNKS, $this->_summary['last_pos'] ) ) {
		// array_chunk() rejects a size below 1.
		$urlChunks = array_chunk( $urlChunks, max( 1, (int) $this->_cur_threads ) );
		foreach ( $urlChunks as $rows ) {
			// multi curl
			$rets = $this->_multi_request( $rows, $options );

			// check result headers
			foreach ( $rows as $row ) {
				if ( empty( $rets[ $row['id'] ] ) ) { // skipped row: no request was sent, so there is no result
					continue;
				}

				// check response
				if ( $rets[ $row['id'] ]['code'] == 428 ) { // HTTP/1.1 428 Precondition Required (need to test)
					$this->_end_reason = 'crawler_disabled';
					Debug2::debug( '🐞 crawler_disabled' );
					return;
				}

				$status = $this->_status_parse( $rets[ $row['id'] ]['header'], $rets[ $row['id'] ]['code'] ); // B or H or M or N(nocache)
				$this->_map_status_list[ $status ][ $row['id'] ] = array(
					'url'  => $row['url'],
					'code' => $rets[ $row['id'] ]['code'], // 201 or 200 or 404
				);
				if ( empty( $this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] ) ) {
					$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] = 0;
				}
				$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ]++;
			}

			// update offset position by the rows really handled in this group
			$_time     = time();
			$processed = count( $rows );
			$this->_summary['last_pos']        += $processed;
			$this->_summary['last_count']       = $processed;
			$this->_summary['last_crawled']    += $processed;
			$this->_summary['last_update_time'] = $_time;
			$this->_summary['last_status']      = 'updated position';

			// check duration
			if ( $this->_summary['last_update_time'] > $this->_max_run_time ) {
				$this->_end_reason = 'stopped_maxtime';
				Debug2::debug( '🐞 Terminated due to maxtime' );
				return;
			}

			// make sure at least each 10s save meta & map status once
			if ( $_time - $this->_summary['meta_save_time'] > 10 ) {
				$this->_map_status_list = $this->cls( 'Crawler_Map' )->save_map_status( $this->_map_status_list, $this->_summary['curr_crawler'] );
				self::save_summary();
			}

			// check if need to reset pos each 5s
			if ( $_time > $this->_summary['pos_reset_check'] ) {
				$this->_summary['pos_reset_check'] = $_time + 5;
				if ( file_exists( $this->_resetfile ) && unlink( $this->_resetfile ) ) {
					Debug2::debug( '🐞 Terminated due to reset file' );

					$this->_summary['last_pos']     = 0;
					$this->_summary['curr_crawler'] = 0;
					$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = array();
					// reset done status
					$this->_summary['done']                     = 0;
					$this->_summary['this_full_beginning_time'] = 0;
					$this->_end_reason                          = 'stopped_reset';
					return;
				}
			}

			// re-check the load once more than 60s passed since the last thread adjustment
			if ( $this->_summary['last_update_time'] - $this->_cur_thread_time > 60 ) {
				$this->_adjust_current_threads();
				if ( $this->_cur_threads == 0 ) {
					$this->_end_reason = 'stopped_highload';
					Debug2::debug( '🐞 Terminated due to highload' );
					return;
				}
			}

			$this->_summary['last_status'] = 'sleeping ' . $this->_crawler_conf['run_delay'] . 'ms';

			usleep( $this->_crawler_conf['run_delay'] );
		}
	}

	// All URLs are done for current crawler
	$this->_end_reason = 'end';
	$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ]['W'] = 0;
	Debug2::debug( '🐞 Crawler #' . $this->_summary['curr_crawler'] . ' touched end' );
}
|
|
|
|
/**
 * Request a group of map rows in parallel through curl_multi.
 *
 * Rows whose status letter for the current crawler is 'B' or 'N' are not
 * requested and have no entry in the result.
 *
 * @since 1.1.0
 * @access private
 *
 * @param array $rows    Map rows; each is read for 'id', 'url' and 'res'.
 * @param array $options cURL options applied to every handle.
 * @return array Map row id => array( 'header' => string, 'code' => int ).
 */
private function _multi_request( $rows, $options ) {
	$crawler_idx = $this->_summary['curr_crawler'];

	$mh      = curl_multi_init();
	$handles = array();
	foreach ( $rows as $row ) {
		// Status letter this crawler stored for the row earlier.
		$flag = substr( $row['res'], $crawler_idx, 1 );
		if ( $flag == 'B' || $flag == 'N' ) {
			continue;
		}

		$row_id             = $row['id'];
		$handles[ $row_id ] = curl_init();

		// Prefix the base URL when the map stores URLs without the domain.
		$url = $this->conf( Base::O_CRAWLER_DROP_DOMAIN ) ? $this->_crawler_conf['base'] . $row['url'] : $row['url'];
		curl_setopt( $handles[ $row_id ], CURLOPT_URL, $url );
		Debug2::debug( '🐞 Crawling [url] ' . $url . ( $url == $row['url'] ? '' : ' [ori] ' . $row['url'] ) );

		curl_setopt_array( $handles[ $row_id ], $options );

		curl_multi_add_handle( $mh, $handles[ $row_id ] );
	}

	// Drive the transfers until none is still running.
	if ( $handles ) {
		$still_running = null;
		do {
			curl_multi_exec( $mh, $still_running );
			if ( curl_multi_select( $mh ) == -1 ) {
				usleep( 1 );
			}
		} while ( $still_running > 0 );
	}

	// Collect header and status code of every requested row.
	$results = array();
	foreach ( $rows as $row ) {
		$flag = substr( $row['res'], $crawler_idx, 1 );
		if ( $flag == 'B' || $flag == 'N' ) {
			continue;
		}

		$ch = $handles[ $row['id'] ];

		// The response starts with the header block because the header option is on.
		$header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
		$response    = curl_multi_getcontent( $ch );

		$results[ $row['id'] ] = array(
			'header' => substr( $response, 0, $header_size ),
			'code'   => curl_getinfo( $ch, CURLINFO_HTTP_CODE ),
		);

		curl_multi_remove_handle( $mh, $ch );
		curl_close( $ch );
	}
	curl_multi_close( $mh );

	return $results;
}
|
|
|
|
/**
 * Turn a response header block and status code into a status letter.
 *
 * @since 2.0
 * @access private
 *
 * @param string $header Raw response headers.
 * @param int    $code   HTTP status code.
 * @return string 'H' (hit), 'M' (miss), 'N' (no-cache) or 'B' (blacklist).
 */
private function _status_parse( $header, $code ) {
	if ( $code == 201 ) {
		return 'H';
	}

	if ( stripos( $header, 'X-Litespeed-Cache-Control: no-cache' ) !== false ) {
		return 'N'; // No-cache
	}

	// The first cache header found decides between miss and hit.
	foreach ( array( 'x-litespeed-cache', 'x-lsadc-cache', 'x-qc-cache' ) as $name ) {
		if ( stripos( $header, $name ) === false ) {
			continue;
		}

		return stripos( $header, $name . ': miss' ) !== false ? 'M' : 'H';
	}

	return 'B'; // Blacklist
}
|
|
|
|
/**
 * Build the cURL option array from $_crawler_conf.
 *
 * @since 1.1.0
 * @access private
 *
 * @param bool $crawler_only True for crawler requests: the user agent gets the
 *                           FAST_USER_AGENT prefix and the litespeed_hash cookie is added.
 * @return array cURL options keyed by CURLOPT_* constants.
 */
private function _get_curl_options( $crawler_only = false ) {
	$headers   = $this->_crawler_conf['headers'];
	$headers[] = 'Cache-Control: max-age=0';

	$options = array(
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_HEADER         => true,
		CURLOPT_CUSTOMREQUEST  => 'GET',
		CURLOPT_FOLLOWLOCATION => false,
		CURLOPT_ENCODING       => 'gzip',
		CURLOPT_CONNECTTIMEOUT => 10,
		CURLOPT_TIMEOUT        => $this->conf( Base::O_CRAWLER_TIMEOUT ), // Larger timeout to avoid incorrect blacklist addition #900171
		CURLOPT_SSL_VERIFYHOST => 0,
		CURLOPT_SSL_VERIFYPEER => false,
		CURLOPT_NOBODY         => false,
		CURLOPT_HTTPHEADER     => $headers,
		/**
		 * HTTP/2 was tried in 1.9.1 and dropped in 2.2.7 because it caused a no-cache issue;
		 * 2.9.1+ fixed the wrong usage of the CURL_HTTP_VERSION_1_1 const.
		 */
		CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_1_1,
	);

	// IP resolve: pin the site host to the configured server IP.
	if ( $this->conf( Base::O_SERVER_IP ) ) {
		Utility::compatibility();

		$wants_resolve = $this->conf( Base::O_CRAWLER_DROP_DOMAIN ) || ! $crawler_only;
		if ( $wants_resolve && $this->_crawler_conf['base'] ) {
			$parsed_url = parse_url( $this->_crawler_conf['base'] );

			if ( ! empty( $parsed_url['host'] ) ) {
				$port  = $parsed_url['scheme'] == 'https' ? '443' : '80';
				$entry = $parsed_url['host'] . ':' . $port . ':' . $this->conf( Base::O_SERVER_IP );

				$options[ CURLOPT_RESOLVE ]              = array( $entry );
				$options[ CURLOPT_DNS_USE_GLOBAL_CACHE ] = false;
			}
		}
	}

	// Referer
	if ( isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
		$options[ CURLOPT_REFERER ] = 'http://' . $_SERVER['HTTP_HOST'] . $_SERVER['REQUEST_URI'];
	}

	// User Agent: crawler requests carry the FAST_USER_AGENT prefix exactly once.
	if ( $crawler_only && strpos( $this->_crawler_conf['ua'], Crawler::FAST_USER_AGENT ) !== 0 ) {
		$this->_crawler_conf['ua'] = Crawler::FAST_USER_AGENT . ' ' . $this->_crawler_conf['ua'];
	}
	$options[ CURLOPT_USERAGENT ] = $this->_crawler_conf['ua'];

	// Append hash to cookie for validation (since 1.9.1).
	if ( $crawler_only ) {
		$this->_crawler_conf['cookies']['litespeed_hash'] = Router::get_hash();
	}

	// Cookies: empty values are left out.
	$pairs = array();
	foreach ( $this->_crawler_conf['cookies'] as $name => $value ) {
		if ( $value ) {
			$pairs[] = $name . '=' . urlencode( $value );
		}
	}
	if ( $pairs ) {
		$options[ CURLOPT_COOKIE ] = implode( '; ', $pairs );
	}

	return $options;
}
|
|
|
|
/**
 * Fetch a URL of this site with cURL and return the response body.
 *
 * Redirects are followed and response headers are left out of the result.
 * The given settings are written into $_crawler_conf before the options are built.
 *
 * @since 3.3
 *
 * @param string       $url    URL to fetch.
 * @param string       $ua     User agent to send.
 * @param int|false    $uid    When set, sent as the litespeed_role cookie together with the hash cookie.
 * @param string|false $accept When set, sent as the Accept request header.
 * @return string|bool Result of curl_exec().
 */
public function self_curl( $url, $ua, $uid = false, $accept = false ) {
	$this->_crawler_conf['base'] = home_url();
	$this->_crawler_conf['ua']   = $ua;

	if ( $accept ) {
		$this->_crawler_conf['headers'] = array( 'Accept: ' . $accept );
	}

	if ( $uid ) {
		$this->_crawler_conf['cookies']['litespeed_role'] = $uid;
		$this->_crawler_conf['cookies']['litespeed_hash'] = Router::get_hash();
	}

	$options                           = $this->_get_curl_options();
	$options[ CURLOPT_HEADER ]         = false;
	$options[ CURLOPT_FOLLOWLOCATION ] = true;

	$ch = curl_init();
	curl_setopt_array( $ch, $options );
	curl_setopt( $ch, CURLOPT_URL, $url );
	$body = curl_exec( $ch );
	curl_close( $ch );

	return $body;
}
|
|
|
|
/**
 * Finish a run: flush pending map statuses, move on to the next crawler when
 * the current one reached the end, then store the stopped state.
 *
 * @since 1.1.0
 * @access private
 */
private function _terminate_running() {
	$this->_map_status_list = $this->cls( 'Crawler_Map' )->save_map_status( $this->_map_status_list, $this->_summary['curr_crawler'] );

	$crawler_finished = $this->_end_reason == 'end';
	if ( $crawler_finished ) {
		// Jump to the next crawler and start it from the top of the list.
		$this->_summary['curr_crawler']++;
		$this->_summary['last_pos']                = 0;
		$this->_summary['last_crawler_total_cost'] = time() - $this->_summary['curr_crawler_beginning_time'];

		// Past the last crawler: the whole round is done.
		if ( $this->_summary['curr_crawler'] >= count( $this->list_crawlers() ) ) {
			Debug2::debug( '🐞 _terminate_running Touched end, whole crawled. Reload crawler!' );
			$this->_summary['curr_crawler']        = 0;
			$this->_summary['done']                = 'touchedEnd'; // log done status
			$this->_summary['last_full_time_cost'] = time() - $this->_summary['this_full_beginning_time'];
		}
	}

	$this->_summary['last_status'] = 'stopped';
	$this->_summary['is_running']  = 0;
	$this->_summary['end_reason']  = $this->_end_reason;
	self::save_summary();
}
|
|
|
|
/**
 * List all crawlers, built as every combination of the enabled factors
 * ( tagA => [ valueA => titleA, ... ] ... ). The result is cached on the instance.
 *
 * @since 1.9.1
 * @access public
 *
 * @return array List of crawlers as produced by _recursive_build_crawler().
 */
public function list_crawlers() {
	if ( $this->_crawlers ) {
		return $this->_crawlers;
	}

	// The Guest crawler always exists.
	$factors = array(
		'uid' => array( 0 => __( 'Guest', 'litespeed-cache' ) ),
	);

	// WebP on/off
	if ( $this->conf( Base::O_IMG_OPTM_WEBP_REPLACE ) ) {
		$factors['webp'] = array( 1 => 'WebP', 0 => '' );
	}

	// Guest Mode on/off
	if ( $this->conf( Base::O_GUEST ) ) {
		$vary_name = $this->cls( 'Vary' )->get_vary_name();
		$vary_val  = 'guest_mode:1';
		// The value is hashed unless LSCWP_LOG is defined.
		if ( ! defined( 'LSCWP_LOG' ) ) {
			$vary_val = md5( $this->conf( Base::HASH ) . $vary_val );
		}
		$factors[ 'cookie:' . $vary_name ] = array( $vary_val => '', '_null' => '<font data-balloon-pos="up" aria-label="Guest Mode">👒</font>' );
	}

	// Mobile crawler
	if ( $this->conf( Base::O_CACHE_MOBILE ) ) {
		$factors['mobile'] = array( 1 => '<font data-balloon-pos="up" aria-label="Mobile">📱</font>', 0 => '' );
	}

	// Role simulation: one uid entry per configured user, titled by the user's first role.
	foreach ( $this->conf( Base::O_CRAWLER_ROLES ) as $user_id ) {
		$udata = get_userdata( $user_id );
		if ( ! isset( $udata->roles ) || ! is_array( $udata->roles ) ) {
			continue;
		}

		$roles      = $udata->roles;
		$role_title = reset( $roles );
		if ( ! $role_title ) {
			continue;
		}

		$factors['uid'][ $user_id ] = ucfirst( $role_title );
	}

	// Cookie crawler: one factor per named cookie, one entry per value.
	foreach ( $this->conf( Base::O_CRAWLER_COOKIES ) as $cookie ) {
		if ( empty( $cookie['name'] ) ) {
			continue;
		}

		$factor_key             = 'cookie:' . $cookie['name'];
		$factors[ $factor_key ] = array();

		foreach ( $cookie['vals'] as $val ) {
			$factors[ $factor_key ][ $val ] = $val == '_null' ? '' : '<font data-balloon-pos="up" aria-label="Cookie">🍪</font>' . esc_html( $cookie['name'] ) . '=' . esc_html( $val );
		}
	}

	// Cross all factors into the crawler list.
	$this->_crawlers = $this->_recursive_build_crawler( $factors );

	return $this->_crawlers;
}
|
|
|
|
/**
 * Build the crawler list recursively as the cross product of all factors.
 *
 * @since 2.8
 * @access private
 *
 * @param array $crawler_factors Factor name => array( value => title ).
 * @param array $group           Crawler built so far by the outer levels.
 * @param int   $i               Position of the factor handled at this level.
 * @return array List of crawlers; each holds 'title' plus one value per factor.
 */
private function _recursive_build_crawler( $crawler_factors, $group = array(), $i = 0 ) {
	$factor_names = array_keys( $crawler_factors );
	$factor       = $factor_names[ $i ];
	$is_last      = ( $i + 1 ) >= count( $crawler_factors );

	$crawlers = array();

	foreach ( $crawler_factors[ $factor ] as $value => $label ) {
		// Work on a copy so every value starts from the same parent group.
		$crawler          = $group;
		$crawler['title'] = empty( $group['title'] ) ? '' : $group['title'];
		if ( $label ) {
			$separator         = $crawler['title'] ? ' - ' : '';
			$crawler['title'] .= $separator . $label;
		}
		$crawler[ $factor ] = $value;

		if ( $is_last ) {
			$crawlers[] = $crawler;
			continue;
		}

		// Next layer: append everything the remaining factors produce.
		foreach ( $this->_recursive_build_crawler( $crawler_factors, $crawler, $i + 1 ) as $child ) {
			$crawlers[] = $child;
		}
	}

	return $crawlers;
}
|
|
|
|
/**
 * Return the URL of the crawler meta file.
 *
 * @since 1.1.0
 * @access public
 *
 * @return string|false The URL, or false when the file does not exist.
 */
public function json_path() {
	$relative = '/crawler/' . $this->_sitemeta;

	return file_exists( LITESPEED_STATIC_DIR . $relative ) ? LITESPEED_STATIC_URL . $relative : false;
}
|
|
|
|
|
|
/**
 * Create the reset file and clear the running mark in the summary.
 *
 * @since 1.1.0
 * @access public
 */
public function reset_pos() {
	$created_at = time();
	File::save( $this->_resetfile, $created_at, true );

	$this->_summary['is_running'] = 0;
	self::save_summary();
}
|
|
|
|
/**
 * Render the per-crawler status of one map row as a series of coloured dots.
 *
 * Each character of $status_row is the status letter of the crawler at that
 * position; the entry at the same position of $reason_set, when present,
 * becomes the dot's tooltip.
 *
 * A missing reason or an unknown status letter no longer raises an undefined
 * index notice: the reason falls back to none and the colour to the default.
 * The reason is escaped before it is placed into the HTML attribute.
 *
 * @since 3.0
 * @access public
 *
 * @param string $status_row Status letters, one per crawler.
 * @param string $reason_set Comma separated reasons, one per crawler.
 * @return string HTML markup, or '' when $status_row is empty.
 */
public function display_status( $status_row, $reason_set ) {
	if ( ! $status_row ) {
		return '';
	}

	$_status_list = array(
		'-' => 'default',
		'M' => 'primary',
		'H' => 'success',
		'B' => 'danger',
		'N' => 'warning',
	);

	$reasons = explode( ',', (string) $reason_set );

	$status = '';
	foreach ( str_split( $status_row ) as $k => $v ) {
		$reason = isset( $reasons[ $k ] ) ? $reasons[ $k ] : '';
		if ( $reason == 'Man' ) {
			$reason = __( 'Manually added to blocklist', 'litespeed-cache' );
		}
		if ( $reason == 'Existed' ) {
			$reason = __( 'Previously existed in blocklist', 'litespeed-cache' );
		}

		// Only a non-empty reason produces a tooltip attribute.
		$tooltip = $reason ? 'data-balloon-pos="up" aria-label="' . esc_attr( $reason ) . '"' : '';

		$color = isset( $_status_list[ $v ] ) ? $_status_list[ $v ] : $_status_list['-'];

		$status .= '<i class="litespeed-dot litespeed-bg-' . $color . '" ' . $tooltip . '>' . ( $k + 1 ) . '</i>';
	}

	return $status;
}
|
|
|
|
/**
 * Output a message: as plain text when DOING_CRON is defined, otherwise as a
 * JavaScript alert.
 *
 * The alert text is emitted as a JSON-encoded string literal with the HEX
 * flags, so quotes, backslashes, line breaks and "</script>" inside the
 * message cannot break out of the script. HTML escaping alone does not
 * protect a JavaScript string.
 *
 * @since 1.1.0
 * @access protected
 *
 * @param string $msg Message to output.
 */
protected function output( $msg ) {
	if ( defined( 'DOING_CRON' ) ) {
		echo $msg;
		return;
	}

	$literal = json_encode( (string) $msg, JSON_HEX_TAG | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_HEX_AMP );
	if ( $literal === false ) {
		// The message could not be encoded (e.g. invalid UTF-8): alert an empty string.
		$literal = '""';
	}

	echo '<script>alert(' . $literal . ');</script>';
}
|
|
|
|
/**
 * Handle all request actions from main cls, then redirect.
 *
 * The action comes from Router::verify_type(); unknown types do nothing.
 *
 * @since 3.0
 * @access public
 */
public function handler() {
	$type = Router::verify_type();

	if ( $type == self::TYPE_REFRESH_MAP ) {
		$this->cls( 'Crawler_Map' )->gen();
	} elseif ( $type == self::TYPE_EMPTY ) {
		$this->cls( 'Crawler_Map' )->empty_map();
	} elseif ( $type == self::TYPE_BLACKLIST_EMPTY ) {
		$this->cls( 'Crawler_Map' )->blacklist_empty();
	} elseif ( $type == self::TYPE_BLACKLIST_DEL ) {
		if ( ! empty( $_GET['id'] ) ) {
			$this->cls( 'Crawler_Map' )->blacklist_del( $_GET['id'] );
		}
	} elseif ( $type == self::TYPE_BLACKLIST_ADD ) {
		if ( ! empty( $_GET['id'] ) ) {
			$this->cls( 'Crawler_Map' )->blacklist_add( $_GET['id'] );
		}
	} elseif ( $type == self::TYPE_START ) {
		// Ajax request from the admin to run the crawler manually.
		self::start( true );
	} elseif ( $type == self::TYPE_RESET ) {
		$this->reset_pos();
	}

	Admin::redirect();
}
|
|
|
|
}
|