_home_url = get_home_url();
		$this->__data = Data::cls();
		$this->_tb = $this->__data->tb( 'crawler' );
		$this->_tb_blacklist = $this->__data->tb( 'crawler_blacklist' );
		$this->_conf_map_timeout = $this->conf( Base::O_CRAWLER_MAP_TIMEOUT );
	}

	/**
	 * Save URLs crawl status into DB
	 *
	 * For each status bit in `$list`, writes the bit into the current crawler's
	 * character position of the `res` column and the HTTP code into the matching
	 * slot of the comma-separated `reason` column. Bits `B` and `N` are also
	 * mirrored into the blacklist table (existing rows updated, missing rows inserted).
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param array $list         Status bit => [ id => [ 'url' => ..., 'code' => ... ], ... ].
	 * @param int   $curr_crawler Zero-based position of the current crawler.
	 * @return array The given list with every processed (non-empty) bit reset to an empty array.
	 */
	public function save_map_status( $list, $curr_crawler ) {
		global $wpdb;
		Utility::compatibility();

		$total_crawler = count( Crawler::cls()->list_crawlers() );
		$total_crawler_pos = $total_crawler - 1;

		// Replace current crawler's position
		$curr_crawler = (int) $curr_crawler;

		// Number of crawler slots to the right of the current one (loop-invariant)
		$right_pos = $total_crawler_pos - $curr_crawler;

		foreach ( $list as $bit => $ids ) { // $ids = [ id => [ url, code ], ... ]
			if ( ! $ids ) {
				continue;
			}

			Debug2::debug( "🐞🗺️ Update map [crawler] $curr_crawler [bit] $bit [count] " . count( $ids ) );

			// Strict check on the stringified key: a loose `==` would let an integer key 0 match 'B' on PHP < 8
			$is_blacklist_bit = in_array( (string) $bit, array( 'B', 'N' ), true );

			// Update res first, then reason
			$sql_res = "CONCAT( LEFT( res, $curr_crawler ), '$bit', RIGHT( res, $right_pos ) )";
			$id_all = implode( ',', array_map( 'intval', array_keys( $ids ) ) );
			$wpdb->query( "UPDATE `$this->_tb` SET res = $sql_res WHERE id IN ( $id_all )" );

			// Add blacklist
			if ( $is_blacklist_bit ) {
				$q = "SELECT a.id, a.url FROM `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url=a.url WHERE b.id IN ( $id_all )";
				// Normalise to an array so the count()/array_column() calls below never receive null
				$existing = (array) $wpdb->get_results( $q, ARRAY_A );

				// Update current crawler status tag in existing blacklist
				if ( $existing ) {
					$existing_ids = implode( ',', array_map( 'intval', array_column( $existing, 'id' ) ) );
					$count = $wpdb->query( "UPDATE `$this->_tb_blacklist` SET res = $sql_res WHERE id IN ( " . $existing_ids . " )" );
					Debug2::debug( '🐞🗺️ Update blacklist [count] ' . $count );
				}

				// Append new blacklist
				if ( count( $ids ) > count( $existing ) ) {
					$new_urls = array_diff( array_column( $ids, 'url' ), array_column( $existing, 'url' ) );

					Debug2::debug( '🐞🗺️ Insert into blacklist [count] ' . count( $new_urls ) );

					// An empty diff would produce `INSERT ... VALUES ` with no tuples, which is invalid SQL
					if ( $new_urls ) {
						$q = "INSERT INTO `$this->_tb_blacklist` ( url, res, reason ) VALUES " . implode( ',', array_fill( 0, count( $new_urls ), '( %s, %s, %s )' ) );

						// `res` for a new row: '-' for every crawler except the current one
						$res = array_fill( 0, $total_crawler, '-' );
						$res[ $curr_crawler ] = $bit;
						$res = implode( '', $res );

						// Pre-populate default reason value first, update later
						$default_reason = $total_crawler > 1 ? str_repeat( ',', $total_crawler - 1 ) : '';

						$data = array();
						foreach ( $new_urls as $url ) {
							$data[] = $url;
							$data[] = $res;
							$data[] = $default_reason;
						}
						$wpdb->query( $wpdb->prepare( $q, $data ) );
					}
				}
			}

			// Update sitemap reason w/ HTTP code: group row ids by code so each code needs one query
			$reason_array = array();
			foreach ( $ids as $id => $v2 ) {
				$code = (int) $v2[ 'code' ];
				if ( empty( $reason_array[ $code ] ) ) {
					$reason_array[ $code ] = array();
				}
				$reason_array[ $code ][] = (int) $id;
			}

			foreach ( $reason_array as $code => $code_ids ) {
				// Complement comma
				if ( $curr_crawler ) {
					$code = ',' . $code;
				}
				if ( $curr_crawler < $total_crawler_pos ) {
					$code .= ',';
				}

				$id_list = implode( ',', $code_ids );

				$count = $wpdb->query( "UPDATE `$this->_tb` SET reason = CONCAT( SUBSTRING_INDEX( reason, ',', $curr_crawler ), '$code', SUBSTRING_INDEX( reason, ',', -$right_pos ) ) WHERE id IN (" . $id_list . ")" );

				Debug2::debug( "🐞🗺️ Update map reason [code] $code [pos] left $curr_crawler right -$right_pos [count] $count" );

				// Update blacklist reason
				if ( $is_blacklist_bit ) {
					$count = $wpdb->query( "UPDATE `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url = a.url SET a.reason = CONCAT( SUBSTRING_INDEX( a.reason, ',', $curr_crawler ), '$code', SUBSTRING_INDEX( a.reason, ',', -$right_pos ) ) WHERE b.id IN (" . $id_list . ")" );

					Debug2::debug( "🐞🗺️ Update blacklist [code] $code [pos] left $curr_crawler right -$right_pos [count] $count" );
				}
			}

			// Reset list
			$list[ $bit ] = array();
		}

		return $list;
	}

	/**
	 * Add one record to blacklist
	 * NOTE: $id is sitemap table ID
	 *
	 * Marks the sitemap row as blacklisted for every crawler (`res` is all `B`,
	 * every `reason` slot is `Man`) and writes the same values to the blacklist
	 * table, updating the row for that URL if one exists or inserting it otherwise.
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int $id Sitemap table row ID.
	 */
	public function blacklist_add( $id ) {
		global $wpdb;

		$id = (int) $id;

		// Build res&reason
		$total_crawler = count( Crawler::cls()->list_crawlers() );
		$res = str_repeat( 'B', $total_crawler );
		$reason = implode( ',', array_fill( 0, $total_crawler, 'Man' ) );

		// `id` in the result is the blacklist row id (null when the URL is not blacklisted yet)
		$row = $wpdb->get_row( "SELECT a.url, b.id FROM `$this->_tb` a LEFT JOIN `$this->_tb_blacklist` b ON b.url = a.url WHERE a.id = '$id'", ARRAY_A );

		if ( ! $row ) {
			Debug2::debug( '🐞🗺️ blacklist failed to add [id] ' . $id );
			return;
		}

		Debug2::debug( '🐞🗺️ Add to blacklist [url] ' . $row[ 'url' ] );

		$q = "UPDATE `$this->_tb` SET res = %s, reason = %s WHERE id = %d";
		$wpdb->query( $wpdb->prepare( $q, array( $res, $reason, $id ) ) );

		if ( $row[ 'id' ] ) {
			$q = "UPDATE `$this->_tb_blacklist` SET res = %s, reason = %s WHERE id = %d";
			$wpdb->query( $wpdb->prepare( $q, array( $res, $reason, $row[ 'id' ] ) ) );
		}
		else {
			$q = "INSERT INTO `$this->_tb_blacklist` (url, res, reason) VALUES (%s, %s, %s)";
			$wpdb->query( $wpdb->prepare( $q, array( $row[ 'url' ], $res, $reason ) ) );
		}
	}

	/**
	 * Delete one record from blacklist
	 *
	 * Also clears every `N`/`B` flag (back to `-`) on the sitemap row that has
	 * the same URL. Does nothing when the blacklist table does not exist.
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int $id Blacklist table row ID.
	 */
	public function blacklist_del( $id ) {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
			return;
		}

		$id = (int) $id;
		Debug2::debug( '🐞🗺️ blacklist delete [id] ' . $id );

		// Reset the sitemap flags first: the sub-select needs the blacklist row to still exist
		$wpdb->query( "UPDATE `$this->_tb` SET res = REPLACE( REPLACE( res, 'N', '-' ), 'B', '-' ) WHERE url = ( SELECT url FROM `$this->_tb_blacklist` WHERE id = '$id' )" );

		$wpdb->query( "DELETE FROM `$this->_tb_blacklist` WHERE id = '$id'" );
	}

	/**
	 * Empty blacklist
	 *
	 * Clears every `N`/`B` flag on all sitemap rows and truncates the blacklist
	 * table. Does nothing when the blacklist table does not exist.
	 *
	 * @since 3.0
	 * @access public
	 */
	public function blacklist_empty() {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
			return;
		}

		Debug2::debug( '🐞🗺️ Truncate blacklist' );

		$wpdb->query( "UPDATE `$this->_tb` SET res = REPLACE( REPLACE( res, 'N', '-' ), 'B', '-' )" );

		$wpdb->query( "TRUNCATE `$this->_tb_blacklist`" );
	}

	/**
	 * List blacklist
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int|false $limit  Max number of rows; false returns all rows.
	 * @param int|false $offset Row offset; when false and a limit is given, it is derived via Utility::pagination().
	 * @return array Blacklist rows as associative arrays, newest id first; empty array when the table does not exist.
	 */
	public function list_blacklist( $limit = false, $offset = false ) {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
			return array();
		}

		$q = "SELECT * FROM `$this->_tb_blacklist` ORDER BY id DESC";

		if ( $limit !== false ) {
			if ( $offset === false ) {
				$total = $this->count_blacklist();
				$offset = Utility::pagination( $total, $limit, true );
			}
			$q .= " LIMIT %d, %d";
			$q = $wpdb->prepare( $q, $offset, $limit );
		}

		return $wpdb->get_results( $q, ARRAY_A );
	}

	/**
	 * Count blacklist
	 *
	 * @return string|null|false Row count as returned by $wpdb->get_var(), or false when the table does not exist.
	 */
	public function count_blacklist() {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
			return false;
		}

		$q = "SELECT COUNT(*) FROM `$this->_tb_blacklist`";

		return $wpdb->get_var( $q );
	}

	/**
	 * Empty sitemap
	 *
	 * Calls Data::tb_del() for the `crawler` table and shows an admin success notice.
	 *
	 * @since 3.0
	 * @access public
	 */
	public function empty_map() {
		Data::cls()->tb_del( 'crawler' );

		$msg = __( 'Sitemap cleaned successfully', 'litespeed-cache' );
		Admin_Display::succeed( $msg );
	}

	/**
	 * List generated sitemap
	 *
	 * @since 3.0
	 * @access public
	 *
	 * @param int       $limit  Max number of rows.
	 * @param int|false $offset Row offset; when false it is derived via Utility::pagination().
	 * @return array Sitemap rows as associative arrays ordered by id; empty array when the table does not exist.
	 */
	public function list_map( $limit, $offset = false ) {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler' ) ) {
			return array();
		}

		if ( $offset === false ) {
			$total = $this->count_map();
			$offset = Utility::pagination( $total, $limit, true );
		}

		$q = "SELECT * FROM `$this->_tb` ORDER BY id LIMIT %d, %d";

		return $wpdb->get_results( $wpdb->prepare( $q, $offset, $limit ), ARRAY_A );
	}

	/**
	 * Count sitemap
	 *
	 * @return string|null|false Row count as returned by $wpdb->get_var(), or false when the table does not exist.
	 */
	public function count_map() {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler' ) ) {
			return false;
		}

		$q = "SELECT COUNT(*) FROM `$this->_tb`";

		return $wpdb->get_var( $q );
	}

	/**
	 * Generate sitemap
	 *
	 * Runs the generation and reports the outcome through an admin notice.
	 *
	 * @since 1.1.0
	 * @access public
	 */
	public function gen() {
		$count = $this->_gen();

		if ( ! $count ) {
			Admin_Display::error( __( 'No valid sitemap parsed for crawler.', 'litespeed-cache' ) );
			return;
		}

		$msg = sprintf( __( 'Sitemap created successfully: %d items', 'litespeed-cache' ), $count );
		Admin_Display::succeed( $msg );
	}

	/**
	 * Generate the sitemap
	 *
	 * Parses the configured custom sitemap, optionally strips the home URL prefix,
	 * removes fully blacklisted URLs, then truncates and refills the sitemap table.
	 * Partially blacklisted URLs keep their `res` and get `Existed` in every
	 * non-empty `reason` slot.
	 *
	 * @since 1.1.0
	 * @access private
	 *
	 * @return int|false Number of URLs stored, or false when no custom sitemap is configured.
	 */
	private function _gen() {
		global $wpdb;

		if ( ! $this->__data->tb_exist( 'crawler' ) ) {
			$this->__data->tb_create( 'crawler' );
		}

		if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
			$this->__data->tb_create( 'crawler_blacklist' );
		}

		// use custom sitemap
		$sitemap = $this->conf( Base::O_CRAWLER_SITEMAP );
		if ( ! $sitemap ) {
			return false;
		}

		$offset = strlen( $this->_home_url );

		try {
			$this->_parse( $sitemap );
		} catch( \Exception $e ) {
			// Best effort: whatever was collected before the failure is still used below
			Debug2::debug( '🐞🗺️ ❌ failed to parse custom sitemap: ' . $e->getMessage() );
		}

		// Normalise once so array_diff()/foreach/count() below always receive an array
		if ( ! is_array( $this->_urls ) ) {
			$this->_urls = array();
		}

		if ( $this->_urls ) {
			if ( $this->conf( Base::O_CRAWLER_DROP_DOMAIN ) ) {
				foreach ( $this->_urls as $k => $v ) {
					// Drop URLs that do not start with the home URL, store the rest without that prefix
					if ( stripos( $v, $this->_home_url ) !== 0 ) {
						unset( $this->_urls[ $k ] );
						continue;
					}
					$this->_urls[ $k ] = substr( $v, $offset );
				}
			}

			$this->_urls = array_unique( $this->_urls );
		}

		Debug2::debug( '🐞🗺️ Truncate sitemap' );
		$wpdb->query( "TRUNCATE `$this->_tb`" );

		Debug2::debug( '🐞🗺️ Generate sitemap' );

		// Filter URLs in blacklist
		$blacklist = $this->list_blacklist();

		$full_blacklisted = array();
		$partial_blacklisted = array();

		foreach ( $blacklist as $v ) {
			if ( strpos( $v[ 'res' ], '-' ) === false ) { // Full blacklisted
				$full_blacklisted[] = $v[ 'url' ];
				continue;
			}

			// Replace existing reason
			$reasons = array_map(
				function( $element ) {
					return $element ? 'Existed' : '';
				},
				explode( ',', $v[ 'reason' ] )
			);

			$partial_blacklisted[ $v[ 'url' ] ] = array(
				'res'    => $v[ 'res' ],
				'reason' => implode( ',', $reasons ),
			);
		}

		// Drop all blacklisted URLs
		$this->_urls = array_diff( $this->_urls, $full_blacklisted );

		// Default res & reason
		$crawler_count = count( Crawler::cls()->list_crawlers() );
		$default_res = str_repeat( '-', $crawler_count );
		$default_reason = $crawler_count > 1 ? str_repeat( ',', $crawler_count - 1 ) : '';

		// Flat list of ( url, res, reason ) triples
		$data = array();
		foreach ( $this->_urls as $url ) {
			$known = isset( $partial_blacklisted[ $url ] );

			$data[] = $url;
			$data[] = $known ? $partial_blacklisted[ $url ][ 'res' ] : $default_res;
			$data[] = $known ? $partial_blacklisted[ $url ][ 'reason' ] : $default_reason;
		}

		// 300 values = 100 rows per INSERT
		foreach ( array_chunk( $data, 300 ) as $data2 ) {
			$this->_save( $data2 );
		}

		// Reset crawler
		Crawler::cls()->reset_pos();

		return count( $this->_urls );
	}

	/**
	 * Save data to table
	 *
	 * @since 3.0
	 * @access private
	 *
	 * @param array  $data   Flat list of values, one value per field per row.
	 * @param string $fields Comma-separated column list the values map onto.
	 */
	private function _save( $data, $fields = 'url,res,reason' ) {
		global $wpdb;

		if ( empty( $data ) ) {
			return;
		}

		$q = "INSERT INTO `$this->_tb` ( $fields ) VALUES ";

		// Add placeholder
		$q .= Utility::chunk_placeholder( $data, $fields );

		// Store data
		$wpdb->query( $wpdb->prepare( $q, $data ) );
	}

	/**
	 * Parse custom sitemap and collect its URLs into $this->_urls
	 *
	 * Handles both sitemap indexes (recursing into each child sitemap) and URL sets.
	 *
	 * @since 1.1.1
	 * @access private
	 *
	 * @param string $sitemap URL of the sitemap or sitemap index to fetch.
	 * @param array  $visited Internal: sitemap URLs already fetched in this run, keyed by URL. Callers omit it.
	 * @throws \Exception When the remote read fails, or when the body is not parseable XML and no URL has been collected yet.
	 */
	private function _parse( $sitemap, &$visited = array() ) {
		// Guard against sitemap indexes that reference themselves or each other
		$sitemap_key = (string) $sitemap;
		if ( isset( $visited[ $sitemap_key ] ) ) {
			return;
		}
		$visited[ $sitemap_key ] = true;

		/**
		 * Read via wp func to avoid allow_url_fopen = off
		 * @since  2.2.7
		 */
		$response = wp_remote_get( $sitemap, array( 'timeout' => $this->_conf_map_timeout ) );
		if ( is_wp_error( $response ) ) {
			$error_message = $response->get_error_message();
			Debug2::debug( '🐞🗺️ failed to read sitemap: ' . $error_message );

			throw new \Exception( 'Failed to remote read ' . $sitemap );
		}

		$xml_object = simplexml_load_string( $response[ 'body' ], null, LIBXML_NOCDATA );
		if ( ! $xml_object ) {
			// Tolerate a broken child sitemap once some URLs have already been collected
			if ( $this->_urls ) {
				return;
			}
			throw new \Exception( 'Failed to parse xml ' . $sitemap );
		}

		// start parsing
		$xml_array = (array) $xml_object;

		if ( ! empty( $xml_array[ 'sitemap' ] ) ) { // parse sitemap set
			if ( is_object( $xml_array[ 'sitemap' ] ) ) {
				$xml_array[ 'sitemap' ] = (array) $xml_array[ 'sitemap' ];
			}

			if ( ! empty( $xml_array[ 'sitemap' ][ 'loc' ] ) ) { // is single sitemap
				$this->_parse( $xml_array[ 'sitemap' ][ 'loc' ], $visited );
			}
			else {
				// parse multiple sitemaps
				foreach ( $xml_array[ 'sitemap' ] as $val ) {
					$val = (array) $val;
					if ( ! empty( $val[ 'loc' ] ) ) {
						$this->_parse( $val[ 'loc' ], $visited ); // recursive parse sitemap
					}
				}
			}
		}
		elseif ( ! empty( $xml_array[ 'url' ] ) ) { // parse url set
			if ( is_object( $xml_array[ 'url' ] ) ) {
				$xml_array[ 'url' ] = (array) $xml_array[ 'url' ];
			}

			// if only 1 element
			if ( ! empty( $xml_array[ 'url' ][ 'loc' ] ) ) {
				$this->_urls[] = $xml_array[ 'url' ][ 'loc' ];
			}
			else {
				foreach ( $xml_array[ 'url' ] as $val ) {
					$val = (array) $val;
					if ( ! empty( $val[ 'loc' ] ) ) {
						$this->_urls[] = $val[ 'loc' ];
					}
				}
			}
		}
	}
}