File "wordfenceURLHoover.php"

Full Path: /home/rrterraplen/public_html/wp-content-20241221212636/plugins/wordfence/lib/wordfenceURLHoover.php
File size: 18.36 KB
MIME-type: text/x-php
Charset: utf-8

<?php
require_once(dirname(__FILE__) . '/wfAPI.php');
require_once(dirname(__FILE__) . '/wfArray.php');
class wordfenceURLHoover {
	private $debug = false;
	public $errorMsg = false;
	private $hostsToAdd = false;
	private $table = '';
	private $apiKey = false;
	private $wordpressVersion = false;
	private $useDB = true;
	private $hostKeys = array();
	private $hostList = array();
	public $currentHooverID = false;
	private $_foundSome = false;
	private $_excludedHosts = array();
	private $api = false;
	private $db = false;
	
	public static function standardExcludedHosts() {
		static $standardExcludedHosts = null;
		if ($standardExcludedHosts !== null) {
			return $standardExcludedHosts;
		}
		
		global $wpdb;
		$excludedHosts = array();
		if (is_multisite()) {
			$blogIDs = $wpdb->get_col("SELECT blog_id FROM {$wpdb->blogs}"); //Can't use wp_get_sites or get_sites because they return empty at 10k sites
			foreach ($blogIDs as $id) {
				$homeURL = get_home_url($id);
				$host = parse_url($homeURL, PHP_URL_HOST);
				if ($host) {
					$excludedHosts[$host] = 1;
				}
				$siteURL = get_site_url($id);
				$host = parse_url($siteURL, PHP_URL_HOST);
				if ($host) {
					$excludedHosts[$host] = 1;
				}
			}
		}
		else {
			$homeURL = wfUtils::wpHomeURL();
			$host = parse_url($homeURL, PHP_URL_HOST);
			if ($host) {
				$excludedHosts[$host] = 1;
			}
			$siteURL = wfUtils::wpSiteURL();
			$host = parse_url($siteURL, PHP_URL_HOST);
			if ($host) {
				$excludedHosts[$host] = 1;
			}
		}
		
		$standardExcludedHosts = array_keys($excludedHosts);
		return $standardExcludedHosts;
	}
	
	public function __sleep() {
		$this->writeHosts();	
		return array('debug', 'errorMsg', 'table', 'apiKey', 'wordpressVersion');
	}
	
	public function __wakeup() {
		$this->hostsToAdd = new wfArray(array('owner', 'host', 'path', 'hostKey'));
		$this->api = new wfAPI($this->apiKey, $this->wordpressVersion);
		$this->db = new wfDB();
	}
	
	public function __construct($apiKey, $wordpressVersion, $db = false, $continuation = false) {
		$this->hostsToAdd = new wfArray(array('owner', 'host', 'path', 'hostKey'));
		$this->apiKey = $apiKey;
		$this->wordpressVersion = $wordpressVersion;
		$this->api = new wfAPI($apiKey, $wordpressVersion);
		if($db){
			$this->db = $db;
		} else {
			$this->db = new wfDB();
		}
		global $wpdb;
		if(isset($wpdb)){
			$this->table = wfDB::networkTable('wfHoover');
		} else {
			$this->table = 'wp_wfHoover';
		}
		
		if (!$continuation) {
			$this->cleanup();
		}
	}
	
	public function cleanup() {
		$this->db->truncate($this->table);
	}
	
	public function hoover($id, $data, $excludedHosts = array()) {
		$this->currentHooverID = $id;
		$this->_foundSome = 0;
		$this->_excludedHosts = $excludedHosts;
		@preg_replace_callback('_((?:(?://)(?:\S+(?::\S*)?@)?(?:(?:(?:[a-z\xa1-\xff0-9.-]+)(?:\.(?:(?:xn--[a-z\xa1-\xff0-9-]+)|[a-z\xa1-\xff]{2,}))))(?::\d{2,5})?)(?:/[a-z0-9\-\_\.~\!\*\(\);\:@&\=\+\$,\?#\[\]%]*)*)_iS', array($this, 'captureURL'), $data);
		$this->writeHosts();
		return $this->_foundSome;
	}
	
	private function dbg($msg) { 
		if ($this->debug) { wordfence::status(4, 'info', $msg); } 
	}
	
	public function captureURL($matches) {
		$id = $this->currentHooverID;
		$url = 'http:' . $matches[0];
		if (!filter_var($url, FILTER_VALIDATE_URL)) {
			return;
		}
		
		$components = parse_url($url);
		if (preg_match('/\.(xn--(?:[a-z0-9-]*)[a-z0-9]+|[a-z\xa1-\xff0-9]{2,})$/i', $components['host'], $tld)) {
			$tld = strtolower($tld[1]);
			if (strpos(wfConfig::get('tldlist', ''), '|' . $tld . '|') === false) {
				return;
			}
		}
		else {
			return;
		}
		
		foreach ($this->_excludedHosts as $h) {
			if (strcasecmp($h, $components['host']) === 0) {
				return;
			}
		}
		
		$this->_foundSome++;
		
		$host = (isset($components['host']) ? $components['host'] : '');
		$path = (isset($components['path']) && !empty($components['path']) ? $components['path'] : '/');
		$hashes = $this->_generateHashes($url);
		foreach ($hashes as $h) {
			$this->hostsToAdd->push(array('owner' => $id, 'host' => $host, 'path' => $path, 'hostKey' => wfUtils::substr($h, 0, 4)));
		}
		
		if($this->hostsToAdd->size() > 1000){ $this->writeHosts(); }
	}
	
	private function writeHosts() {
		if ($this->hostsToAdd->size() < 1) { return; }
		if ($this->useDB) {
			$sql = "INSERT INTO " . $this->table . " (owner, host, path, hostKey) VALUES ";
			while ($elem = $this->hostsToAdd->shift()) {
				//This may be an issue for hyperDB or other abstraction layers, but leaving it for now.
				$sql .= sprintf("('%s', '%s', '%s', '%s'),", 
						$this->db->realEscape($elem['owner']),
						$this->db->realEscape($elem['host']),
						$this->db->realEscape($elem['path']),
						$this->db->realEscape($elem['hostKey'])
								);
			}
			$sql = rtrim($sql, ',');
			$this->db->queryWrite($sql);
			$this->hostsToAdd->collectGarbage();
		}
		else {
			while ($elem = $this->hostsToAdd->shift()) {
				$keys = str_split($elem['hostKey'], 4);
				foreach ($keys as $k) {
					$this->hostKeys[] = $k;
				}
				$this->hostList[] = array(
					'owner' => $elem['owner'],
					'host' => $elem['host'],
					'path' => $elem['path'],
					'hostKey' => $elem['hostKey']
					);
			}
			$this->hostsToAdd->collectGarbage();
		}
	}
	public function getBaddies() {
		wordfence::status(4, 'info', __("Gathering host keys.", 'wordfence'));
		$allHostKeys = '';
		if ($this->useDB) {
			global $wpdb;
			$dbh = $wpdb->dbh;
			$useMySQLi = wfUtils::useMySQLi();
			if ($useMySQLi) { //If direct-access MySQLi is available, we use it to minimize the memory footprint instead of letting it fetch everything into an array first
				wordfence::status(4, 'info', __("Using MySQLi directly.", 'wordfence'));
				$result = $dbh->query("SELECT DISTINCT hostKey FROM {$this->table} ORDER BY hostKey ASC LIMIT 100000"); /* We limit to 100,000 prefixes since more than that cannot be reliably checked within the default max_execution_time */
				if (!is_object($result)) {
					$this->errorMsg = "Unable to query database";
					$this->dbg($this->errorMsg);
					return false;
				}
				while ($row = $result->fetch_assoc()) {
					$allHostKeys .= $row['hostKey'];
				}
			}
			else {
				$q1 = $this->db->querySelect("SELECT DISTINCT hostKey FROM {$this->table} ORDER BY hostKey ASC LIMIT 100000"); /* We limit to 100,000 prefixes since more than that cannot be reliably checked within the default max_execution_time */
				foreach ($q1 as $hRec) {
					$allHostKeys .= $hRec['hostKey'];
				}
			}
		}
		else {
			$allHostKeys = implode('', array_values(array_unique($this->hostKeys)));
		}
		
		/**
		 * Check hash prefixes first. Each one is a 4-byte binary prefix of a SHA-256 hash of the URL. The response will
		 * be a binary list of 4-byte indices; The full URL for each index should be sent in the secondary query to
		 * find the true good/bad status.
		 */
		
		$allCount = wfUtils::strlen($allHostKeys) / 4;
		if ($allCount > 0) {
			if ($this->debug) {
				$this->dbg("Checking {$allCount} hostkeys");
				for ($i = 0; $i < $allCount; $i++) {
					$key = wfUtils::substr($allHostKeys, $i * 4, 4);
					$this->dbg("Checking hostkey: " . bin2hex($key));
				}
			}
			
			wordfence::status(2, 'info', sprintf(/* translators: Number of domains. */ __("Checking %d host keys against Wordfence scanning servers.", 'wordfence'), $allCount));
			$resp = $this->api->binCall('check_host_keys', $allHostKeys);
			wordfence::status(2, 'info', __("Done host key check.", 'wordfence'));
			$this->dbg("Done host key check");

			$badHostKeys = '';
			if ($resp['code'] >= 200 && $resp['code'] <= 299) {
				$this->dbg("Host key response: " . bin2hex($resp['data']));
				$dataLen = wfUtils::strlen($resp['data']);
				if ($dataLen > 0 && $dataLen % 2 == 0) {
					$this->dbg("Checking response indexes");
					for ($i = 0; $i < $dataLen; $i += 2) {
						$idx = wfUtils::array_first(unpack('n', wfUtils::substr($resp['data'], $i, 2)));
						$this->dbg("Checking index {$idx}");
						if ($idx < $allCount) {
							$prefix = wfUtils::substr($allHostKeys, $idx * 4, 4);
							$badHostKeys .= $prefix;
							$this->dbg("Got bad hostkey for record: " . bin2hex($prefix));
						}
						else {
							$this->dbg("Bad allHostKeys index: {$idx}");
							$this->errorMsg = "Bad allHostKeys index: {$idx}";
							return false;
						}
					}
				}
				else if ($dataLen > 0) {
					$this->errorMsg = "Invalid data length received from Wordfence server: " . $dataLen;
					$this->dbg($this->errorMsg);
					return false;
				}
			}
			else {
				$this->errorMsg = "Wordfence server responded with an error. HTTP code " . $resp['code'] . " and data: " . $resp['data'];
				return false;
			}
			
			$badCount = wfUtils::strlen($badHostKeys) / 4;
			if ($badCount > 0) {
				$urlsToCheck = array();
				$totalURLs = 0;
				
				//Reconcile flagged prefixes with their corresponding URLs
				for ($i = 0; $i < $badCount; $i++) {
					$prefix = wfUtils::substr($badHostKeys, $i * 4, 4);
					
					if ($this->useDB) {
						/**
						 * Putting a 10000 limit in here for sites that have a huge number of items with the same URL 
						 * that repeats. This is an edge case. But if the URLs are malicious then presumably the admin 
						 * will fix the malicious URLs and on subsequent scans the items (owners) that are above the 
						 * 10000 limit will appear.
						 */
						$q1 = $this->db->querySelect("SELECT DISTINCT owner, host, path FROM {$this->table} WHERE hostKey = %s LIMIT 10000", $prefix);
						foreach ($q1 as $rec) {
							$url = 'http://' . $rec['host'] . $rec['path'];
							if (!isset($urlsToCheck[$rec['owner']])) {
								$urlsToCheck[$rec['owner']] = array();
							}
							if (!in_array($url, $urlsToCheck[$rec['owner']])) {
								$urlsToCheck[$rec['owner']][] = $url;
								$totalURLs++;
							}
						}
					}
					else {
						foreach ($this->hostList as $rec) {
							$pos = wfUtils::strpos($rec['hostKey'], $prefix);
							if ($pos !== false && $pos % 4 == 0) {
								$url = 'http://' . $rec['host'] . $rec['path'];
								if (!isset($urlsToCheck[$rec['owner']])) {
									$urlsToCheck[$rec['owner']] = array();
								}
								if (!in_array($url, $urlsToCheck[$rec['owner']])) {
									$urlsToCheck[$rec['owner']][] = $url;
									$totalURLs++;
								}
							}
						}
					}
					if ($totalURLs > 10000) { break; }
				}

				if (count($urlsToCheck) > 0) {
					wordfence::status(2, 'info', sprintf(
						/* translators: 1. Number of URLs. 2. Number of files. */
						__('Checking %1$d URLs from %2$d sources.', 'wordfence'),
						$totalURLs,
						sizeof($urlsToCheck)
					));
					$badURLs = $this->api->call('check_bad_urls', array(), array('toCheck' => json_encode($urlsToCheck)));
					wordfence::status(2, 'info', __("Done URL check.", 'wordfence'));
					$this->dbg("Done URL check");
					if (is_array($badURLs) && count($badURLs) > 0) {
						$finalResults = array();
						foreach ($badURLs as $file => $badSiteList) {
							if (!isset($finalResults[$file])) {
								$finalResults[$file] = array();
							}
							foreach ($badSiteList as $badSite) {
								$finalResults[$file][] = array(
									'URL' => $badSite[0],
									'badList' => $badSite[1]
									);
							}
						}
						$this->dbg("Confirmed " . count($badURLs) . " bad URLs");
						return $finalResults;
					}
				}
			}
		}
		
		return array();
	}
	
	protected function _generateHashes($url) {
		//The GSB specification requires generating and sending hash prefixes for a number of additional similar URLs. See: https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions
		
		$canonicalURL = $this->_canonicalizeURL($url);
		
		//Extract the scheme
		$scheme = 'http';
		if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $canonicalURL, $matches)) {
			$scheme = strtolower($matches[1]);
			$canonicalURL = $matches[2];
		}
		
		//Separate URL and query string
		$query = '';
		if (preg_match('/^([^?]+)(\??.*)/', $canonicalURL, $matches)) {
			$canonicalURL = $matches[1];
			$query = $matches[2];
		}
		
		//Separate host and path
		$path = '';
		preg_match('~^(.*?)(?:(/.*)|$)~', $canonicalURL, $matches);
		$host = $matches[1];
		if (isset($matches[2])) {
			$path = $matches[2];
		}
		
		//Clean host
		$host = $this->_normalizeHost($host);
		
		//Generate hosts list
		$hosts = array();
		if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP)) {
			$hosts[] = $host;
		}
		else {
			$hostComponents = explode('.', $host);
			
			$numComponents = count($hostComponents) - 7;
			if ($numComponents < 1) {
				$numComponents = 1;
			}
			
			$hosts[] = $host;
			for ($i = $numComponents; $i < count($hostComponents) - 1; $i++) {
				$hosts[] = implode('.', array_slice($hostComponents, $i));
			}
		}
		
		//Generate paths list
		$paths = array('/');
		$pathComponents = array_filter(explode('/', $path));
		
		$numComponents = min(count($pathComponents), 4);
		for ($i = 1; $i < $numComponents; $i++) {
			$paths[] = '/' . implode('/', array_slice($pathComponents, 0, $i)) . '/';
		}
		if ($path != '/') {
			$paths[] = $path;
		}
		if (strlen($query) > 0) {
			$paths[] = $path . '?' . $query;
		}
		$paths = array_reverse($paths); //So we start at the most specific and move to most generic
		
		//Generate hashes
		$hashes = array();
		foreach ($hosts as $h) {
			$hashes[$h] = hash('sha256', $h, true); //WFSB compatibility -- it uses hashes without the path
			foreach ($paths as $p) {
				$key = $h . $p;
				$hashes[$key] = hash('sha256', $key, true);
				break; //We no longer have any use for the extra path variants, so just include the primary one and exit the loop after
			}
		}
		
		return $hashes;
	}
	
	protected function _canonicalizeURL($url) { //Based on https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization and Google's reference implementation https://github.com/google/safebrowsing/blob/master/urls.go
		//Strip fragment
		$url = $this->_array_first(explode('#', $url));
		
		//Trim space
		$url = trim($url);
		
		//Remove tabs, CR, LF
		$url = preg_replace('/[\t\n\r]/', '', $url);
		
		//Normalize escapes
		$url = $this->_normalizeEscape($url);
		if ($url === false) { return false; }
		
		//Extract the scheme
		$scheme = 'http';
		if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $url, $matches)) {
			$scheme = strtolower($matches[1]);
			$url = $matches[2];
		}
		
		//Separate URL and query string
		$query = '';
		if (preg_match('/^([^?]+)(\??.*)/', $url, $matches)) {
			$url = $matches[1];
			$query = $matches[2];
		}
		$endsWithSlash = substr($url, -1) == '/';
		
		//Separate host and path
		$path = '';
		preg_match('~^(.*?)(?:(/.*)|$)~', $url, $matches);
		$host = $matches[1];
		if (isset($matches[2])) {
			$path = $matches[2];
		}
		
		//Clean host
		$host = $this->_normalizeHost($host);
		if ($host === false) { return false; }
		
		//Clean path
		$path = preg_replace('~//+~', '/', $path); //Multiple slashes -> single slash
		$path = preg_replace('~(?:^|/)\.(?:$|/)~', '/', $path); //. path components removed
		while (preg_match('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', $path)) { //Resolve ..
			$path = preg_replace('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', '/', $path, 1);
		}
		$path = preg_replace('~(?:^|/)\.\.(?:$|/)~', '/', $path); //Eliminate .. at the beginning
		$path = trim($path, '.');
		$path = preg_replace('/\.\.+/', '.', $path);
		
		if ($path == '.' || $path == '') {
			$path = '/';
		}
		else if ($endsWithSlash && substr($path, -1) != '/') {
			$path .= '/';
		}
		
		return $scheme . '://' . $host . $path . $query;
	}
	
	protected function _normalizeEscape($url) {
		$maxDepth = 1024;
		$i = 0;
		while (preg_match('/%([0-9a-f]{2})/i', $url)) {
			$url = preg_replace_callback('/%([0-9a-f]{2})/i', array($this, '_hex2binCallback'), $url);
			$i++;
			
			if ($i > $maxDepth) {
				return false;
			}
		}
		
		return preg_replace_callback('/[\x00-\x20\x7f-\xff#%]/', array($this, '_bin2hexCallback'), $url);
	}
	
	protected function _hex2binCallback($matches) {
		return wfUtils::hex2bin($matches[1]);
	}

	protected function _bin2hexCallback($matches) {
		return '%' . bin2hex($matches[0]);	
	}
	
	protected function _normalizeHost($host) {
		//Strip username:password
		$host = $this->_array_last(explode('@', $host));
		
		//IPv6 literal
		if (substr($host, 0, 1) == '[') {
			if (strpos($host, ']') === false) { //No closing bracket
				return false;
			}
		}
		
		//Strip port
		$host = preg_replace('/:\d+$/', '', $host);
		
		//Unicode to IDNA
		$u = rawurldecode($host);
		if (preg_match('/[\x81-\xff]/', $u)) { //0x80 is technically Unicode, but the GSB canonicalization doesn't consider it one
			if (function_exists('idn_to_ascii')) { //Some PHP versions don't have this and we don't have a polyfill
				$host = idn_to_ascii($u);
			}
		}
		
		//Remove extra dots
		$host = trim($host, '.');
		$host = preg_replace('/\.\.+/', '.', $host);
		
		//Canonicalize IP addresses
		if ($iphost = $this->_parseIP($host)) {
			return $iphost;
		}
		
		return strtolower($host);
	}
	
	protected function _parseIP($host) {
		// The Windows resolver allows a 4-part dotted decimal IP address to have a
		// space followed by any old rubbish, so long as the total length of the
		// string doesn't get above 15 characters. So, "10.192.95.89 xy" is
		// resolved to 10.192.95.89. If the string length is greater than 15
		// characters, e.g. "10.192.95.89 xy.wildcard.example.com", it will be
		// resolved through DNS.
		if (strlen($host) <= 15) {
			$host = $this->_array_first(explode(' ', $host));
		}
		
		if (!preg_match('/^((?:0x[0-9a-f]+|[0-9\.])+)$/i', $host)) {
			return false;
		}
		
		$parts = explode('.', $host);
		if (count($parts) > 4) {
			return false;
		}
		
		$strings = array();
		foreach ($parts as $i => $p) {
			if ($i == count($parts) - 1) {
				$strings[] = $this->_canonicalNum($p, 5 - count($parts));
			}
			else {
				$strings[] = $this->_canonicalNum($p, 1);
			}
			
			if ($strings[$i] == '') {
				return '';
			}
		}
		
		return implode('.', $strings);
	}
	
	protected function _canonicalNum($part, $n) {
		if ($n <= 0 || $n > 4) {
			return '';
		}
		
		if (preg_match('/^0x(\d+)$/i', $part, $matches)) { //hex
			$part = hexdec($matches[1]);
		}
		else if (preg_match('/^0(\d+)$/i', $part, $matches)) { //octal
			$part = octdec($matches[1]);
		}
		else {
			$part = (int) $part;
		}
		
		$strings = array_fill(0, $n, '');
		for ($i = $n - 1; $i >= 0; $i--) {
			$strings[$i] = (string) ($part & 0xff);
			$part = $part >> 8;
		}
		return implode('.', $strings);
	}
	
	protected function _array_first($array) {
		if (empty($array)) {
			return null;
		}
		
		return $array[0];
	}
	
	protected function _array_last($array) {
		if (empty($array)) {
			return null;
		}
		
		return $array[count($array) - 1];
	}
}