namespace XF\Http;
use XF\App;
use function ord, strlen;
class MetadataFetcher
// Application object; this class uses it to resolve the extended Metadata class and to reach the censoring string formatter.
* @var App
protected $app;
// HTTP reader that fetch() uses to request the remote document.
* @var Reader
protected $reader;
// Default limits for a fetch; setLimits() overlays caller-supplied values onto these.
protected $limits = [
// NOTE(review): presumably a timeout in seconds — confirm against Reader::getUntrusted().
'time' => 5,
'bytes' => 1.5 * 1024 * 1024 // max size of the document we'll try to download
/**
 * Stores the collaborators this fetcher works with.
 *
 * @param App $app Application object kept for later class extension and string formatting
 * @param Reader $reader HTTP reader used to download the remote document
 */
public function __construct(App $app, Reader $reader)
{
    $this->reader = $reader;
    $this->app = $app;
}
/**
 * Overrides selected fetch limits.
 *
 * Entries in $limits replace the entries with the same key in the current
 * limits (and add any new keys); keys that are not supplied keep their
 * current values.
 *
 * @param array $limits Limit values keyed by limit name
 */
public function setLimits(array $limits)
{
    $merged = array_replace($this->limits, $limits);

    $this->limits = $merged;
}
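* @param string $requestUrl URL of the page to fetch metadata from
* @param string|null $error Filled by reference with a failure message on the error paths that provide one
* @param float|null $startTime Timestamp comparable to microtime(true) from which the time limit is measured
* @param float|int|null $timeLimit Maximum number of seconds that may have elapsed since $startTime
* @return bool|\XF\Http\Metadata Metadata object on success, false on failure
* @throws \Exception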
// Downloads the page at $requestUrl and wraps the part of the HTML that is relevant for metadata
// (normally everything up to </head>) in a Metadata object. Returns false on failure.
public function fetch($requestUrl, &$error = null, $startTime = null, $timeLimit = null)
// Normalise and vet the URL first; getValidRequestUrl() returns false when it may not be fetched.
$requestUrl = $this->getValidRequestUrl($requestUrl, $startTime, $timeLimit);
if (!$requestUrl)
// NOTE(review): $requestUrl was reassigned above and is falsy here, so this message
// cannot contain the URL that was originally passed in — keep the input in a separate variable if it should.
$error = 'Could not get a valid request URL from: ' . htmlspecialchars($requestUrl);
return false;
// Request the document through the reader; a falsy response is treated as a failed fetch.
$response = $this->reader->getUntrusted(
if (!$response)
return false;
// Only a plain 200 response is accepted.
if ($response->getStatusCode() != 200)
$error = 'Response returned a non-successful status code: ' . $response->getStatusCode();
return false;
// Lower-case the header names so the content-type lookup below does not depend on the server's casing.
$headers = array_change_key_case($response->getHeaders(), CASE_LOWER);
// Reject responses whose first content-type value does not start with text/html.
if (
|| !preg_match('#^text/html#i', ltrim($headers['content-type'][0]))
$receivedType = empty($headers['content-type'][0]) ? '-' : $headers['content-type'][0];
$error = 'Response is not HTML (received ' . $receivedType . ')';
return false;
$responseBody = $response->getBody();
// $body accumulates what has been read so far, $bodyLen is its length in bytes.
$body = '';
$bodyLen = 0;
$hasHeadOpen = false;
// Read the body in 1 KB pieces until the stream reports EOF (see the while condition below).
$additional = $responseBody->read(1 * 1024);
$readLen = strlen($additional);
// Begin tag searches 10 bytes before the end of the previously read data,
// so a tag that straddles two reads is still matched.
$searchStart = max(0, $bodyLen - 10);
$body .= $additional;
$bodyLen += $readLen;
// Look for the opening <head> tag until it has been seen once.
if (!$hasHeadOpen)
if (preg_match('#<head(\s|>)#i', $body, $match, 0, $searchStart))
$hasHeadOpen = true;
if ($hasHeadOpen)
if (preg_match('#</head>#i', $body, $match, PREG_OFFSET_CAPTURE, $searchStart))
// We found the end of the head tag, which is where the metadata should be found.
// Remove scripts and CSS quickly so we don't waste time with them.
// $match[0][1] is the byte offset of "</head>"; adding 7 (its length) keeps the closing tag in the result.
$body = ltrim(substr($body, 0, $match[0][1] + 7));
// Ungreedy, dot-all patterns: drop each <script>...</script> pair (self-closing <script .../> is skipped
// by the lookbehind) and each <style>...</style> pair, plus following whitespace.
$body = preg_replace('#<script[^>]*(?<!/)>.*</script>\\s*?#siU', '', $body);
$body = preg_replace('#<style[^>]*>.*</style>\\s*?#siU', '', $body);
// 102400 bytes = 100 KB.
if (!$hasHeadOpen && $bodyLen > 102400)
// we don't have head tags, so we're just going to read the first 100KB
while (!$responseBody->eof());
// Let the application substitute an extended Metadata class, then build it from the collected HTML,
// the lower-cased headers and the validated URL.
$class = 'XF\Http\Metadata';
$class = $this->app->extendClass($class);
return new $class($this->app, $body, $headers, $requestUrl);
/**
 * Normalises a URL for fetching and checks that it may be fetched.
 *
 * The fragment ("#..." to the end) is removed, then every byte outside the
 * set of characters allowed in a URL is percent-encoded. The result is
 * returned only if canFetchUrlHtml() accepts it.
 *
 * @param string $requestUrl URL to normalise
 * @param float|null $startTime Passed through to canFetchUrlHtml()
 * @param float|int|null $timeLimit Passed through to canFetchUrlHtml()
 *
 * @return string|false The normalised URL, or false when it must not be fetched
 */
public function getValidRequestUrl($requestUrl, $startTime = null, $timeLimit = null)
{
    // The fragment is never sent to the server, so drop it.
    $requestUrl = preg_replace('/#.*$/', '', $requestUrl);

    if (preg_match_all('/[^A-Za-z0-9._~:\/?#\[\]@!$&\'()*+,;=%-]/', $requestUrl, $matches))
    {
        // The pattern has no "u" modifier, so each match is exactly one byte.
        // str_replace() already handles every occurrence, so each distinct byte is processed once.
        foreach (array_unique($matches[0]) AS $byte)
        {
            // Always emit two upper-case hex digits: a bare dechex() would turn
            // bytes below 0x10 (tab, line feed, ...) into invalid escapes such as "%9".
            $requestUrl = str_replace($byte, sprintf('%%%02X', ord($byte)), $requestUrl);
        }
    }

    if ($this->canFetchUrlHtml($requestUrl, $startTime, $timeLimit))
    {
        return $requestUrl;
    }

    return false;
}
/**
 * Decides whether the given URL may be fetched right now.
 *
 * The URL is refused when running it through the censor changes it, or when
 * both $startTime and $timeLimit are given and more than $timeLimit has
 * elapsed since $startTime.
 *
 * @param string $requestUrl URL to check
 * @param float|null $startTime Timestamp comparable to microtime(true); the time check is skipped when falsy
 * @param float|int|null $timeLimit Allowed elapsed time; the time check is skipped when falsy
 *
 * @return bool True when the URL may be fetched
 */
protected function canFetchUrlHtml($requestUrl, $startTime = null, $timeLimit = null)
{
    $censoredUrl = $this->app->stringFormatter()->censorText($requestUrl);
    if ($requestUrl != $censoredUrl)
    {
        // Censoring altered the URL, so it contains censored text.
        return false;
    }

    $outOfTime = $startTime
        && $timeLimit
        && (microtime(true) - $startTime > $timeLimit);

    return $outOfTime ? false : true;
}