<?php
/**
* @package s9e\TextFormatter
* @copyright Copyright (c) 2010-2021 The s9e authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Configurator\Helpers;
use DOMElement;
use DOMXPath;
class ElementInspector
{
	/**
	 * Abridged HTML5 content models and rules, with some liberties taken, keyed by element name
	 *
	 * Every element carries up to three bitfields named "c", "ac" and "dd". Each bitfield is a
	 * string of raw bytes, written with octal escapes so that this file stays pure ASCII.
	 *
	 *  - "c"  holds the categories the element belongs to. Categories are the HTML5 content
	 *         models (e.g. "phrasing content", "interactive content") plus a handful of special
	 *         categories that stand for the places where the specs talk about "a group of X and
	 *         Y elements" instead of naming a content model.
	 *  - "ac" holds the categories that are allowed as children of the element.
	 *  - "dd" holds the categories that must not appear as a descendant of the element.
	 *
	 * HTML5 sometimes makes a category or an allowed child conditional. An <img> element, for
	 * example, only counts as "interactive content" when it has a "usemap" attribute. Such a
	 * restriction is written as an XPath expression stored under a key made of the bitfield's
	 * name followed by the bit number it applies to. If "interactive content" were assigned to
	 * bit 2, the <img> definition would hold a key "c2" whose value is "@usemap".
	 *
	 * The remaining keys are flags:
	 *
	 *  - "t"   the element uses the "transparent" content model.
	 *  - "e"   the element uses the "empty" content model.
	 *  - "v"   the element is a void element.
	 *  - "nt"  the element does not accept text nodes. (no text)
	 *  - "to"  the element should only contain text. (text-only)
	 *  - "fe"  the element is a formatting element. It is automatically reopened when closed by
	 *          an end tag of a different name.
	 *  - "b"   the element is not phrasing content, which makes it likely to act like a block
	 *          element.
	 *  - "pre" the element preserves whitespace in its content.
	 *
	 * A flag can be made conditional too: when a key named after the flag plus "?" exists (for
	 * instance "e?"), its value is an XPath expression that must hold true for the flag to apply.
	 *
	 * Lastly, HTML5's "optional end tag" rules, where an element automatically closes its
	 * predecessor, are stored in the "cp" key as a list of the element names that get closed.
	 * They are used to generate closeParent rules.
	 *
	 * @var array
	 * @see /scripts/patchElementInspector.php
	 */
	protected static $htmlElements = [
		'a'=>['c'=>"\17\0\0\0\200",'c3'=>'@href','ac'=>"\0",'dd'=>"\10\0\0\0\200",'t'=>1,'fe'=>1],
		'abbr'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'address'=>['c'=>"\3\40",'ac'=>"\1",'dd'=>"\200\50",'b'=>1,'cp'=>['p']],
		'area'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'article'=>['c'=>"\3\10",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'aside'=>['c'=>"\3\10",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'audio'=>['c'=>"\57",'c3'=>'@controls','c1'=>'@controls','ac'=>"\0\0\0\220",'ac28'=>'not(@src)','dd'=>"\0\0\0\0\0\1",'dd40'=>'@src','t'=>1],
		'b'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'base'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1],
		'bdi'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'bdo'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'blockquote'=>['c'=>"\103",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'body'=>['c'=>"\100\0\20",'ac'=>"\1",'dd'=>"\0",'b'=>1],
		'br'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'button'=>['c'=>"\17\1",'ac'=>"\4",'dd'=>"\10"],
		'canvas'=>['c'=>"\47",'ac'=>"\0",'dd'=>"\10",'t'=>1],
		'caption'=>['c'=>"\0\2",'ac'=>"\1",'dd'=>"\0\0\0\0\1",'b'=>1],
		'cite'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'code'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'col'=>['c'=>"\0\0\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1],
		'colgroup'=>['c'=>"\0\2",'ac'=>"\0\0\100",'ac22'=>'not(@span)','dd'=>"\0",'nt'=>1,'e'=>1,'e?'=>'@span','b'=>1],
		'data'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'datalist'=>['c'=>"\5",'ac'=>"\4\0\1\40",'dd'=>"\0"],
		'dd'=>['c'=>"\0\200\0\1",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['dd','dt']],
		'del'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'t'=>1],
		'details'=>['c'=>"\113",'ac'=>"\1\0\0\10",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'dfn'=>['c'=>"\7\0\0\0\20",'ac'=>"\4",'dd'=>"\0\0\0\0\20"],
		'dialog'=>['c'=>"\101",'ac'=>"\1",'dd'=>"\0",'b'=>1],
		'div'=>['c'=>"\3\200",'ac'=>"\1\0\1\1",'ac0'=>'not(ancestor::dl)','dd'=>"\0",'b'=>1,'cp'=>['p']],
		'dl'=>['c'=>"\3",'c1'=>'dt and dd','ac'=>"\0\200\1",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'dt'=>['c'=>"\0\200\0\1",'ac'=>"\1",'dd'=>"\200\10\10",'b'=>1,'cp'=>['dd','dt']],
		'em'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'embed'=>['c'=>"\57",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'fieldset'=>['c'=>"\103\1",'ac'=>"\1\0\0\100",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'figcaption'=>['c'=>"\0\0\0\0\0\2",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'figure'=>['c'=>"\103",'ac'=>"\1\0\0\0\0\2",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'footer'=>['c'=>"\3\40\10",'ac'=>"\1",'dd'=>"\0\0\10",'b'=>1,'cp'=>['p']],
		'form'=>['c'=>"\3\0\0\0\10",'ac'=>"\1",'dd'=>"\0\0\0\0\10",'b'=>1,'cp'=>['p']],
		'h1'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'h2'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'h3'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'h4'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'h5'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'h6'=>['c'=>"\203\4",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'head'=>['c'=>"\0\0\20",'ac'=>"\20",'dd'=>"\0",'nt'=>1,'b'=>1],
		'header'=>['c'=>"\3\40\10",'ac'=>"\1",'dd'=>"\0\0\10",'b'=>1,'cp'=>['p']],
		'hgroup'=>['c'=>"\203",'ac'=>"\0\4\1",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'hr'=>['c'=>"\1\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1,'cp'=>['p']],
		'html'=>['c'=>"\0",'ac'=>"\0\0\20",'dd'=>"\0",'nt'=>1,'b'=>1],
		'i'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'iframe'=>['c'=>"\57",'ac'=>"\4",'dd'=>"\0"],
		'img'=>['c'=>"\57\1\40",'c3'=>'@usemap','ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'input'=>['c'=>"\17\1",'c3'=>'@type!="hidden"','c8'=>'@type!="hidden" or @type="hidden"','c1'=>'@type!="hidden"','ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'ins'=>['c'=>"\7",'ac'=>"\0",'dd'=>"\0",'t'=>1],
		'kbd'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'label'=>['c'=>"\17\0\0\0\4",'ac'=>"\4",'dd'=>"\0\0\2\0\4"],
		'legend'=>['c'=>"\0\0\0\100",'ac'=>"\204",'dd'=>"\0",'b'=>1],
		'li'=>['c'=>"\0\0\0\0\100",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['li']],
		'link'=>['c'=>"\25",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'main'=>['c'=>"\3",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'map'=>['c'=>"\7",'ac'=>"\0",'dd'=>"\0",'t'=>1],
		'mark'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'media element'=>['c'=>"\0\0\0\0\0\1",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'b'=>1],
		'menu'=>['c'=>"\3\100",'c1'=>'li','ac'=>"\0\100\1",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'menuitem'=>['c'=>"\0\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'b'=>1],
		'meta'=>['c'=>"\25",'c0'=>'@itemprop','c2'=>'@itemprop','ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1],
		'meter'=>['c'=>"\7\0\2\0\2",'ac'=>"\4",'dd'=>"\0\0\0\0\2"],
		'nav'=>['c'=>"\3\10",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'noscript'=>['c'=>"\25",'ac'=>"\0",'dd'=>"\0",'nt'=>1],
		'object'=>['c'=>"\47\0\0\2",'ac'=>"\0",'dd'=>"\0",'t'=>1],
		'ol'=>['c'=>"\3",'c1'=>'li','ac'=>"\0\0\1\0\100",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'optgroup'=>['c'=>"\0\0\4",'ac'=>"\0\0\1\40",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['optgroup','option']],
		'option'=>['c'=>"\0\0\4\40",'ac'=>"\0",'dd'=>"\0",'b'=>1,'cp'=>['option']],
		'output'=>['c'=>"\7\1",'ac'=>"\4",'dd'=>"\0"],
		'p'=>['c'=>"\3",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'picture'=>['c'=>"\45",'ac'=>"\0\0\41",'dd'=>"\0",'nt'=>1],
		'pre'=>['c'=>"\3",'ac'=>"\4",'dd'=>"\0",'pre'=>1,'b'=>1,'cp'=>['p']],
		'progress'=>['c'=>"\7\0\2\4",'ac'=>"\4",'dd'=>"\0\0\0\4"],
		'q'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'rb'=>['c'=>"\0\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'b'=>1],
		'rp'=>['c'=>"\0\20",'ac'=>"\0",'dd'=>"\0",'to'=>1,'b'=>1,'cp'=>['rp','rt']],
		'rt'=>['c'=>"\0\20",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['rp','rt']],
		'rtc'=>['c'=>"\0\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'b'=>1],
		'ruby'=>['c'=>"\7",'ac'=>"\4\20",'dd'=>"\0"],
		's'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'samp'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'script'=>['c'=>"\25\0\1",'ac'=>"\0",'dd'=>"\0",'to'=>1],
		'section'=>['c'=>"\3\10",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']],
		'select'=>['c'=>"\17\1",'ac'=>"\0\0\5",'dd'=>"\0",'nt'=>1],
		'slot'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'t'=>1],
		'small'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'source'=>['c'=>"\0\0\40\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1],
		'span'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'strong'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'style'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'to'=>1,'b'=>1],
		'sub'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'summary'=>['c'=>"\0\0\0\10",'ac'=>"\204",'dd'=>"\0",'b'=>1],
		'sup'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'table'=>['c'=>"\3\0\0\0\1",'ac'=>"\0\2\1",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'tbody'=>['c'=>"\0\2",'ac'=>"\0\0\1\0\40",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['tbody','td','th','thead','tr']],
		'td'=>['c'=>"\100\0\200",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['td','th']],
		'template'=>['c'=>"\25\0\101",'ac'=>"\0",'dd'=>"\0",'nt'=>1],
		'textarea'=>['c'=>"\17\1",'ac'=>"\0",'dd'=>"\0",'pre'=>1,'to'=>1],
		'tfoot'=>['c'=>"\0\2",'ac'=>"\0\0\1\0\40",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['tbody','td','th','thead','tr']],
		'th'=>['c'=>"\0\0\200",'ac'=>"\1",'dd'=>"\200\10\10",'b'=>1,'cp'=>['td','th']],
		'thead'=>['c'=>"\0\2",'ac'=>"\0\0\1\0\40",'dd'=>"\0",'nt'=>1,'b'=>1],
		'time'=>['c'=>"\7",'ac'=>"\4",'ac2'=>'@datetime','dd'=>"\0"],
		'title'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'to'=>1,'b'=>1],
		'tr'=>['c'=>"\0\2\0\0\40",'ac'=>"\0\0\201",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['td','th','tr']],
		'track'=>['c'=>"\0\0\0\200",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1],
		'u'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1],
		'ul'=>['c'=>"\3",'c1'=>'li','ac'=>"\0\0\1\0\100",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']],
		'var'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"],
		'video'=>['c'=>"\57",'c3'=>'@controls','ac'=>"\0\0\0\220",'ac28'=>'not(@src)','dd'=>"\0\0\0\0\0\1",'dd40'=>'@src','t'=>1],
		'wbr'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1]
	];

	/**
	 * Tell whether opening the child element implicitly closes the parent element
	 *
	 * The answer comes from the "cp" list of the child's definition: the parent is closed if its
	 * name appears in that list. Elements with no definition or no "cp" list close nothing.
	 *
	 * @param  DOMElement $child  Element being opened
	 * @param  DOMElement $parent Element that is currently open
	 * @return bool
	 */
	public static function closesParent(DOMElement $child, DOMElement $parent)
	{
		$closedNames = self::$htmlElements[$child->nodeName]['cp'] ?? null;
		if (empty($closedNames))
		{
			return false;
		}

		return in_array($parent->nodeName, $closedNames, true);
	}

	/**
	 * Tell whether text nodes are forbidden inside of given element ("nt" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function disallowsText(DOMElement $element)
	{
		return self::hasProperty($element, 'nt');
	}

	/**
	 * Get the "ac" bitfield of given element: the categories it allows as children
	 *
	 * @param  DOMElement $element
	 * @return string              Raw bytes, with unmet conditional bits cleared
	 */
	public static function getAllowChildBitfield(DOMElement $element)
	{
		return self::getBitfield($element, 'ac');
	}

	/**
	 * Get the "c" bitfield of given element: the categories it belongs to
	 *
	 * @param  DOMElement $element
	 * @return string              Raw bytes, with unmet conditional bits cleared
	 */
	public static function getCategoryBitfield(DOMElement $element)
	{
		return self::getBitfield($element, 'c');
	}

	/**
	 * Get the "dd" bitfield of given element: the categories it forbids as descendants
	 *
	 * @param  DOMElement $element
	 * @return bool|string         Raw bytes, with unmet conditional bits cleared
	 */
	public static function getDenyDescendantBitfield(DOMElement $element)
	{
		return self::getBitfield($element, 'dd');
	}

	/**
	 * Tell whether given element is flagged as a block element ("b" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isBlock(DOMElement $element)
	{
		return self::hasProperty($element, 'b');
	}

	/**
	 * Tell whether given element uses the empty content model ("e" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isEmpty(DOMElement $element)
	{
		return self::hasProperty($element, 'e');
	}

	/**
	 * Tell whether given element is a formatting element ("fe" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isFormattingElement(DOMElement $element)
	{
		return self::hasProperty($element, 'fe');
	}

	/**
	 * Tell whether given element should only contain text ("to" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isTextOnly(DOMElement $element)
	{
		return self::hasProperty($element, 'to');
	}

	/**
	 * Tell whether given element uses the transparent content model ("t" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isTransparent(DOMElement $element)
	{
		return self::hasProperty($element, 't');
	}

	/**
	 * Tell whether given element is a void element ("v" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function isVoid(DOMElement $element)
	{
		return self::hasProperty($element, 'v');
	}

	/**
	 * Tell whether whitespace is preserved inside of given element ("pre" flag)
	 *
	 * @param  DOMElement $element
	 * @return bool
	 */
	public static function preservesWhitespace(DOMElement $element)
	{
		return self::hasProperty($element, 'pre');
	}

	/**
	 * Run an XPath expression as a boolean test against given element
	 *
	 * The expression is wrapped in boolean() and evaluated with the element as context node.
	 *
	 * @param  string     $query   XPath expression
	 * @param  DOMElement $element Context node
	 * @return bool
	 */
	protected static function evaluate($query, DOMElement $element)
	{
		$expression = 'boolean(' . $query . ')';

		return (new DOMXPath($element->ownerDocument))->evaluate($expression, $element);
	}

	/**
	 * Compute one of the bitfields of given element, taking XPath conditions into account
	 *
	 * @param  DOMElement $element Context node
	 * @param  string     $name    Name of the bitfield: 'c', 'ac' or 'dd'
	 * @return string              Raw bytes
	 */
	protected static function getBitfield(DOMElement $element, $name)
	{
		$definition = self::getProperties($element);
		$bits       = self::toBin($definition[$name]);

		// Walk through the bits that are switched on. A bit that has an XPath condition attached
		// (stored under the bitfield's name followed by the bit number) is switched off if the
		// condition does not hold for this element
		$bitCount = strlen($bits);
		for ($pos = 0; $pos < $bitCount; ++$pos)
		{
			if ($bits[$pos] !== '1')
			{
				continue;
			}

			$conditionKey = $name . $pos;
			if (!isset($definition[$conditionKey]))
			{
				continue;
			}

			if (!self::evaluate($definition[$conditionKey], $element))
			{
				$bits[$pos] = '0';
			}
		}

		return self::toRaw($bits);
	}

	/**
	 * Look up the definition of given element
	 *
	 * Elements that have no definition of their own are given the definition of a span.
	 *
	 * @param  DOMElement $element
	 * @return array
	 */
	protected static function getProperties(DOMElement $element)
	{
		$elementName = $element->nodeName;
		if (isset(self::$htmlElements[$elementName]))
		{
			return self::$htmlElements[$elementName];
		}

		return self::$htmlElements['span'];
	}

	/**
	 * Tell whether a flag applies to given element in its current context
	 *
	 * The flag must be set to a non-empty value. If the definition also has a key made of the
	 * flag's name followed by "?", its XPath expression must evaluate to true for the element.
	 *
	 * @param  DOMElement $element  Context node
	 * @param  string     $propName Name of the flag, see self::$htmlElements
	 * @return bool
	 */
	protected static function hasProperty(DOMElement $element, $propName)
	{
		$definition = self::getProperties($element);
		if (empty($definition[$propName]))
		{
			return false;
		}

		$conditionKey = $propName . '?';
		if (!isset($definition[$conditionKey]))
		{
			// Unconditional flag
			return true;
		}

		return (bool) self::evaluate($definition[$conditionKey], $element);
	}

	/**
	 * Expand raw bytes into a string of 0s and 1s, least significant bit first
	 *
	 * @param  string $raw Raw bytes
	 * @return string      Eight characters per input byte
	 */
	protected static function toBin($raw)
	{
		$bits = '';
		foreach (str_split($raw, 1) as $byte)
		{
			// "%08b" gives the byte MSB-first, zero-padded to 8 digits; reverse it for LSB order
			$bits .= strrev(sprintf('%08b', ord($byte)));
		}

		return $bits;
	}

	/**
	 * Pack a string of 0s and 1s, least significant bit first, back into raw bytes
	 *
	 * @param  string $bin Series of 0 and 1, read in groups of eight
	 * @return string      Raw bytes
	 */
	protected static function toRaw($bin)
	{
		$raw = '';
		foreach (str_split($bin, 8) as $octet)
		{
			$raw .= chr(bindec(strrev($octet)));
		}

		return $raw;
	}
}