HTML: Aligned and standardised DOMDocument usage

Adds a thin wrapper for DOMDocument to simplify and align usage within
all areas of BookStack.
Also means we move away from old depreacted mb_convert_encoding usage.

Closes #4638
This commit is contained in:
Dan Brown 2023-11-14 15:46:32 +00:00
parent 3a6f50e668
commit db7b11fe93
No known key found for this signature in database
GPG Key ID: 46D9F943C24A2EF9
8 changed files with 214 additions and 163 deletions

View File

@ -8,9 +8,8 @@ use BookStack\Entities\Models\Page;
use BookStack\Entities\Tools\Markdown\HtmlToMarkdown; use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
use BookStack\Uploads\ImageService; use BookStack\Uploads\ImageService;
use BookStack\Util\CspService; use BookStack\Util\CspService;
use DOMDocument; use BookStack\Util\HtmlDocument;
use DOMElement; use DOMElement;
use DOMXPath;
use Exception; use Exception;
use Throwable; use Throwable;
@ -151,45 +150,36 @@ class ExportFormatter
protected function htmlToPdf(string $html): string protected function htmlToPdf(string $html): string
{ {
$html = $this->containHtml($html); $html = $this->containHtml($html);
$html = $this->replaceIframesWithLinks($html); $doc = new HtmlDocument();
$html = $this->openDetailElements($html); $doc->loadCompleteHtml($html);
return $this->pdfGenerator->fromHtml($html); $this->replaceIframesWithLinks($doc);
$this->openDetailElements($doc);
$cleanedHtml = $doc->getHtml();
return $this->pdfGenerator->fromHtml($cleanedHtml);
} }
/** /**
* Within the given HTML content, Open any detail blocks. * Within the given HTML content, Open any detail blocks.
*/ */
protected function openDetailElements(string $html): string protected function openDetailElements(HtmlDocument $doc): void
{ {
libxml_use_internal_errors(true); $details = $doc->queryXPath('//details');
$doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
$xPath = new DOMXPath($doc);
$details = $xPath->query('//details');
/** @var DOMElement $detail */ /** @var DOMElement $detail */
foreach ($details as $detail) { foreach ($details as $detail) {
$detail->setAttribute('open', 'open'); $detail->setAttribute('open', 'open');
} }
return $doc->saveHTML();
} }
/** /**
* Within the given HTML content, replace any iframe elements * Within the given HTML document, replace any iframe elements
* with anchor links within paragraph blocks. * with anchor links within paragraph blocks.
*/ */
protected function replaceIframesWithLinks(string $html): string protected function replaceIframesWithLinks(HtmlDocument $doc): void
{ {
libxml_use_internal_errors(true); $iframes = $doc->queryXPath('//iframe');
$doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
$xPath = new DOMXPath($doc);
$iframes = $xPath->query('//iframe');
/** @var DOMElement $iframe */ /** @var DOMElement $iframe */
foreach ($iframes as $iframe) { foreach ($iframes as $iframe) {
$link = $iframe->getAttribute('src'); $link = $iframe->getAttribute('src');
@ -203,8 +193,6 @@ class ExportFormatter
$paragraph->appendChild($anchor); $paragraph->appendChild($anchor);
$iframe->parentNode->replaceChild($paragraph, $iframe); $iframe->parentNode->replaceChild($paragraph, $iframe);
} }
return $doc->saveHTML();
} }
/** /**

View File

@ -10,11 +10,10 @@ use BookStack\Theming\ThemeEvents;
use BookStack\Uploads\ImageRepo; use BookStack\Uploads\ImageRepo;
use BookStack\Uploads\ImageService; use BookStack\Uploads\ImageService;
use BookStack\Util\HtmlContentFilter; use BookStack\Util\HtmlContentFilter;
use DOMDocument; use BookStack\Util\HtmlDocument;
use DOMElement; use DOMElement;
use DOMNode; use DOMNode;
use DOMNodeList; use DOMNodeList;
use DOMXPath;
use Illuminate\Support\Str; use Illuminate\Support\Str;
class PageContent class PageContent
@ -56,27 +55,17 @@ class PageContent
return $htmlText; return $htmlText;
} }
$doc = $this->loadDocumentFromHtml($htmlText); $doc = new HtmlDocument($htmlText);
$container = $doc->documentElement;
$body = $container->childNodes->item(0);
$childNodes = $body->childNodes;
$xPath = new DOMXPath($doc);
// Get all img elements with image data blobs // Get all img elements with image data blobs
$imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]'); $imageNodes = $doc->queryXPath('//img[contains(@src, \'data:image\')]');
foreach ($imageNodes as $imageNode) { foreach ($imageNodes as $imageNode) {
$imageSrc = $imageNode->getAttribute('src'); $imageSrc = $imageNode->getAttribute('src');
$newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc); $newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc);
$imageNode->setAttribute('src', $newUrl); $imageNode->setAttribute('src', $newUrl);
} }
// Generate inner html as a string return $doc->getBodyInnerHtml();
$html = '';
foreach ($childNodes as $childNode) {
$html .= $doc->saveHTML($childNode);
}
return $html;
} }
/** /**
@ -172,27 +161,18 @@ class PageContent
return $htmlText; return $htmlText;
} }
$doc = $this->loadDocumentFromHtml($htmlText); $doc = new HtmlDocument($htmlText);
$container = $doc->documentElement;
$body = $container->childNodes->item(0);
$childNodes = $body->childNodes;
$xPath = new DOMXPath($doc);
// Map to hold used ID references // Map to hold used ID references
$idMap = []; $idMap = [];
// Map to hold changing ID references // Map to hold changing ID references
$changeMap = []; $changeMap = [];
$this->updateIdsRecursively($body, 0, $idMap, $changeMap); $this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
$this->updateLinks($xPath, $changeMap); $this->updateLinks($doc, $changeMap);
// Generate inner html as a string // Generate inner html as a string & perform required string-level tweaks
$html = ''; $html = $doc->getBodyInnerHtml();
foreach ($childNodes as $childNode) {
$html .= $doc->saveHTML($childNode);
}
// Perform required string-level tweaks
$html = str_replace(' ', ' ', $html); $html = str_replace(' ', ' ', $html);
return $html; return $html;
@ -225,13 +205,13 @@ class PageContent
* Update the all links in the given xpath to apply requires changes within the * Update the all links in the given xpath to apply requires changes within the
* given $changeMap array. * given $changeMap array.
*/ */
protected function updateLinks(DOMXPath $xpath, array $changeMap): void protected function updateLinks(HtmlDocument $doc, array $changeMap): void
{ {
if (empty($changeMap)) { if (empty($changeMap)) {
return; return;
} }
$links = $xpath->query('//body//*//*[@href]'); $links = $doc->queryXPath('//body//*//*[@href]');
/** @var DOMElement $domElem */ /** @var DOMElement $domElem */
foreach ($links as $domElem) { foreach ($links as $domElem) {
$href = ltrim($domElem->getAttribute('href'), '#'); $href = ltrim($domElem->getAttribute('href'), '#');
@ -321,11 +301,10 @@ class PageContent
return []; return [];
} }
$doc = $this->loadDocumentFromHtml($htmlContent); $doc = new HtmlDocument($htmlContent);
$xPath = new DOMXPath($doc); $headers = $doc->queryXPath('//h1|//h2|//h3|//h4|//h5|//h6');
$headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');
return $headers ? $this->headerNodesToLevelList($headers) : []; return $headers->count() === 0 ? [] : $this->headerNodesToLevelList($headers);
} }
/** /**
@ -420,7 +399,7 @@ class PageContent
protected function fetchSectionOfPage(Page $page, string $sectionId): string protected function fetchSectionOfPage(Page $page, string $sectionId): string
{ {
$topLevelTags = ['table', 'ul', 'ol', 'pre']; $topLevelTags = ['table', 'ul', 'ol', 'pre'];
$doc = $this->loadDocumentFromHtml($page->html); $doc = new HtmlDocument($page->html);
// Search included content for the id given and blank out if not exists. // Search included content for the id given and blank out if not exists.
$matchingElem = $doc->getElementById($sectionId); $matchingElem = $doc->getElementById($sectionId);
@ -430,30 +409,11 @@ class PageContent
// Otherwise replace the content with the found content // Otherwise replace the content with the found content
// Checks if the top-level wrapper should be included by matching on tag types // Checks if the top-level wrapper should be included by matching on tag types
$innerContent = '';
$isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags); $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);
if ($isTopLevel) { if ($isTopLevel) {
$innerContent .= $doc->saveHTML($matchingElem); return $doc->getNodeOuterHtml($matchingElem);
} else {
foreach ($matchingElem->childNodes as $childNode) {
$innerContent .= $doc->saveHTML($childNode);
}
} }
libxml_clear_errors();
return $innerContent; return $doc->getNodeInnerHtml($matchingElem);
}
/**
* Create and load a DOMDocument from the given html content.
*/
protected function loadDocumentFromHtml(string $html): DOMDocument
{
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$doc->loadHTML($html);
return $doc;
} }
} }

View File

@ -9,8 +9,7 @@ use BookStack\References\ModelResolvers\ChapterLinkModelResolver;
use BookStack\References\ModelResolvers\CrossLinkModelResolver; use BookStack\References\ModelResolvers\CrossLinkModelResolver;
use BookStack\References\ModelResolvers\PageLinkModelResolver; use BookStack\References\ModelResolvers\PageLinkModelResolver;
use BookStack\References\ModelResolvers\PagePermalinkModelResolver; use BookStack\References\ModelResolvers\PagePermalinkModelResolver;
use DOMDocument; use BookStack\Util\HtmlDocument;
use DOMXPath;
class CrossLinkParser class CrossLinkParser
{ {
@ -54,13 +53,8 @@ class CrossLinkParser
{ {
$links = []; $links = [];
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>'; $doc = new HtmlDocument($html);
libxml_use_internal_errors(true); $anchors = $doc->queryXPath('//a[@href]');
$doc = new DOMDocument();
$doc->loadHTML($html);
$xPath = new DOMXPath($doc);
$anchors = $xPath->query('//a[@href]');
/** @var \DOMElement $anchor */ /** @var \DOMElement $anchor */
foreach ($anchors as $anchor) { foreach ($anchors as $anchor) {

View File

@ -6,18 +6,14 @@ use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page; use BookStack\Entities\Models\Page;
use BookStack\Entities\Repos\RevisionRepo; use BookStack\Entities\Repos\RevisionRepo;
use DOMDocument; use BookStack\Util\HtmlDocument;
use DOMXPath;
class ReferenceUpdater class ReferenceUpdater
{ {
protected ReferenceFetcher $referenceFetcher; public function __construct(
protected RevisionRepo $revisionRepo; protected ReferenceFetcher $referenceFetcher,
protected RevisionRepo $revisionRepo
public function __construct(ReferenceFetcher $referenceFetcher, RevisionRepo $revisionRepo) ) {
{
$this->referenceFetcher = $referenceFetcher;
$this->revisionRepo = $revisionRepo;
} }
public function updateEntityPageReferences(Entity $entity, string $oldLink) public function updateEntityPageReferences(Entity $entity, string $oldLink)
@ -96,13 +92,8 @@ class ReferenceUpdater
return $html; return $html;
} }
$html = '<body>' . $html . '</body>'; $doc = new HtmlDocument($html);
libxml_use_internal_errors(true); $anchors = $doc->queryXPath('//a[@href]');
$doc = new DOMDocument();
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
$xPath = new DOMXPath($doc);
$anchors = $xPath->query('//a[@href]');
/** @var \DOMElement $anchor */ /** @var \DOMElement $anchor */
foreach ($anchors as $anchor) { foreach ($anchors as $anchor) {
@ -111,12 +102,6 @@ class ReferenceUpdater
$anchor->setAttribute('href', $updated); $anchor->setAttribute('href', $updated);
} }
$html = ''; return $doc->getBodyInnerHtml();
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
foreach ($topElems as $child) {
$html .= $doc->saveHTML($child);
}
return $html;
} }
} }

View File

@ -6,7 +6,7 @@ use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider; use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page; use BookStack\Entities\Models\Page;
use DOMDocument; use BookStack\Util\HtmlDocument;
use DOMNode; use DOMNode;
use Illuminate\Database\Eloquent\Builder; use Illuminate\Database\Eloquent\Builder;
use Illuminate\Support\Collection; use Illuminate\Support\Collection;
@ -138,16 +138,11 @@ class SearchIndex
'h6' => 1.5, 'h6' => 1.5,
]; ];
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html); $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
$doc = new HtmlDocument($html);
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML($html);
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
/** @var DOMNode $child */ /** @var DOMNode $child */
foreach ($topElems as $child) { foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName; $nodeName = $child->nodeName;
$termCounts = $this->textToTermCountMap(trim($child->textContent)); $termCounts = $this->textToTermCountMap(trim($child->textContent));
foreach ($termCounts as $term => $count) { foreach ($termCounts as $term => $count) {
@ -168,7 +163,6 @@ class SearchIndex
*/ */
protected function generateTermScoreMapFromTags(array $tags): array protected function generateTermScoreMapFromTags(array $tags): array
{ {
$scoreMap = [];
$names = []; $names = [];
$values = []; $values = [];

View File

@ -3,10 +3,8 @@
namespace BookStack\Util; namespace BookStack\Util;
use DOMAttr; use DOMAttr;
use DOMDocument;
use DOMElement; use DOMElement;
use DOMNodeList; use DOMNodeList;
use DOMXPath;
class HtmlContentFilter class HtmlContentFilter
{ {
@ -19,54 +17,44 @@ class HtmlContentFilter
return $html; return $html;
} }
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>'; $doc = new HtmlDocument($html);
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML($html);
$xPath = new DOMXPath($doc);
// Remove standard script tags // Remove standard script tags
$scriptElems = $xPath->query('//script'); $scriptElems = $doc->queryXPath('//script');
static::removeNodes($scriptElems); static::removeNodes($scriptElems);
// Remove clickable links to JavaScript URI // Remove clickable links to JavaScript URI
$badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']'); $badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
static::removeNodes($badLinks); static::removeNodes($badLinks);
// Remove forms with calls to JavaScript URI // Remove forms with calls to JavaScript URI
$badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'); $badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
static::removeNodes($badForms); static::removeNodes($badForms);
// Remove meta tag to prevent external redirects // Remove meta tag to prevent external redirects
$metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']'); $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags); static::removeNodes($metaTags);
// Remove data or JavaScript iFrames // Remove data or JavaScript iFrames
$badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]'); $badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
static::removeNodes($badIframes); static::removeNodes($badIframes);
// Remove attributes, within svg children, hiding JavaScript or data uris. // Remove attributes, within svg children, hiding JavaScript or data uris.
// A bunch of svg element and attribute combinations expose xss possibilities. // A bunch of svg element and attribute combinations expose xss possibilities.
// For example, SVG animate tag can exploit javascript in values. // For example, SVG animate tag can exploit javascript in values.
$badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']'); $badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
static::removeAttributes($badValuesAttrs); static::removeAttributes($badValuesAttrs);
// Remove elements with a xlink:href attribute // Remove elements with a xlink:href attribute
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here. // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
$xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]'); $xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
static::removeAttributes($xlinkHrefAttributes); static::removeAttributes($xlinkHrefAttributes);
// Remove 'on*' attributes // Remove 'on*' attributes
$onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]'); $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
static::removeAttributes($onAttributes); static::removeAttributes($onAttributes);
$html = ''; return $doc->getBodyInnerHtml();
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
foreach ($topElems as $child) {
$html .= $doc->saveHTML($child);
}
return $html;
} }
/** /**

152
app/Util/HtmlDocument.php Normal file
View File

@ -0,0 +1,152 @@
<?php
namespace BookStack\Util;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMXPath;
/**
* HtmlDocument is a thin wrapper around DOMDocument built
* specifically for loading, querying and generating HTML content.
*/
class HtmlDocument
{
protected DOMDocument $document;
protected ?DOMXPath $xpath = null;
protected int $loadOptions;
public function __construct(string $partialHtml = '', int $loadOptions = 0)
{
libxml_use_internal_errors(true);
$this->document = new DOMDocument();
$this->loadOptions = $loadOptions;
if ($partialHtml) {
$this->loadPartialHtml($partialHtml);
}
}
/**
* Load some HTML content that's part of a document (e.g. body content)
* into the current document.
*/
public function loadPartialHtml(string $html): void
{
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
$this->document->loadHTML($html, $this->loadOptions);
$this->xpath = null;
}
/**
* Load a complete page of HTML content into the document.
*/
public function loadCompleteHtml(string $html): void
{
$html = '<?xml encoding="utf-8" ?>' . $html;
$this->document->loadHTML($html, $this->loadOptions);
$this->xpath = null;
}
/**
* Start an XPath query on the current document.
*/
public function queryXPath(string $expression): DOMNodeList
{
if (is_null($this->xpath)) {
$this->xpath = new DOMXPath($this->document);
}
$result = $this->xpath->query($expression);
if ($result === false) {
throw new \InvalidArgumentException("XPath query for expression [$expression] failed to execute");
}
return $result;
}
/**
* Create a new DOMElement instance within the document.
*/
public function createElement(string $localName, string $value = ''): DOMElement
{
$element = $this->document->createElement($localName, $value);
if ($element === false) {
throw new \InvalidArgumentException("Failed to create element of name [$localName] and value [$value]");
}
return $element;
}
/**
* Get an element within the document of the given ID.
*/
public function getElementById(string $elementId): ?DOMElement
{
return $this->document->getElementById($elementId);
}
/**
* Get the DOMNode that represents the HTML body.
*/
public function getBody(): DOMNode
{
return $this->document->getElementsByTagName('body')[0];
}
/**
* Get the nodes that are a direct child of the body.
* This is usually all the content nodes if loaded partially.
*/
public function getBodyChildren(): DOMNodeList
{
return $this->getBody()->childNodes;
}
/**
* Get the inner HTML content of the body.
* This is usually all the content if loaded partially.
*/
public function getBodyInnerHtml(): string
{
$html = '';
foreach ($this->getBodyChildren() as $child) {
$html .= $this->document->saveHTML($child);
}
return $html;
}
/**
* Get the HTML content of the whole document.
*/
public function getHtml(): string
{
return $this->document->saveHTML();
}
/**
* Get the inner HTML for the given node.
*/
public function getNodeInnerHtml(DOMNode $node): string
{
$html = '';
foreach ($node->childNodes as $childNode) {
$html .= $this->document->saveHTML($childNode);
}
return $html;
}
/**
* Get the outer HTML for the given node.
*/
public function getNodeOuterHtml(DOMNode $node): string
{
return $this->document->saveHTML($node);
}
}

View File

@ -2,14 +2,12 @@
namespace BookStack\Util; namespace BookStack\Util;
use DOMDocument;
use DOMElement; use DOMElement;
use DOMNodeList; use DOMNodeList;
use DOMXPath;
class HtmlNonceApplicator class HtmlNonceApplicator
{ {
protected static $placeholder = '[CSP_NONCE_VALUE]'; protected static string $placeholder = '[CSP_NONCE_VALUE]';
/** /**
* Prepare the given HTML content with nonce attributes including a placeholder * Prepare the given HTML content with nonce attributes including a placeholder
@ -21,28 +19,20 @@ class HtmlNonceApplicator
return $html; return $html;
} }
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>'; // LIBXML_SCHEMA_CREATE was found to be required here otherwise
libxml_use_internal_errors(true); // the PHP DOMDocument handling will attempt to format/close
$doc = new DOMDocument(); // HTML tags within scripts and therefore change JS content.
$doc->loadHTML($html, LIBXML_SCHEMA_CREATE); $doc = new HtmlDocument($html, LIBXML_SCHEMA_CREATE);
$xPath = new DOMXPath($doc);
// Apply to scripts // Apply to scripts
$scriptElems = $xPath->query('//script'); $scriptElems = $doc->queryXPath('//script');
static::addNonceAttributes($scriptElems, static::$placeholder); static::addNonceAttributes($scriptElems, static::$placeholder);
// Apply to styles // Apply to styles
$styleElems = $xPath->query('//style'); $styleElems = $doc->queryXPath('//style');
static::addNonceAttributes($styleElems, static::$placeholder); static::addNonceAttributes($styleElems, static::$placeholder);
$returnHtml = ''; return $doc->getBodyInnerHtml();
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
foreach ($topElems as $child) {
$content = $doc->saveHTML($child);
$returnHtml .= $content;
}
return $returnHtml;
} }
/** /**