From db7b11fe933b6f9b9962aec18f45a74b56458db5 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Tue, 14 Nov 2023 15:46:32 +0000 Subject: [PATCH] HTML: Aligned and standardised DOMDocument usage Adds a thin wrapper for DOMDocument to simplify and align usage within all areas of BookStack. Also means we move away from old depreacted mb_convert_encoding usage. Closes #4638 --- app/Entities/Tools/ExportFormatter.php | 38 +++---- app/Entities/Tools/PageContent.php | 74 +++--------- app/References/CrossLinkParser.php | 12 +- app/References/ReferenceUpdater.php | 31 ++--- app/Search/SearchIndex.php | 12 +- app/Util/HtmlContentFilter.php | 32 ++---- app/Util/HtmlDocument.php | 152 +++++++++++++++++++++++++ app/Util/HtmlNonceApplicator.php | 26 ++--- 8 files changed, 214 insertions(+), 163 deletions(-) create mode 100644 app/Util/HtmlDocument.php diff --git a/app/Entities/Tools/ExportFormatter.php b/app/Entities/Tools/ExportFormatter.php index 9a8c687b0..beddfe8e6 100644 --- a/app/Entities/Tools/ExportFormatter.php +++ b/app/Entities/Tools/ExportFormatter.php @@ -8,9 +8,8 @@ use BookStack\Entities\Models\Page; use BookStack\Entities\Tools\Markdown\HtmlToMarkdown; use BookStack\Uploads\ImageService; use BookStack\Util\CspService; -use DOMDocument; +use BookStack\Util\HtmlDocument; use DOMElement; -use DOMXPath; use Exception; use Throwable; @@ -151,45 +150,36 @@ class ExportFormatter protected function htmlToPdf(string $html): string { $html = $this->containHtml($html); - $html = $this->replaceIframesWithLinks($html); - $html = $this->openDetailElements($html); + $doc = new HtmlDocument(); + $doc->loadCompleteHtml($html); - return $this->pdfGenerator->fromHtml($html); + $this->replaceIframesWithLinks($doc); + $this->openDetailElements($doc); + $cleanedHtml = $doc->getHtml(); + + return $this->pdfGenerator->fromHtml($cleanedHtml); } /** * Within the given HTML content, Open any detail blocks. */ - protected function openDetailElements(string $html): string + protected function openDetailElements(HtmlDocument $doc): void { - libxml_use_internal_errors(true); - - $doc = new DOMDocument(); - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); - $xPath = new DOMXPath($doc); - - $details = $xPath->query('//details'); + $details = $doc->queryXPath('//details'); /** @var DOMElement $detail */ foreach ($details as $detail) { $detail->setAttribute('open', 'open'); } - - return $doc->saveHTML(); } /** - * Within the given HTML content, replace any iframe elements + * Within the given HTML document, replace any iframe elements * with anchor links within paragraph blocks. */ - protected function replaceIframesWithLinks(string $html): string + protected function replaceIframesWithLinks(HtmlDocument $doc): void { - libxml_use_internal_errors(true); + $iframes = $doc->queryXPath('//iframe'); - $doc = new DOMDocument(); - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); - $xPath = new DOMXPath($doc); - - $iframes = $xPath->query('//iframe'); /** @var DOMElement $iframe */ foreach ($iframes as $iframe) { $link = $iframe->getAttribute('src'); @@ -203,8 +193,6 @@ class ExportFormatter $paragraph->appendChild($anchor); $iframe->parentNode->replaceChild($paragraph, $iframe); } - - return $doc->saveHTML(); } /** diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php index 949816eff..3e75bd5bb 100644 --- a/app/Entities/Tools/PageContent.php +++ b/app/Entities/Tools/PageContent.php @@ -10,11 +10,10 @@ use BookStack\Theming\ThemeEvents; use BookStack\Uploads\ImageRepo; use BookStack\Uploads\ImageService; use BookStack\Util\HtmlContentFilter; -use DOMDocument; +use BookStack\Util\HtmlDocument; use DOMElement; use DOMNode; use DOMNodeList; -use DOMXPath; use Illuminate\Support\Str; class PageContent @@ -56,27 +55,17 @@ class PageContent return $htmlText; } - $doc = $this->loadDocumentFromHtml($htmlText); - $container = $doc->documentElement; - $body = $container->childNodes->item(0); - $childNodes = $body->childNodes; - $xPath = new DOMXPath($doc); + $doc = new HtmlDocument($htmlText); // Get all img elements with image data blobs - $imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]'); + $imageNodes = $doc->queryXPath('//img[contains(@src, \'data:image\')]'); foreach ($imageNodes as $imageNode) { $imageSrc = $imageNode->getAttribute('src'); $newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc); $imageNode->setAttribute('src', $newUrl); } - // Generate inner html as a string - $html = ''; - foreach ($childNodes as $childNode) { - $html .= $doc->saveHTML($childNode); - } - - return $html; + return $doc->getBodyInnerHtml(); } /** @@ -172,27 +161,18 @@ class PageContent return $htmlText; } - $doc = $this->loadDocumentFromHtml($htmlText); - $container = $doc->documentElement; - $body = $container->childNodes->item(0); - $childNodes = $body->childNodes; - $xPath = new DOMXPath($doc); + $doc = new HtmlDocument($htmlText); // Map to hold used ID references $idMap = []; // Map to hold changing ID references $changeMap = []; - $this->updateIdsRecursively($body, 0, $idMap, $changeMap); - $this->updateLinks($xPath, $changeMap); + $this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap); + $this->updateLinks($doc, $changeMap); - // Generate inner html as a string - $html = ''; - foreach ($childNodes as $childNode) { - $html .= $doc->saveHTML($childNode); - } - - // Perform required string-level tweaks + // Generate inner html as a string & perform required string-level tweaks + $html = $doc->getBodyInnerHtml(); $html = str_replace(' ', ' ', $html); return $html; @@ -225,13 +205,13 @@ class PageContent * Update the all links in the given xpath to apply requires changes within the * given $changeMap array. */ - protected function updateLinks(DOMXPath $xpath, array $changeMap): void + protected function updateLinks(HtmlDocument $doc, array $changeMap): void { if (empty($changeMap)) { return; } - $links = $xpath->query('//body//*//*[@href]'); + $links = $doc->queryXPath('//body//*//*[@href]'); /** @var DOMElement $domElem */ foreach ($links as $domElem) { $href = ltrim($domElem->getAttribute('href'), '#'); @@ -321,11 +301,10 @@ class PageContent return []; } - $doc = $this->loadDocumentFromHtml($htmlContent); - $xPath = new DOMXPath($doc); - $headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6'); + $doc = new HtmlDocument($htmlContent); + $headers = $doc->queryXPath('//h1|//h2|//h3|//h4|//h5|//h6'); - return $headers ? $this->headerNodesToLevelList($headers) : []; + return $headers->count() === 0 ? [] : $this->headerNodesToLevelList($headers); } /** @@ -420,7 +399,7 @@ class PageContent protected function fetchSectionOfPage(Page $page, string $sectionId): string { $topLevelTags = ['table', 'ul', 'ol', 'pre']; - $doc = $this->loadDocumentFromHtml($page->html); + $doc = new HtmlDocument($page->html); // Search included content for the id given and blank out if not exists. $matchingElem = $doc->getElementById($sectionId); @@ -430,30 +409,11 @@ class PageContent // Otherwise replace the content with the found content // Checks if the top-level wrapper should be included by matching on tag types - $innerContent = ''; $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags); if ($isTopLevel) { - $innerContent .= $doc->saveHTML($matchingElem); - } else { - foreach ($matchingElem->childNodes as $childNode) { - $innerContent .= $doc->saveHTML($childNode); - } + return $doc->getNodeOuterHtml($matchingElem); } - libxml_clear_errors(); - return $innerContent; - } - - /** - * Create and load a DOMDocument from the given html content. - */ - protected function loadDocumentFromHtml(string $html): DOMDocument - { - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $html = '' . $html . ''; - $doc->loadHTML($html); - - return $doc; + return $doc->getNodeInnerHtml($matchingElem); } } diff --git a/app/References/CrossLinkParser.php b/app/References/CrossLinkParser.php index 88ca5d6a7..b9c3ad205 100644 --- a/app/References/CrossLinkParser.php +++ b/app/References/CrossLinkParser.php @@ -9,8 +9,7 @@ use BookStack\References\ModelResolvers\ChapterLinkModelResolver; use BookStack\References\ModelResolvers\CrossLinkModelResolver; use BookStack\References\ModelResolvers\PageLinkModelResolver; use BookStack\References\ModelResolvers\PagePermalinkModelResolver; -use DOMDocument; -use DOMXPath; +use BookStack\Util\HtmlDocument; class CrossLinkParser { @@ -54,13 +53,8 @@ class CrossLinkParser { $links = []; - $html = '' . $html . ''; - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($html); - - $xPath = new DOMXPath($doc); - $anchors = $xPath->query('//a[@href]'); + $doc = new HtmlDocument($html); + $anchors = $doc->queryXPath('//a[@href]'); /** @var \DOMElement $anchor */ foreach ($anchors as $anchor) { diff --git a/app/References/ReferenceUpdater.php b/app/References/ReferenceUpdater.php index 2f7b70a87..82505e8ab 100644 --- a/app/References/ReferenceUpdater.php +++ b/app/References/ReferenceUpdater.php @@ -6,18 +6,14 @@ use BookStack\Entities\Models\Book; use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Page; use BookStack\Entities\Repos\RevisionRepo; -use DOMDocument; -use DOMXPath; +use BookStack\Util\HtmlDocument; class ReferenceUpdater { - protected ReferenceFetcher $referenceFetcher; - protected RevisionRepo $revisionRepo; - - public function __construct(ReferenceFetcher $referenceFetcher, RevisionRepo $revisionRepo) - { - $this->referenceFetcher = $referenceFetcher; - $this->revisionRepo = $revisionRepo; + public function __construct( + protected ReferenceFetcher $referenceFetcher, + protected RevisionRepo $revisionRepo + ) { } public function updateEntityPageReferences(Entity $entity, string $oldLink) @@ -96,13 +92,8 @@ class ReferenceUpdater return $html; } - $html = '' . $html . ''; - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); - - $xPath = new DOMXPath($doc); - $anchors = $xPath->query('//a[@href]'); + $doc = new HtmlDocument($html); + $anchors = $doc->queryXPath('//a[@href]'); /** @var \DOMElement $anchor */ foreach ($anchors as $anchor) { @@ -111,12 +102,6 @@ class ReferenceUpdater $anchor->setAttribute('href', $updated); } - $html = ''; - $topElems = $doc->documentElement->childNodes->item(0)->childNodes; - foreach ($topElems as $child) { - $html .= $doc->saveHTML($child); - } - - return $html; + return $doc->getBodyInnerHtml(); } } diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php index e882e987b..d9fc4e7aa 100644 --- a/app/Search/SearchIndex.php +++ b/app/Search/SearchIndex.php @@ -6,7 +6,7 @@ use BookStack\Activity\Models\Tag; use BookStack\Entities\EntityProvider; use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Page; -use DOMDocument; +use BookStack\Util\HtmlDocument; use DOMNode; use Illuminate\Database\Eloquent\Builder; use Illuminate\Support\Collection; @@ -138,16 +138,11 @@ class SearchIndex 'h6' => 1.5, ]; - $html = '' . $html . ''; $html = str_ireplace(['
', '
', '
'], "\n", $html); + $doc = new HtmlDocument($html); - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($html); - - $topElems = $doc->documentElement->childNodes->item(0)->childNodes; /** @var DOMNode $child */ - foreach ($topElems as $child) { + foreach ($doc->getBodyChildren() as $child) { $nodeName = $child->nodeName; $termCounts = $this->textToTermCountMap(trim($child->textContent)); foreach ($termCounts as $term => $count) { @@ -168,7 +163,6 @@ class SearchIndex */ protected function generateTermScoreMapFromTags(array $tags): array { - $scoreMap = []; $names = []; $values = []; diff --git a/app/Util/HtmlContentFilter.php b/app/Util/HtmlContentFilter.php index e51f512bd..2dbb34086 100644 --- a/app/Util/HtmlContentFilter.php +++ b/app/Util/HtmlContentFilter.php @@ -3,10 +3,8 @@ namespace BookStack\Util; use DOMAttr; -use DOMDocument; use DOMElement; use DOMNodeList; -use DOMXPath; class HtmlContentFilter { @@ -19,54 +17,44 @@ class HtmlContentFilter return $html; } - $html = '' . $html . ''; - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($html); - $xPath = new DOMXPath($doc); + $doc = new HtmlDocument($html); // Remove standard script tags - $scriptElems = $xPath->query('//script'); + $scriptElems = $doc->queryXPath('//script'); static::removeNodes($scriptElems); // Remove clickable links to JavaScript URI - $badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']'); + $badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']'); static::removeNodes($badLinks); // Remove forms with calls to JavaScript URI - $badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'); + $badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'); static::removeNodes($badForms); // Remove meta tag to prevent external redirects - $metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']'); + $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']'); static::removeNodes($metaTags); // Remove data or JavaScript iFrames - $badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]'); + $badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]'); static::removeNodes($badIframes); // Remove attributes, within svg children, hiding JavaScript or data uris. // A bunch of svg element and attribute combinations expose xss possibilities. // For example, SVG animate tag can exploit javascript in values. - $badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']'); + $badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']'); static::removeAttributes($badValuesAttrs); // Remove elements with a xlink:href attribute // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here. - $xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]'); + $xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]'); static::removeAttributes($xlinkHrefAttributes); // Remove 'on*' attributes - $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]'); + $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]'); static::removeAttributes($onAttributes); - $html = ''; - $topElems = $doc->documentElement->childNodes->item(0)->childNodes; - foreach ($topElems as $child) { - $html .= $doc->saveHTML($child); - } - - return $html; + return $doc->getBodyInnerHtml(); } /** diff --git a/app/Util/HtmlDocument.php b/app/Util/HtmlDocument.php new file mode 100644 index 000000000..f0e12d528 --- /dev/null +++ b/app/Util/HtmlDocument.php @@ -0,0 +1,152 @@ +document = new DOMDocument(); + $this->loadOptions = $loadOptions; + + if ($partialHtml) { + $this->loadPartialHtml($partialHtml); + } + } + + /** + * Load some HTML content that's part of a document (e.g. body content) + * into the current document. + */ + public function loadPartialHtml(string $html): void + { + $html = '' . $html . ''; + $this->document->loadHTML($html, $this->loadOptions); + $this->xpath = null; + } + + /** + * Load a complete page of HTML content into the document. + */ + public function loadCompleteHtml(string $html): void + { + $html = '' . $html; + $this->document->loadHTML($html, $this->loadOptions); + $this->xpath = null; + } + + /** + * Start an XPath query on the current document. + */ + public function queryXPath(string $expression): DOMNodeList + { + if (is_null($this->xpath)) { + $this->xpath = new DOMXPath($this->document); + } + + $result = $this->xpath->query($expression); + if ($result === false) { + throw new \InvalidArgumentException("XPath query for expression [$expression] failed to execute"); + } + + return $result; + } + + /** + * Create a new DOMElement instance within the document. + */ + public function createElement(string $localName, string $value = ''): DOMElement + { + $element = $this->document->createElement($localName, $value); + + if ($element === false) { + throw new \InvalidArgumentException("Failed to create element of name [$localName] and value [$value]"); + } + + return $element; + } + + /** + * Get an element within the document of the given ID. + */ + public function getElementById(string $elementId): ?DOMElement + { + return $this->document->getElementById($elementId); + } + + /** + * Get the DOMNode that represents the HTML body. + */ + public function getBody(): DOMNode + { + return $this->document->getElementsByTagName('body')[0]; + } + + /** + * Get the nodes that are a direct child of the body. + * This is usually all the content nodes if loaded partially. + */ + public function getBodyChildren(): DOMNodeList + { + return $this->getBody()->childNodes; + } + + /** + * Get the inner HTML content of the body. + * This is usually all the content if loaded partially. + */ + public function getBodyInnerHtml(): string + { + $html = ''; + foreach ($this->getBodyChildren() as $child) { + $html .= $this->document->saveHTML($child); + } + + return $html; + } + + /** + * Get the HTML content of the whole document. + */ + public function getHtml(): string + { + return $this->document->saveHTML(); + } + + /** + * Get the inner HTML for the given node. + */ + public function getNodeInnerHtml(DOMNode $node): string + { + $html = ''; + + foreach ($node->childNodes as $childNode) { + $html .= $this->document->saveHTML($childNode); + } + + return $html; + } + + /** + * Get the outer HTML for the given node. + */ + public function getNodeOuterHtml(DOMNode $node): string + { + return $this->document->saveHTML($node); + } +} diff --git a/app/Util/HtmlNonceApplicator.php b/app/Util/HtmlNonceApplicator.php index 07298577c..3a798e848 100644 --- a/app/Util/HtmlNonceApplicator.php +++ b/app/Util/HtmlNonceApplicator.php @@ -2,14 +2,12 @@ namespace BookStack\Util; -use DOMDocument; use DOMElement; use DOMNodeList; -use DOMXPath; class HtmlNonceApplicator { - protected static $placeholder = '[CSP_NONCE_VALUE]'; + protected static string $placeholder = '[CSP_NONCE_VALUE]'; /** * Prepare the given HTML content with nonce attributes including a placeholder @@ -21,28 +19,20 @@ class HtmlNonceApplicator return $html; } - $html = '' . $html . ''; - libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($html, LIBXML_SCHEMA_CREATE); - $xPath = new DOMXPath($doc); + // LIBXML_SCHEMA_CREATE was found to be required here otherwise + // the PHP DOMDocument handling will attempt to format/close + // HTML tags within scripts and therefore change JS content. + $doc = new HtmlDocument($html, LIBXML_SCHEMA_CREATE); // Apply to scripts - $scriptElems = $xPath->query('//script'); + $scriptElems = $doc->queryXPath('//script'); static::addNonceAttributes($scriptElems, static::$placeholder); // Apply to styles - $styleElems = $xPath->query('//style'); + $styleElems = $doc->queryXPath('//style'); static::addNonceAttributes($styleElems, static::$placeholder); - $returnHtml = ''; - $topElems = $doc->documentElement->childNodes->item(0)->childNodes; - foreach ($topElems as $child) { - $content = $doc->saveHTML($child); - $returnHtml .= $content; - } - - return $returnHtml; + return $doc->getBodyInnerHtml(); } /**