HTML: Aligned and standardised DOMDocument usage
Adds a thin wrapper for DOMDocument to simplify and align usage within all areas of BookStack. This also means we move away from the old, deprecated mb_convert_encoding usage. Closes #4638
This commit is contained in:
parent
3a6f50e668
commit
db7b11fe93
|
@ -8,9 +8,8 @@ use BookStack\Entities\Models\Page;
|
||||||
use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
|
use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
|
||||||
use BookStack\Uploads\ImageService;
|
use BookStack\Uploads\ImageService;
|
||||||
use BookStack\Util\CspService;
|
use BookStack\Util\CspService;
|
||||||
use DOMDocument;
|
use BookStack\Util\HtmlDocument;
|
||||||
use DOMElement;
|
use DOMElement;
|
||||||
use DOMXPath;
|
|
||||||
use Exception;
|
use Exception;
|
||||||
use Throwable;
|
use Throwable;
|
||||||
|
|
||||||
|
@ -151,45 +150,36 @@ class ExportFormatter
|
||||||
protected function htmlToPdf(string $html): string
|
protected function htmlToPdf(string $html): string
|
||||||
{
|
{
|
||||||
$html = $this->containHtml($html);
|
$html = $this->containHtml($html);
|
||||||
$html = $this->replaceIframesWithLinks($html);
|
$doc = new HtmlDocument();
|
||||||
$html = $this->openDetailElements($html);
|
$doc->loadCompleteHtml($html);
|
||||||
|
|
||||||
return $this->pdfGenerator->fromHtml($html);
|
$this->replaceIframesWithLinks($doc);
|
||||||
|
$this->openDetailElements($doc);
|
||||||
|
$cleanedHtml = $doc->getHtml();
|
||||||
|
|
||||||
|
return $this->pdfGenerator->fromHtml($cleanedHtml);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Within the given HTML content, Open any detail blocks.
|
* Within the given HTML content, Open any detail blocks.
|
||||||
*/
|
*/
|
||||||
protected function openDetailElements(string $html): string
|
protected function openDetailElements(HtmlDocument $doc): void
|
||||||
{
|
{
|
||||||
libxml_use_internal_errors(true);
|
$details = $doc->queryXPath('//details');
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
$details = $xPath->query('//details');
|
|
||||||
/** @var DOMElement $detail */
|
/** @var DOMElement $detail */
|
||||||
foreach ($details as $detail) {
|
foreach ($details as $detail) {
|
||||||
$detail->setAttribute('open', 'open');
|
$detail->setAttribute('open', 'open');
|
||||||
}
|
}
|
||||||
|
|
||||||
return $doc->saveHTML();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Within the given HTML content, replace any iframe elements
|
* Within the given HTML document, replace any iframe elements
|
||||||
* with anchor links within paragraph blocks.
|
* with anchor links within paragraph blocks.
|
||||||
*/
|
*/
|
||||||
protected function replaceIframesWithLinks(string $html): string
|
protected function replaceIframesWithLinks(HtmlDocument $doc): void
|
||||||
{
|
{
|
||||||
libxml_use_internal_errors(true);
|
$iframes = $doc->queryXPath('//iframe');
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
$iframes = $xPath->query('//iframe');
|
|
||||||
/** @var DOMElement $iframe */
|
/** @var DOMElement $iframe */
|
||||||
foreach ($iframes as $iframe) {
|
foreach ($iframes as $iframe) {
|
||||||
$link = $iframe->getAttribute('src');
|
$link = $iframe->getAttribute('src');
|
||||||
|
@ -203,8 +193,6 @@ class ExportFormatter
|
||||||
$paragraph->appendChild($anchor);
|
$paragraph->appendChild($anchor);
|
||||||
$iframe->parentNode->replaceChild($paragraph, $iframe);
|
$iframe->parentNode->replaceChild($paragraph, $iframe);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $doc->saveHTML();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -10,11 +10,10 @@ use BookStack\Theming\ThemeEvents;
|
||||||
use BookStack\Uploads\ImageRepo;
|
use BookStack\Uploads\ImageRepo;
|
||||||
use BookStack\Uploads\ImageService;
|
use BookStack\Uploads\ImageService;
|
||||||
use BookStack\Util\HtmlContentFilter;
|
use BookStack\Util\HtmlContentFilter;
|
||||||
use DOMDocument;
|
use BookStack\Util\HtmlDocument;
|
||||||
use DOMElement;
|
use DOMElement;
|
||||||
use DOMNode;
|
use DOMNode;
|
||||||
use DOMNodeList;
|
use DOMNodeList;
|
||||||
use DOMXPath;
|
|
||||||
use Illuminate\Support\Str;
|
use Illuminate\Support\Str;
|
||||||
|
|
||||||
class PageContent
|
class PageContent
|
||||||
|
@ -56,27 +55,17 @@ class PageContent
|
||||||
return $htmlText;
|
return $htmlText;
|
||||||
}
|
}
|
||||||
|
|
||||||
$doc = $this->loadDocumentFromHtml($htmlText);
|
$doc = new HtmlDocument($htmlText);
|
||||||
$container = $doc->documentElement;
|
|
||||||
$body = $container->childNodes->item(0);
|
|
||||||
$childNodes = $body->childNodes;
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
// Get all img elements with image data blobs
|
// Get all img elements with image data blobs
|
||||||
$imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]');
|
$imageNodes = $doc->queryXPath('//img[contains(@src, \'data:image\')]');
|
||||||
foreach ($imageNodes as $imageNode) {
|
foreach ($imageNodes as $imageNode) {
|
||||||
$imageSrc = $imageNode->getAttribute('src');
|
$imageSrc = $imageNode->getAttribute('src');
|
||||||
$newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc);
|
$newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc);
|
||||||
$imageNode->setAttribute('src', $newUrl);
|
$imageNode->setAttribute('src', $newUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate inner html as a string
|
return $doc->getBodyInnerHtml();
|
||||||
$html = '';
|
|
||||||
foreach ($childNodes as $childNode) {
|
|
||||||
$html .= $doc->saveHTML($childNode);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $html;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -172,27 +161,18 @@ class PageContent
|
||||||
return $htmlText;
|
return $htmlText;
|
||||||
}
|
}
|
||||||
|
|
||||||
$doc = $this->loadDocumentFromHtml($htmlText);
|
$doc = new HtmlDocument($htmlText);
|
||||||
$container = $doc->documentElement;
|
|
||||||
$body = $container->childNodes->item(0);
|
|
||||||
$childNodes = $body->childNodes;
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
// Map to hold used ID references
|
// Map to hold used ID references
|
||||||
$idMap = [];
|
$idMap = [];
|
||||||
// Map to hold changing ID references
|
// Map to hold changing ID references
|
||||||
$changeMap = [];
|
$changeMap = [];
|
||||||
|
|
||||||
$this->updateIdsRecursively($body, 0, $idMap, $changeMap);
|
$this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
|
||||||
$this->updateLinks($xPath, $changeMap);
|
$this->updateLinks($doc, $changeMap);
|
||||||
|
|
||||||
// Generate inner html as a string
|
// Generate inner html as a string & perform required string-level tweaks
|
||||||
$html = '';
|
$html = $doc->getBodyInnerHtml();
|
||||||
foreach ($childNodes as $childNode) {
|
|
||||||
$html .= $doc->saveHTML($childNode);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform required string-level tweaks
|
|
||||||
$html = str_replace(' ', ' ', $html);
|
$html = str_replace(' ', ' ', $html);
|
||||||
|
|
||||||
return $html;
|
return $html;
|
||||||
|
@ -225,13 +205,13 @@ class PageContent
|
||||||
* Update the all links in the given xpath to apply requires changes within the
|
* Update the all links in the given xpath to apply requires changes within the
|
||||||
* given $changeMap array.
|
* given $changeMap array.
|
||||||
*/
|
*/
|
||||||
protected function updateLinks(DOMXPath $xpath, array $changeMap): void
|
protected function updateLinks(HtmlDocument $doc, array $changeMap): void
|
||||||
{
|
{
|
||||||
if (empty($changeMap)) {
|
if (empty($changeMap)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$links = $xpath->query('//body//*//*[@href]');
|
$links = $doc->queryXPath('//body//*//*[@href]');
|
||||||
/** @var DOMElement $domElem */
|
/** @var DOMElement $domElem */
|
||||||
foreach ($links as $domElem) {
|
foreach ($links as $domElem) {
|
||||||
$href = ltrim($domElem->getAttribute('href'), '#');
|
$href = ltrim($domElem->getAttribute('href'), '#');
|
||||||
|
@ -321,11 +301,10 @@ class PageContent
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$doc = $this->loadDocumentFromHtml($htmlContent);
|
$doc = new HtmlDocument($htmlContent);
|
||||||
$xPath = new DOMXPath($doc);
|
$headers = $doc->queryXPath('//h1|//h2|//h3|//h4|//h5|//h6');
|
||||||
$headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');
|
|
||||||
|
|
||||||
return $headers ? $this->headerNodesToLevelList($headers) : [];
|
return $headers->count() === 0 ? [] : $this->headerNodesToLevelList($headers);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -420,7 +399,7 @@ class PageContent
|
||||||
protected function fetchSectionOfPage(Page $page, string $sectionId): string
|
protected function fetchSectionOfPage(Page $page, string $sectionId): string
|
||||||
{
|
{
|
||||||
$topLevelTags = ['table', 'ul', 'ol', 'pre'];
|
$topLevelTags = ['table', 'ul', 'ol', 'pre'];
|
||||||
$doc = $this->loadDocumentFromHtml($page->html);
|
$doc = new HtmlDocument($page->html);
|
||||||
|
|
||||||
// Search included content for the id given and blank out if not exists.
|
// Search included content for the id given and blank out if not exists.
|
||||||
$matchingElem = $doc->getElementById($sectionId);
|
$matchingElem = $doc->getElementById($sectionId);
|
||||||
|
@ -430,30 +409,11 @@ class PageContent
|
||||||
|
|
||||||
// Otherwise replace the content with the found content
|
// Otherwise replace the content with the found content
|
||||||
// Checks if the top-level wrapper should be included by matching on tag types
|
// Checks if the top-level wrapper should be included by matching on tag types
|
||||||
$innerContent = '';
|
|
||||||
$isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);
|
$isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);
|
||||||
if ($isTopLevel) {
|
if ($isTopLevel) {
|
||||||
$innerContent .= $doc->saveHTML($matchingElem);
|
return $doc->getNodeOuterHtml($matchingElem);
|
||||||
} else {
|
|
||||||
foreach ($matchingElem->childNodes as $childNode) {
|
|
||||||
$innerContent .= $doc->saveHTML($childNode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
libxml_clear_errors();
|
|
||||||
|
|
||||||
return $innerContent;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
return $doc->getNodeInnerHtml($matchingElem);
|
||||||
* Create and load a DOMDocument from the given html content.
|
|
||||||
*/
|
|
||||||
protected function loadDocumentFromHtml(string $html): DOMDocument
|
|
||||||
{
|
|
||||||
libxml_use_internal_errors(true);
|
|
||||||
$doc = new DOMDocument();
|
|
||||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
|
||||||
$doc->loadHTML($html);
|
|
||||||
|
|
||||||
return $doc;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,7 @@ use BookStack\References\ModelResolvers\ChapterLinkModelResolver;
|
||||||
use BookStack\References\ModelResolvers\CrossLinkModelResolver;
|
use BookStack\References\ModelResolvers\CrossLinkModelResolver;
|
||||||
use BookStack\References\ModelResolvers\PageLinkModelResolver;
|
use BookStack\References\ModelResolvers\PageLinkModelResolver;
|
||||||
use BookStack\References\ModelResolvers\PagePermalinkModelResolver;
|
use BookStack\References\ModelResolvers\PagePermalinkModelResolver;
|
||||||
use DOMDocument;
|
use BookStack\Util\HtmlDocument;
|
||||||
use DOMXPath;
|
|
||||||
|
|
||||||
class CrossLinkParser
|
class CrossLinkParser
|
||||||
{
|
{
|
||||||
|
@ -54,13 +53,8 @@ class CrossLinkParser
|
||||||
{
|
{
|
||||||
$links = [];
|
$links = [];
|
||||||
|
|
||||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
$doc = new HtmlDocument($html);
|
||||||
libxml_use_internal_errors(true);
|
$anchors = $doc->queryXPath('//a[@href]');
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML($html);
|
|
||||||
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
$anchors = $xPath->query('//a[@href]');
|
|
||||||
|
|
||||||
/** @var \DOMElement $anchor */
|
/** @var \DOMElement $anchor */
|
||||||
foreach ($anchors as $anchor) {
|
foreach ($anchors as $anchor) {
|
||||||
|
|
|
@ -6,18 +6,14 @@ use BookStack\Entities\Models\Book;
|
||||||
use BookStack\Entities\Models\Entity;
|
use BookStack\Entities\Models\Entity;
|
||||||
use BookStack\Entities\Models\Page;
|
use BookStack\Entities\Models\Page;
|
||||||
use BookStack\Entities\Repos\RevisionRepo;
|
use BookStack\Entities\Repos\RevisionRepo;
|
||||||
use DOMDocument;
|
use BookStack\Util\HtmlDocument;
|
||||||
use DOMXPath;
|
|
||||||
|
|
||||||
class ReferenceUpdater
|
class ReferenceUpdater
|
||||||
{
|
{
|
||||||
protected ReferenceFetcher $referenceFetcher;
|
public function __construct(
|
||||||
protected RevisionRepo $revisionRepo;
|
protected ReferenceFetcher $referenceFetcher,
|
||||||
|
protected RevisionRepo $revisionRepo
|
||||||
public function __construct(ReferenceFetcher $referenceFetcher, RevisionRepo $revisionRepo)
|
) {
|
||||||
{
|
|
||||||
$this->referenceFetcher = $referenceFetcher;
|
|
||||||
$this->revisionRepo = $revisionRepo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public function updateEntityPageReferences(Entity $entity, string $oldLink)
|
public function updateEntityPageReferences(Entity $entity, string $oldLink)
|
||||||
|
@ -96,13 +92,8 @@ class ReferenceUpdater
|
||||||
return $html;
|
return $html;
|
||||||
}
|
}
|
||||||
|
|
||||||
$html = '<body>' . $html . '</body>';
|
$doc = new HtmlDocument($html);
|
||||||
libxml_use_internal_errors(true);
|
$anchors = $doc->queryXPath('//a[@href]');
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
|
|
||||||
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
$anchors = $xPath->query('//a[@href]');
|
|
||||||
|
|
||||||
/** @var \DOMElement $anchor */
|
/** @var \DOMElement $anchor */
|
||||||
foreach ($anchors as $anchor) {
|
foreach ($anchors as $anchor) {
|
||||||
|
@ -111,12 +102,6 @@ class ReferenceUpdater
|
||||||
$anchor->setAttribute('href', $updated);
|
$anchor->setAttribute('href', $updated);
|
||||||
}
|
}
|
||||||
|
|
||||||
$html = '';
|
return $doc->getBodyInnerHtml();
|
||||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
|
||||||
foreach ($topElems as $child) {
|
|
||||||
$html .= $doc->saveHTML($child);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $html;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@ use BookStack\Activity\Models\Tag;
|
||||||
use BookStack\Entities\EntityProvider;
|
use BookStack\Entities\EntityProvider;
|
||||||
use BookStack\Entities\Models\Entity;
|
use BookStack\Entities\Models\Entity;
|
||||||
use BookStack\Entities\Models\Page;
|
use BookStack\Entities\Models\Page;
|
||||||
use DOMDocument;
|
use BookStack\Util\HtmlDocument;
|
||||||
use DOMNode;
|
use DOMNode;
|
||||||
use Illuminate\Database\Eloquent\Builder;
|
use Illuminate\Database\Eloquent\Builder;
|
||||||
use Illuminate\Support\Collection;
|
use Illuminate\Support\Collection;
|
||||||
|
@ -138,16 +138,11 @@ class SearchIndex
|
||||||
'h6' => 1.5,
|
'h6' => 1.5,
|
||||||
];
|
];
|
||||||
|
|
||||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
|
||||||
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
|
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
|
||||||
|
$doc = new HtmlDocument($html);
|
||||||
|
|
||||||
libxml_use_internal_errors(true);
|
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML($html);
|
|
||||||
|
|
||||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
|
||||||
/** @var DOMNode $child */
|
/** @var DOMNode $child */
|
||||||
foreach ($topElems as $child) {
|
foreach ($doc->getBodyChildren() as $child) {
|
||||||
$nodeName = $child->nodeName;
|
$nodeName = $child->nodeName;
|
||||||
$termCounts = $this->textToTermCountMap(trim($child->textContent));
|
$termCounts = $this->textToTermCountMap(trim($child->textContent));
|
||||||
foreach ($termCounts as $term => $count) {
|
foreach ($termCounts as $term => $count) {
|
||||||
|
@ -168,7 +163,6 @@ class SearchIndex
|
||||||
*/
|
*/
|
||||||
protected function generateTermScoreMapFromTags(array $tags): array
|
protected function generateTermScoreMapFromTags(array $tags): array
|
||||||
{
|
{
|
||||||
$scoreMap = [];
|
|
||||||
$names = [];
|
$names = [];
|
||||||
$values = [];
|
$values = [];
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,8 @@
|
||||||
namespace BookStack\Util;
|
namespace BookStack\Util;
|
||||||
|
|
||||||
use DOMAttr;
|
use DOMAttr;
|
||||||
use DOMDocument;
|
|
||||||
use DOMElement;
|
use DOMElement;
|
||||||
use DOMNodeList;
|
use DOMNodeList;
|
||||||
use DOMXPath;
|
|
||||||
|
|
||||||
class HtmlContentFilter
|
class HtmlContentFilter
|
||||||
{
|
{
|
||||||
|
@ -19,54 +17,44 @@ class HtmlContentFilter
|
||||||
return $html;
|
return $html;
|
||||||
}
|
}
|
||||||
|
|
||||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
$doc = new HtmlDocument($html);
|
||||||
libxml_use_internal_errors(true);
|
|
||||||
$doc = new DOMDocument();
|
|
||||||
$doc->loadHTML($html);
|
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
// Remove standard script tags
|
// Remove standard script tags
|
||||||
$scriptElems = $xPath->query('//script');
|
$scriptElems = $doc->queryXPath('//script');
|
||||||
static::removeNodes($scriptElems);
|
static::removeNodes($scriptElems);
|
||||||
|
|
||||||
// Remove clickable links to JavaScript URI
|
// Remove clickable links to JavaScript URI
|
||||||
$badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
|
$badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
|
||||||
static::removeNodes($badLinks);
|
static::removeNodes($badLinks);
|
||||||
|
|
||||||
// Remove forms with calls to JavaScript URI
|
// Remove forms with calls to JavaScript URI
|
||||||
$badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
|
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
|
||||||
static::removeNodes($badForms);
|
static::removeNodes($badForms);
|
||||||
|
|
||||||
// Remove meta tag to prevent external redirects
|
// Remove meta tag to prevent external redirects
|
||||||
$metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
|
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
|
||||||
static::removeNodes($metaTags);
|
static::removeNodes($metaTags);
|
||||||
|
|
||||||
// Remove data or JavaScript iFrames
|
// Remove data or JavaScript iFrames
|
||||||
$badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
|
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
|
||||||
static::removeNodes($badIframes);
|
static::removeNodes($badIframes);
|
||||||
|
|
||||||
// Remove attributes, within svg children, hiding JavaScript or data uris.
|
// Remove attributes, within svg children, hiding JavaScript or data uris.
|
||||||
// A bunch of svg element and attribute combinations expose xss possibilities.
|
// A bunch of svg element and attribute combinations expose xss possibilities.
|
||||||
// For example, SVG animate tag can exploit javascript in values.
|
// For example, SVG animate tag can exploit javascript in values.
|
||||||
$badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
|
$badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
|
||||||
static::removeAttributes($badValuesAttrs);
|
static::removeAttributes($badValuesAttrs);
|
||||||
|
|
||||||
// Remove elements with a xlink:href attribute
|
// Remove elements with a xlink:href attribute
|
||||||
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
|
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
|
||||||
$xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
|
$xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
|
||||||
static::removeAttributes($xlinkHrefAttributes);
|
static::removeAttributes($xlinkHrefAttributes);
|
||||||
|
|
||||||
// Remove 'on*' attributes
|
// Remove 'on*' attributes
|
||||||
$onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
|
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
|
||||||
static::removeAttributes($onAttributes);
|
static::removeAttributes($onAttributes);
|
||||||
|
|
||||||
$html = '';
|
return $doc->getBodyInnerHtml();
|
||||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
|
||||||
foreach ($topElems as $child) {
|
|
||||||
$html .= $doc->saveHTML($child);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $html;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,152 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace BookStack\Util;
|
||||||
|
|
||||||
|
use DOMDocument;
|
||||||
|
use DOMElement;
|
||||||
|
use DOMNode;
|
||||||
|
use DOMNodeList;
|
||||||
|
use DOMXPath;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* HtmlDocument is a thin wrapper around DOMDocument built
|
||||||
|
* specifically for loading, querying and generating HTML content.
|
||||||
|
*/
|
||||||
|
class HtmlDocument
|
||||||
|
{
|
||||||
|
protected DOMDocument $document;
|
||||||
|
protected ?DOMXPath $xpath = null;
|
||||||
|
protected int $loadOptions;
|
||||||
|
|
||||||
|
public function __construct(string $partialHtml = '', int $loadOptions = 0)
|
||||||
|
{
|
||||||
|
libxml_use_internal_errors(true);
|
||||||
|
$this->document = new DOMDocument();
|
||||||
|
$this->loadOptions = $loadOptions;
|
||||||
|
|
||||||
|
if ($partialHtml) {
|
||||||
|
$this->loadPartialHtml($partialHtml);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load some HTML content that's part of a document (e.g. body content)
|
||||||
|
* into the current document.
|
||||||
|
*/
|
||||||
|
public function loadPartialHtml(string $html): void
|
||||||
|
{
|
||||||
|
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
||||||
|
$this->document->loadHTML($html, $this->loadOptions);
|
||||||
|
$this->xpath = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load a complete page of HTML content into the document.
|
||||||
|
*/
|
||||||
|
public function loadCompleteHtml(string $html): void
|
||||||
|
{
|
||||||
|
$html = '<?xml encoding="utf-8" ?>' . $html;
|
||||||
|
$this->document->loadHTML($html, $this->loadOptions);
|
||||||
|
$this->xpath = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start an XPath query on the current document.
|
||||||
|
*/
|
||||||
|
public function queryXPath(string $expression): DOMNodeList
|
||||||
|
{
|
||||||
|
if (is_null($this->xpath)) {
|
||||||
|
$this->xpath = new DOMXPath($this->document);
|
||||||
|
}
|
||||||
|
|
||||||
|
$result = $this->xpath->query($expression);
|
||||||
|
if ($result === false) {
|
||||||
|
throw new \InvalidArgumentException("XPath query for expression [$expression] failed to execute");
|
||||||
|
}
|
||||||
|
|
||||||
|
return $result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new DOMElement instance within the document.
|
||||||
|
*/
|
||||||
|
public function createElement(string $localName, string $value = ''): DOMElement
|
||||||
|
{
|
||||||
|
$element = $this->document->createElement($localName, $value);
|
||||||
|
|
||||||
|
if ($element === false) {
|
||||||
|
throw new \InvalidArgumentException("Failed to create element of name [$localName] and value [$value]");
|
||||||
|
}
|
||||||
|
|
||||||
|
return $element;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get an element within the document of the given ID.
|
||||||
|
*/
|
||||||
|
public function getElementById(string $elementId): ?DOMElement
|
||||||
|
{
|
||||||
|
return $this->document->getElementById($elementId);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the DOMNode that represents the HTML body.
|
||||||
|
*/
|
||||||
|
public function getBody(): DOMNode
|
||||||
|
{
|
||||||
|
return $this->document->getElementsByTagName('body')[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the nodes that are a direct child of the body.
|
||||||
|
* This is usually all the content nodes if loaded partially.
|
||||||
|
*/
|
||||||
|
public function getBodyChildren(): DOMNodeList
|
||||||
|
{
|
||||||
|
return $this->getBody()->childNodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the inner HTML content of the body.
|
||||||
|
* This is usually all the content if loaded partially.
|
||||||
|
*/
|
||||||
|
public function getBodyInnerHtml(): string
|
||||||
|
{
|
||||||
|
$html = '';
|
||||||
|
foreach ($this->getBodyChildren() as $child) {
|
||||||
|
$html .= $this->document->saveHTML($child);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $html;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the HTML content of the whole document.
|
||||||
|
*/
|
||||||
|
public function getHtml(): string
|
||||||
|
{
|
||||||
|
return $this->document->saveHTML();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the inner HTML for the given node.
|
||||||
|
*/
|
||||||
|
public function getNodeInnerHtml(DOMNode $node): string
|
||||||
|
{
|
||||||
|
$html = '';
|
||||||
|
|
||||||
|
foreach ($node->childNodes as $childNode) {
|
||||||
|
$html .= $this->document->saveHTML($childNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $html;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the outer HTML for the given node.
|
||||||
|
*/
|
||||||
|
public function getNodeOuterHtml(DOMNode $node): string
|
||||||
|
{
|
||||||
|
return $this->document->saveHTML($node);
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,14 +2,12 @@
|
||||||
|
|
||||||
namespace BookStack\Util;
|
namespace BookStack\Util;
|
||||||
|
|
||||||
use DOMDocument;
|
|
||||||
use DOMElement;
|
use DOMElement;
|
||||||
use DOMNodeList;
|
use DOMNodeList;
|
||||||
use DOMXPath;
|
|
||||||
|
|
||||||
class HtmlNonceApplicator
|
class HtmlNonceApplicator
|
||||||
{
|
{
|
||||||
protected static $placeholder = '[CSP_NONCE_VALUE]';
|
protected static string $placeholder = '[CSP_NONCE_VALUE]';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepare the given HTML content with nonce attributes including a placeholder
|
* Prepare the given HTML content with nonce attributes including a placeholder
|
||||||
|
@ -21,28 +19,20 @@ class HtmlNonceApplicator
|
||||||
return $html;
|
return $html;
|
||||||
}
|
}
|
||||||
|
|
||||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
// LIBXML_SCHEMA_CREATE was found to be required here otherwise
|
||||||
libxml_use_internal_errors(true);
|
// the PHP DOMDocument handling will attempt to format/close
|
||||||
$doc = new DOMDocument();
|
// HTML tags within scripts and therefore change JS content.
|
||||||
$doc->loadHTML($html, LIBXML_SCHEMA_CREATE);
|
$doc = new HtmlDocument($html, LIBXML_SCHEMA_CREATE);
|
||||||
$xPath = new DOMXPath($doc);
|
|
||||||
|
|
||||||
// Apply to scripts
|
// Apply to scripts
|
||||||
$scriptElems = $xPath->query('//script');
|
$scriptElems = $doc->queryXPath('//script');
|
||||||
static::addNonceAttributes($scriptElems, static::$placeholder);
|
static::addNonceAttributes($scriptElems, static::$placeholder);
|
||||||
|
|
||||||
// Apply to styles
|
// Apply to styles
|
||||||
$styleElems = $xPath->query('//style');
|
$styleElems = $doc->queryXPath('//style');
|
||||||
static::addNonceAttributes($styleElems, static::$placeholder);
|
static::addNonceAttributes($styleElems, static::$placeholder);
|
||||||
|
|
||||||
$returnHtml = '';
|
return $doc->getBodyInnerHtml();
|
||||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
|
||||||
foreach ($topElems as $child) {
|
|
||||||
$content = $doc->saveHTML($child);
|
|
||||||
$returnHtml .= $content;
|
|
||||||
}
|
|
||||||
|
|
||||||
return $returnHtml;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue