2021-06-26 23:23:15 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								<?php
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								namespace BookStack\Util;
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use DOMAttr;
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-06 08:32:01 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use DOMElement;
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								use DOMNodeList;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class HtmlContentFilter
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								{
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-28 03:54:47 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								     * Remove all the script elements from the given HTML document.
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-28 03:54:47 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    public static function removeScriptsFromDocument(HtmlDocument $doc)
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove standard script tags
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        $scriptElems = $doc->queryXPath('//script');
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeNodes($scriptElems);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove clickable links to JavaScript URI
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeNodes($badLinks);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove forms with calls to JavaScript URI
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeNodes($badForms);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove meta tag to prevent external redirects
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeNodes($metaTags);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove data or JavaScript iFrames
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeNodes($badIframes);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-07 00:01:56 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        // Remove attributes, within svg children, hiding JavaScript or data uris.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // A bunch of svg element and attribute combinations expose xss possibilities.
							 | 
						
					
						
							
								
									
										
										
										
											2022-08-11 17:26:33 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        // For example, SVG animate tag can exploit javascript in values.
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
							 | 
						
					
						
							
								
									
										
										
										
											2022-09-07 00:01:56 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeAttributes($badValuesAttrs);
							 | 
						
					
						
							
								
									
										
										
										
											2022-08-11 17:26:33 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove elements with a xlink:href attribute
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeAttributes($xlinkHrefAttributes);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        // Remove 'on*' attributes
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeAttributes($onAttributes);
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-28 03:54:47 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * Remove scripts from the given HTML string.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    public static function removeScriptsFromHtmlString(string $html): string
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if (empty($html)) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return $html;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $doc = new HtmlDocument($html);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        static::removeScriptsFromDocument($doc);
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return $doc->getBodyInnerHtml();
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * Create a xpath contains statement with a translation automatically built within
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * to affectively search in a cases-insensitive manner.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    protected static function xpathContains(string $property, string $value): string
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $value = strtolower($value);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $upperVal = strtoupper($value);
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-07 05:19:06 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return 'contains(translate(' . $property . ', \'' . $upperVal . '\', \'' . $value . '\'), \'' . $value . '\')';
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * Remove all the given DOMNodes.
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-13 19:53:04 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    protected static function removeNodes(DOMNodeList $nodes): void
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        foreach ($nodes as $node) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $node->parentNode->removeChild($node);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * Remove all the given attribute nodes.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    protected static function removeAttributes(DOMNodeList $attrs): void
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        /** @var DOMAttr $attr */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        foreach ($attrs as $attr) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $attrName = $attr->nodeName;
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-06 08:32:01 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            /** @var DOMElement $parentNode */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $parentNode = $attr->parentNode;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $parentNode->removeAttribute($attrName);
							 | 
						
					
						
							
								
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-13 19:53:04 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								}
							 |