| 
									
										
										
										
											2021-06-26 23:23:15 +08:00
										 |  |  | <?php | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace BookStack\Util; | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  | use DOMAttr; | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  | use DOMDocument; | 
					
						
							| 
									
										
										
										
											2021-11-06 08:32:01 +08:00
										 |  |  | use DOMElement; | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  | use DOMNodeList; | 
					
						
							|  |  |  | use DOMXPath; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class HtmlContentFilter | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |     /** | 
					
						
							| 
									
										
										
										
											2021-09-04 06:32:42 +08:00
										 |  |  |      * Remove all the script elements from the given HTML. | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |      */ | 
					
						
							|  |  |  |     public static function removeScripts(string $html): string | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         if (empty($html)) { | 
					
						
							|  |  |  |             return $html; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-13 19:53:04 +08:00
										 |  |  |         $html = '<body>' . $html . '</body>'; | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         libxml_use_internal_errors(true); | 
					
						
							|  |  |  |         $doc = new DOMDocument(); | 
					
						
							|  |  |  |         $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); | 
					
						
							|  |  |  |         $xPath = new DOMXPath($doc); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Remove standard script tags
 | 
					
						
							|  |  |  |         $scriptElems = $xPath->query('//script'); | 
					
						
							|  |  |  |         static::removeNodes($scriptElems); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Remove clickable links to JavaScript URI
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |         $badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']'); | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         static::removeNodes($badLinks); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Remove forms with calls to JavaScript URI
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |         $badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'); | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         static::removeNodes($badForms); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Remove meta tag to prevent external redirects
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |         $metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']'); | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         static::removeNodes($metaTags); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Remove data or JavaScript iFrames
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |         $badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]'); | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         static::removeNodes($badIframes); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-07 00:01:56 +08:00
										 |  |  |         // Remove attributes, within svg children, hiding JavaScript or data uris.
 | 
					
						
							|  |  |  |         // A bunch of svg element and attribute combinations expose xss possibilities.
 | 
					
						
							| 
									
										
										
										
											2022-08-11 17:26:33 +08:00
										 |  |  |         // For example, SVG animate tag can exploit javascript in values.
 | 
					
						
							| 
									
										
										
										
											2022-09-07 00:01:56 +08:00
										 |  |  |         $badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']'); | 
					
						
							|  |  |  |         static::removeAttributes($badValuesAttrs); | 
					
						
							| 
									
										
										
										
											2022-08-11 17:26:33 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  |         // Remove elements with a xlink:href attribute
 | 
					
						
							|  |  |  |         // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
 | 
					
						
							|  |  |  |         $xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]'); | 
					
						
							|  |  |  |         static::removeAttributes($xlinkHrefAttributes); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |         // Remove 'on*' attributes
 | 
					
						
							|  |  |  |         $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]'); | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  |         static::removeAttributes($onAttributes); | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         $html = ''; | 
					
						
							|  |  |  |         $topElems = $doc->documentElement->childNodes->item(0)->childNodes; | 
					
						
							|  |  |  |         foreach ($topElems as $child) { | 
					
						
							|  |  |  |             $html .= $doc->saveHTML($child); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return $html; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |     /** | 
					
						
							|  |  |  |      * Create a xpath contains statement with a translation automatically built within | 
					
						
							|  |  |  |      * to affectively search in a cases-insensitive manner. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected static function xpathContains(string $property, string $value): string | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $value = strtolower($value); | 
					
						
							|  |  |  |         $upperVal = strtoupper($value); | 
					
						
							| 
									
										
										
										
											2021-09-07 05:19:06 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-03 05:02:30 +08:00
										 |  |  |         return 'contains(translate(' . $property . ', \'' . $upperVal . '\', \'' . $value . '\'), \'' . $value . '\')'; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |     /** | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  |      * Remove all the given DOMNodes. | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |      */ | 
					
						
							| 
									
										
										
										
											2021-06-13 19:53:04 +08:00
										 |  |  |     protected static function removeNodes(DOMNodeList $nodes): void | 
					
						
							| 
									
										
										
										
											2021-05-04 06:59:52 +08:00
										 |  |  |     { | 
					
						
							|  |  |  |         foreach ($nodes as $node) { | 
					
						
							|  |  |  |             $node->parentNode->removeChild($node); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Remove all the given attribute nodes. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected static function removeAttributes(DOMNodeList $attrs): void | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         /** @var DOMAttr $attr */ | 
					
						
							|  |  |  |         foreach ($attrs as $attr) { | 
					
						
							|  |  |  |             $attrName = $attr->nodeName; | 
					
						
							| 
									
										
										
										
											2021-11-06 08:32:01 +08:00
										 |  |  |             /** @var DOMElement $parentNode */ | 
					
						
							|  |  |  |             $parentNode = $attr->parentNode; | 
					
						
							|  |  |  |             $parentNode->removeAttribute($attrName); | 
					
						
							| 
									
										
										
										
											2021-09-04 05:34:49 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-06-13 19:53:04 +08:00
										 |  |  | } |