| 
									
										
										
										
											2021-06-26 23:23:15 +08:00
										 |  |  | <?php | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-08-16 18:27:22 +08:00
										 |  |  | namespace BookStack\Search; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  | use BookStack\Actions\Tag; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | use BookStack\Entities\EntityProvider; | 
					
						
							|  |  |  | use BookStack\Entities\Models\Entity; | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  | use BookStack\Entities\Models\Page; | 
					
						
							|  |  |  | use DOMDocument; | 
					
						
							|  |  |  | use DOMNode; | 
					
						
							| 
									
										
										
										
											2021-11-23 07:33:55 +08:00
										 |  |  | use Illuminate\Database\Eloquent\Builder; | 
					
						
							| 
									
										
										
										
											2020-11-29 00:42:12 +08:00
										 |  |  | use Illuminate\Support\Collection; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | class SearchIndex | 
					
						
							|  |  |  | { | 
					
						
							| 
									
										
										
										
											2021-11-13 02:03:44 +08:00
										 |  |  |     /** | 
					
						
							|  |  |  |      * A list of delimiter characters used to break-up parsed content into terms for indexing. | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @var string | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     public static $delimiters = " \n\t.,!?:;()[]{}<>`'\""; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * @var EntityProvider | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected $entityProvider; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |     public function __construct(EntityProvider $entityProvider) | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     { | 
					
						
							|  |  |  |         $this->entityProvider = $entityProvider; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Index the given entity. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     public function indexEntity(Entity $entity) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $this->deleteEntityTerms($entity); | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |         $terms = $this->entityToTermDataArray($entity); | 
					
						
							|  |  |  |         SearchTerm::query()->insert($terms); | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							| 
									
										
										
										
											2021-06-26 23:23:15 +08:00
										 |  |  |      * Index multiple Entities at once. | 
					
						
							|  |  |  |      * | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |      * @param Entity[] $entities | 
					
						
							|  |  |  |      */ | 
					
						
							| 
									
										
										
										
											2021-11-08 19:29:25 +08:00
										 |  |  |     public function indexEntities(array $entities) | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     { | 
					
						
							|  |  |  |         $terms = []; | 
					
						
							|  |  |  |         foreach ($entities as $entity) { | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |             $entityTerms = $this->entityToTermDataArray($entity); | 
					
						
							|  |  |  |             array_push($terms, ...$entityTerms); | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $chunkedTerms = array_chunk($terms, 500); | 
					
						
							|  |  |  |         foreach ($chunkedTerms as $termChunk) { | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |             SearchTerm::query()->insert($termChunk); | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Delete and re-index the terms for all entities in the system. | 
					
						
							| 
									
										
										
										
											2021-11-11 22:10:11 +08:00
										 |  |  |      * Can take a callback which is used for reporting progress. | 
					
						
							|  |  |  |      * Callback receives three arguments: | 
					
						
							|  |  |  |      * - An instance of the model being processed | 
					
						
							|  |  |  |      * - The number that have been processed so far. | 
					
						
							|  |  |  |      * - The total number of that model to be processed. | 
					
						
							|  |  |  |      * | 
					
						
							| 
									
										
										
										
											2021-11-20 22:03:56 +08:00
										 |  |  |      * @param callable(Entity, int, int):void|null $progressCallback | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |      */ | 
					
						
							| 
									
										
										
										
											2021-11-11 22:10:11 +08:00
										 |  |  |     public function indexAllEntities(?callable $progressCallback = null) | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     { | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |         SearchTerm::query()->truncate(); | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         foreach ($this->entityProvider->all() as $entityModel) { | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |             $indexContentField = $entityModel instanceof Page ? 'html' : 'description'; | 
					
						
							|  |  |  |             $selectFields = ['id', 'name', $indexContentField]; | 
					
						
							| 
									
										
										
										
											2021-11-23 07:33:55 +08:00
										 |  |  |             /** @var Builder<Entity> $query */ | 
					
						
							|  |  |  |             $query = $entityModel->newQuery(); | 
					
						
							|  |  |  |             $total = $query->withTrashed()->count(); | 
					
						
							| 
									
										
										
										
											2021-11-11 22:10:11 +08:00
										 |  |  |             $chunkSize = 250; | 
					
						
							|  |  |  |             $processed = 0; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) { | 
					
						
							|  |  |  |                 $this->indexEntities($entities->all()); | 
					
						
							|  |  |  |                 $processed = min($processed + $chunkSize, $total); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 if (is_callable($progressCallback)) { | 
					
						
							|  |  |  |                     $progressCallback($entityModel, $processed, $total); | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |             $entityModel->newQuery() | 
					
						
							|  |  |  |                 ->select($selectFields) | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  |                 ->with(['tags:id,name,value,entity_id,entity_type']) | 
					
						
							| 
									
										
										
										
											2021-11-11 22:10:11 +08:00
										 |  |  |                 ->chunk($chunkSize, $chunkCallback); | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Delete related Entity search terms. | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     public function deleteEntityTerms(Entity $entity) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $entity->searchTerms()->delete(); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |      * Create a scored term array from the given text, where the keys are the terms | 
					
						
							|  |  |  |      * and the values are their scores. | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |      * | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |      * @returns array<string, int> | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |      */ | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |     protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $termMap = $this->textToTermCountMap($text); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         foreach ($termMap as $term => $count) { | 
					
						
							|  |  |  |             $termMap[$term] = $count * $scoreAdjustment; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return $termMap; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * Create a scored term array from the given HTML, where the keys are the terms | 
					
						
							|  |  |  |      * and the values are their scores. | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @returns array<string, int> | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected function generateTermScoreMapFromHtml(string $html): array | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         if (empty($html)) { | 
					
						
							|  |  |  |             return []; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $scoresByTerm = []; | 
					
						
							|  |  |  |         $elementScoreAdjustmentMap = [ | 
					
						
							|  |  |  |             'h1' => 10, | 
					
						
							|  |  |  |             'h2' => 5, | 
					
						
							|  |  |  |             'h3' => 4, | 
					
						
							|  |  |  |             'h4' => 3, | 
					
						
							|  |  |  |             'h5' => 2, | 
					
						
							|  |  |  |             'h6' => 1.5, | 
					
						
							|  |  |  |         ]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $html = '<body>' . $html . '</body>'; | 
					
						
							| 
									
										
										
										
											2022-06-21 06:47:42 +08:00
										 |  |  |         $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |         libxml_use_internal_errors(true); | 
					
						
							|  |  |  |         $doc = new DOMDocument(); | 
					
						
							|  |  |  |         $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $topElems = $doc->documentElement->childNodes->item(0)->childNodes; | 
					
						
							|  |  |  |         /** @var DOMNode $child */ | 
					
						
							|  |  |  |         foreach ($topElems as $child) { | 
					
						
							|  |  |  |             $nodeName = $child->nodeName; | 
					
						
							|  |  |  |             $termCounts = $this->textToTermCountMap(trim($child->textContent)); | 
					
						
							|  |  |  |             foreach ($termCounts as $term => $count) { | 
					
						
							|  |  |  |                 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1); | 
					
						
							|  |  |  |                 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return $scoresByTerm; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  |     /** | 
					
						
							|  |  |  |      * Create a scored term map from the given set of entity tags. | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @param Tag[] $tags | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @returns array<string, int> | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected function generateTermScoreMapFromTags(array $tags): array | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $scoreMap = []; | 
					
						
							|  |  |  |         $names = []; | 
					
						
							|  |  |  |         $values = []; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-13 21:28:17 +08:00
										 |  |  |         foreach ($tags as $tag) { | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  |             $names[] = $tag->name; | 
					
						
							|  |  |  |             $values[] = $tag->value; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3); | 
					
						
							|  |  |  |         $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return $this->mergeTermScoreMaps($nameMap, $valueMap); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |     /** | 
					
						
							|  |  |  |      * For the given text, return an array where the keys are the unique term words | 
					
						
							|  |  |  |      * and the values are the frequency of that term. | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @returns array<string, int> | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected function textToTermCountMap(string $text): array | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     { | 
					
						
							|  |  |  |         $tokenMap = []; // {TextToken => OccurrenceCount}
 | 
					
						
							| 
									
										
										
										
											2021-11-13 02:03:44 +08:00
										 |  |  |         $splitChars = static::$delimiters; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |         $token = strtok($text, $splitChars); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         while ($token !== false) { | 
					
						
							|  |  |  |             if (!isset($tokenMap[$token])) { | 
					
						
							|  |  |  |                 $tokenMap[$token] = 0; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |             $tokenMap[$token]++; | 
					
						
							|  |  |  |             $token = strtok($splitChars); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |         return $tokenMap; | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * For the given entity, Generate an array of term data details. | 
					
						
							|  |  |  |      * Is the raw term data, not instances of SearchTerm models. | 
					
						
							|  |  |  |      * | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |      * @returns array{term: string, score: float, entity_id: int, entity_type: string}[] | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |      */ | 
					
						
							|  |  |  |     protected function entityToTermDataArray(Entity $entity): array | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |         $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor); | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  |         $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all()); | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if ($entity instanceof Page) { | 
					
						
							|  |  |  |             $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html); | 
					
						
							|  |  |  |         } else { | 
					
						
							| 
									
										
										
										
											2021-11-23 07:33:55 +08:00
										 |  |  |             $bodyTermsMap = $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor); | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 |  |  |         $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap); | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         $dataArray = []; | 
					
						
							|  |  |  |         $entityId = $entity->id; | 
					
						
							|  |  |  |         $entityType = $entity->getMorphClass(); | 
					
						
							|  |  |  |         foreach ($mergedScoreMap as $term => $score) { | 
					
						
							|  |  |  |             $dataArray[] = [ | 
					
						
							| 
									
										
										
										
											2021-11-13 21:28:17 +08:00
										 |  |  |                 'term'        => $term, | 
					
						
							|  |  |  |                 'score'       => $score, | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |                 'entity_type' => $entityType, | 
					
						
							| 
									
										
										
										
											2021-11-13 21:28:17 +08:00
										 |  |  |                 'entity_id'   => $entityId, | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |             ]; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return $dataArray; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /** | 
					
						
							|  |  |  |      * For the given term data arrays, Merge their contents by term | 
					
						
							|  |  |  |      * while combining any scores. | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @param array<string, int>[] ...$scoreMaps | 
					
						
							|  |  |  |      * | 
					
						
							|  |  |  |      * @returns array<string, int> | 
					
						
							|  |  |  |      */ | 
					
						
							|  |  |  |     protected function mergeTermScoreMaps(...$scoreMaps): array | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         $mergedMap = []; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         foreach ($scoreMaps as $scoreMap) { | 
					
						
							|  |  |  |             foreach ($scoreMap as $term => $score) { | 
					
						
							|  |  |  |                 $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score; | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 |  |  |         return $mergedMap; | 
					
						
							| 
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 |  |  | } |