2021-06-26 23:23:15 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								<?php
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-08-16 18:27:22 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								namespace BookStack\Search;
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-18 00:56:55 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use BookStack\Activity\Models\Tag;
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								use BookStack\Entities\EntityProvider;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								use BookStack\Entities\Models\Entity;
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use BookStack\Entities\Models\Page;
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use BookStack\Util\HtmlDocument;
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								use DOMNode;
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-23 07:33:55 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use Illuminate\Database\Eloquent\Builder;
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-29 00:42:12 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								use Illuminate\Support\Collection;
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class SearchIndex
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								{
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-13 02:03:44 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * A list of delimiter characters used to break up parsed content into terms for indexing.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							
								
									
										
										
										
											2025-02-15 03:25:59 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
							 | 
						
					
						
							
								
									
										
										
										
											2025-02-15 03:01:51 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    public static string $softDelimiters = ".-";
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-02-24 06:59:26 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Build the search index service around the entity provider, which
								     * indexAllEntities() uses to obtain each entity model to index.
								     */
								    public function __construct(
								        protected EntityProvider $entityProvider,
								    ) {
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Rebuild the search terms for a single entity.
								     * Terms already stored for the entity are deleted first, then a fresh
								     * set is generated from the entity and inserted.
								     */
								    public function indexEntity(Entity $entity): void
								    {
								        $this->deleteEntityTerms($entity);
								        $this->insertTerms($this->entityToTermDataArray($entity));
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Index a batch of entities in one pass.
								     * Term data is generated for each entity and gathered into a single list
								     * before being inserted. Existing terms for the entities are not deleted here.
								     *
								     * @param Entity[] $entities
								     */
								    public function indexEntities(array $entities): void
								    {
								        $collectedTerms = [];

								        foreach ($entities as $currentEntity) {
								            array_push($collectedTerms, ...$this->entityToTermDataArray($currentEntity));
								        }

								        $this->insertTerms($collectedTerms);
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Truncate the search term table, then re-index every entity model
								     * supplied by the entity provider, working through each in chunks.
								     *
								     * An optional callback can be given to report progress. It is called after
								     * each chunk has been indexed and receives three arguments:
								     * - The model instance currently being processed.
								     * - The number of that model processed so far.
								     * - The total number of that model to be processed.
								     *
								     * @param callable(Entity, int, int):void|null $progressCallback
								     */
								    public function indexAllEntities(?callable $progressCallback = null): void
								    {
								        SearchTerm::query()->truncate();

								        $batchSize = 250;

								        foreach ($this->entityProvider->all() as $entityModel) {
								            // Pages select their 'html' column as content; all other models select 'description'.
								            $contentColumn = ($entityModel instanceof Page) ? 'html' : 'description';

								            // NOTE(review): the total is counted via withTrashed(), but the chunked query
								            // below is built without it - confirm the reported totals are as intended.
								            $typeTotal = $entityModel->newQuery()->withTrashed()->count();
								            $doneCount = 0;

								            $entityModel->newQuery()
								                ->select(['id', 'name', $contentColumn])
								                ->with(['tags:id,name,value,entity_id,entity_type'])
								                ->chunk($batchSize, function (Collection $chunk) use ($progressCallback, &$doneCount, $typeTotal, $batchSize, $entityModel) {
								                    $this->indexEntities($chunk->all());

								                    // Progress advances by the full chunk size, capped at the total.
								                    $doneCount = min($doneCount + $batchSize, $typeTotal);

								                    if (is_callable($progressCallback)) {
								                        $progressCallback($entityModel, $doneCount, $typeTotal);
								                    }
								                });
								        }
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Remove the search terms related to the given entity.
								     */
								    public function deleteEntityTerms(Entity $entity): void
								    {
								        $entityTerms = $entity->searchTerms();
								        $entityTerms->delete();
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-12-11 23:53:57 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Write the given term rows to the database.
								     * Rows are inserted in batches of 500 so that each insert remains within database limits.
								     *
								     * @param array[] $terms
								     */
								    protected function insertTerms(array $terms): void
								    {
								        foreach (array_chunk($terms, 500) as $termBatch) {
								            SearchTerm::query()->insert($termBatch);
								        }
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Create a scored term array from the given text, where the keys are the terms
								     * and the values are their scores.
								     * Each score is the term's count within the text multiplied by the given
								     * adjustment, rounded down to a whole number.
								     *
								     * @return array<string, int>
								     */
								    protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
								    {
								        $termCounts = $this->textToTermCountMap($text);

								        // floor() returns a float, so cast to provide the documented integer scores.
								        // array_map() keeps the term keys intact when given a single array.
								        return array_map(
								            fn ($count) => (int) floor($count * $scoreAdjustment),
								            $termCounts
								        );
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * Create a scored term array from the given HTML, where the keys are the terms
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * and the values are their scores.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     *
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     * @return array<string, int>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								     */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    protected function generateTermScoreMapFromHtml(string $html): array
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if (empty($html)) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return [];
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $scoresByTerm = [];
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $elementScoreAdjustmentMap = [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h1' => 10,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h2' => 5,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h3' => 4,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h4' => 3,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h5' => 2,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'h6' => 1.5,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ];
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-21 06:47:42 +08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        $doc = new HtmlDocument($html);
							 | 
						
					
						
							
								
									
										
										
										
											2022-06-21 06:47:42 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        /** @var DOMNode $child */
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 23:46:32 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        foreach ($doc->getBodyChildren() as $child) {
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $nodeName = $child->nodeName;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            $termCounts = $this->textToTermCountMap(trim($child->textContent));
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            foreach ($termCounts as $term => $count) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return $scoresByTerm;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-13 01:06:01 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Build a scored term map from a set of entity tags.
								     * All tag names are joined with spaces and scored with 3 as the second
								     * argument to generateTermScoreMapFromText() (presumably a score multiplier),
								     * all tag values are handled the same way with 5, and the two resulting
								     * maps are merged.
								     *
								     * @param Tag[] $tags
								     *
								     * @return array<string, int|float> Scores keyed by term.
								     */
								    protected function generateTermScoreMapFromTags(array $tags): array
								    {
								        $joinedNames = implode(' ', array_map(fn ($tag) => $tag->name, $tags));
								        $joinedValues = implode(' ', array_map(fn ($tag) => $tag->value, $tags));

								        return $this->mergeTermScoreMaps(
								            $this->generateTermScoreMapFromText($joinedNames, 3),
								            $this->generateTermScoreMapFromText($joinedValues, 5)
								        );
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-12 21:47:23 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * For the given text, return an array where the keys are the unique term words
								     * and the values are the frequency of that term.
								     *
								     * The text is split using the class delimiters. In addition to each individual
								     * token, any run of two or more tokens that are joined only by "soft" delimiters
								     * (those in static::$softDelimiters) is counted as one extended term made up of
								     * the tokens and the delimiters between them.
								     *
								     * @return array<string, int>
								     */
								    protected function textToTermCountMap(string $text): array
								    {
								        $tokenMap = []; // {TextToken => OccurrenceCount}
								        $softDelims = static::$softDelimiters;
								        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
								        $extendedToken = ''; // Running token built across soft delimiters
								        $extendedLen = 0;    // Number of tokens in the running extended token

								        $token = $tokenizer->next();

								        while ($token !== false) {
								            $delim = $tokenizer->previousDelimiter();

								            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
								                // Token was preceded by a soft delimiter: extend the current run.
								                $extendedToken .= $delim . $token;
								                $extendedLen++;
								            } else {
								                // The run has ended. Record it only when it spans more than one
								                // token, since single tokens are counted individually below.
								                if ($extendedLen > 1) {
								                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
								                }

								                $extendedToken = $token;
								                $extendedLen = 1;
								            }

								            // Compare strictly against the empty string rather than relying on
								            // truthiness, otherwise the token "0" would never be counted.
								            if ($token !== '') {
								                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
								            }

								            $token = $tokenizer->next();
								        }

								        // Record any multi-token run still open when the text ends.
								        if ($extendedLen > 1) {
								            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
								        }

								        return $tokenMap;
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2021-11-11 21:36:49 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Produce the raw search term rows for the given entity.
								     * These are plain arrays of term data, not SearchTerm model instances.
								     *
								     * Terms are gathered from the entity name (scored using 40 times the entity's
								     * searchFactor), from its tags, and from its body: the HTML for a Page, or the
								     * "description" attribute (scored using searchFactor) for anything else.
								     * The three term maps are merged before being turned into rows.
								     *
								     * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
								     */
								    protected function entityToTermDataArray(Entity $entity): array
								    {
								        $fromName = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
								        $fromTags = $this->generateTermScoreMapFromTags($entity->tags->all());
								        $fromBody = ($entity instanceof Page)
								            ? $this->generateTermScoreMapFromHtml($entity->html)
								            : $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);

								        $scoresByTerm = $this->mergeTermScoreMaps($fromName, $fromBody, $fromTags);
								        $id = $entity->id;
								        $type = $entity->getMorphClass();

								        // One row per merged term, all sharing the same entity reference.
								        return array_map(
								            fn ($term, $score) => [
								                'term'        => $term,
								                'score'       => $score,
								                'entity_type' => $type,
								                'entity_id'   => $id,
								            ],
								            array_keys($scoresByTerm),
								            $scoresByTerm
								        );
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    /**
								     * Combine any number of term score maps into a single map.
								     * Where the same term appears in more than one map, its scores are summed.
								     *
								     * @param array<string, int|float> ...$scoreMaps
								     *
								     * @return array<string, int|float> Summed scores keyed by term.
								     */
								    protected function mergeTermScoreMaps(...$scoreMaps): array
								    {
								        $combined = [];

								        foreach ($scoreMaps as $map) {
								            foreach ($map as $term => $score) {
								                // Start unseen terms at zero, then add this map's contribution.
								                $combined[$term] ??= 0;
								                $combined[$term] += $score;
								            }
								        }

								        return $combined;
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2020-11-22 08:17:45 +08:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								}
							 |