258 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
			
		
		
	
	
			258 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
<?php
 | 
						|
 | 
						|
namespace BookStack\Search;
 | 
						|
 | 
						|
use BookStack\Activity\Models\Tag;
 | 
						|
use BookStack\Entities\EntityProvider;
 | 
						|
use BookStack\Entities\Models\Entity;
 | 
						|
use BookStack\Entities\Models\Page;
 | 
						|
use BookStack\Util\HtmlDocument;
 | 
						|
use DOMNode;
 | 
						|
use Illuminate\Database\Eloquent\Builder;
 | 
						|
use Illuminate\Support\Collection;
 | 
						|
 | 
						|
class SearchIndex
 | 
						|
{
 | 
						|
    /**
 | 
						|
     * A list of delimiter characters used to break-up parsed content into terms for indexing.
 | 
						|
     */
 | 
						|
    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
 | 
						|
 | 
						|
    public function __construct(
 | 
						|
        protected EntityProvider $entityProvider
 | 
						|
    ) {
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Index the given entity.
 | 
						|
     */
 | 
						|
    public function indexEntity(Entity $entity): void
 | 
						|
    {
 | 
						|
        $this->deleteEntityTerms($entity);
 | 
						|
        $terms = $this->entityToTermDataArray($entity);
 | 
						|
        SearchTerm::query()->insert($terms);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Index multiple Entities at once.
 | 
						|
     *
 | 
						|
     * @param Entity[] $entities
 | 
						|
     */
 | 
						|
    public function indexEntities(array $entities): void
 | 
						|
    {
 | 
						|
        $terms = [];
 | 
						|
        foreach ($entities as $entity) {
 | 
						|
            $entityTerms = $this->entityToTermDataArray($entity);
 | 
						|
            array_push($terms, ...$entityTerms);
 | 
						|
        }
 | 
						|
 | 
						|
        $chunkedTerms = array_chunk($terms, 500);
 | 
						|
        foreach ($chunkedTerms as $termChunk) {
 | 
						|
            SearchTerm::query()->insert($termChunk);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Delete and re-index the terms for all entities in the system.
 | 
						|
     * Can take a callback which is used for reporting progress.
 | 
						|
     * Callback receives three arguments:
 | 
						|
     * - An instance of the model being processed
 | 
						|
     * - The number that have been processed so far.
 | 
						|
     * - The total number of that model to be processed.
 | 
						|
     *
 | 
						|
     * @param callable(Entity, int, int):void|null $progressCallback
 | 
						|
     */
 | 
						|
    public function indexAllEntities(?callable $progressCallback = null): void
 | 
						|
    {
 | 
						|
        SearchTerm::query()->truncate();
 | 
						|
 | 
						|
        foreach ($this->entityProvider->all() as $entityModel) {
 | 
						|
            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
 | 
						|
            $selectFields = ['id', 'name', $indexContentField];
 | 
						|
            /** @var Builder<Entity> $query */
 | 
						|
            $query = $entityModel->newQuery();
 | 
						|
            $total = $query->withTrashed()->count();
 | 
						|
            $chunkSize = 250;
 | 
						|
            $processed = 0;
 | 
						|
 | 
						|
            $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
 | 
						|
                $this->indexEntities($entities->all());
 | 
						|
                $processed = min($processed + $chunkSize, $total);
 | 
						|
 | 
						|
                if (is_callable($progressCallback)) {
 | 
						|
                    $progressCallback($entityModel, $processed, $total);
 | 
						|
                }
 | 
						|
            };
 | 
						|
 | 
						|
            $entityModel->newQuery()
 | 
						|
                ->select($selectFields)
 | 
						|
                ->with(['tags:id,name,value,entity_id,entity_type'])
 | 
						|
                ->chunk($chunkSize, $chunkCallback);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Delete related Entity search terms.
 | 
						|
     */
 | 
						|
    public function deleteEntityTerms(Entity $entity): void
 | 
						|
    {
 | 
						|
        $entity->searchTerms()->delete();
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Create a scored term array from the given text, where the keys are the terms
 | 
						|
     * and the values are their scores.
 | 
						|
     *
 | 
						|
     * @returns array<string, int>
 | 
						|
     */
 | 
						|
    protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
 | 
						|
    {
 | 
						|
        $termMap = $this->textToTermCountMap($text);
 | 
						|
 | 
						|
        foreach ($termMap as $term => $count) {
 | 
						|
            $termMap[$term] = floor($count * $scoreAdjustment);
 | 
						|
        }
 | 
						|
 | 
						|
        return $termMap;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Create a scored term array from the given HTML, where the keys are the terms
 | 
						|
     * and the values are their scores.
 | 
						|
     *
 | 
						|
     * @returns array<string, int>
 | 
						|
     */
 | 
						|
    protected function generateTermScoreMapFromHtml(string $html): array
 | 
						|
    {
 | 
						|
        if (empty($html)) {
 | 
						|
            return [];
 | 
						|
        }
 | 
						|
 | 
						|
        $scoresByTerm = [];
 | 
						|
        $elementScoreAdjustmentMap = [
 | 
						|
            'h1' => 10,
 | 
						|
            'h2' => 5,
 | 
						|
            'h3' => 4,
 | 
						|
            'h4' => 3,
 | 
						|
            'h5' => 2,
 | 
						|
            'h6' => 1.5,
 | 
						|
        ];
 | 
						|
 | 
						|
        $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
 | 
						|
        $doc = new HtmlDocument($html);
 | 
						|
 | 
						|
        /** @var DOMNode $child */
 | 
						|
        foreach ($doc->getBodyChildren() as $child) {
 | 
						|
            $nodeName = $child->nodeName;
 | 
						|
            $termCounts = $this->textToTermCountMap(trim($child->textContent));
 | 
						|
            foreach ($termCounts as $term => $count) {
 | 
						|
                $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
 | 
						|
                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        return $scoresByTerm;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * Create a scored term map from the given set of entity tags.
 | 
						|
     *
 | 
						|
     * @param Tag[] $tags
 | 
						|
     *
 | 
						|
     * @returns array<string, int>
 | 
						|
     */
 | 
						|
    protected function generateTermScoreMapFromTags(array $tags): array
 | 
						|
    {
 | 
						|
        $names = [];
 | 
						|
        $values = [];
 | 
						|
 | 
						|
        foreach ($tags as $tag) {
 | 
						|
            $names[] = $tag->name;
 | 
						|
            $values[] = $tag->value;
 | 
						|
        }
 | 
						|
 | 
						|
        $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
 | 
						|
        $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);
 | 
						|
 | 
						|
        return $this->mergeTermScoreMaps($nameMap, $valueMap);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * For the given text, return an array where the keys are the unique term words
 | 
						|
     * and the values are the frequency of that term.
 | 
						|
     *
 | 
						|
     * @returns array<string, int>
 | 
						|
     */
 | 
						|
    protected function textToTermCountMap(string $text): array
 | 
						|
    {
 | 
						|
        $tokenMap = []; // {TextToken => OccurrenceCount}
 | 
						|
        $splitChars = static::$delimiters;
 | 
						|
        $token = strtok($text, $splitChars);
 | 
						|
 | 
						|
        while ($token !== false) {
 | 
						|
            if (!isset($tokenMap[$token])) {
 | 
						|
                $tokenMap[$token] = 0;
 | 
						|
            }
 | 
						|
            $tokenMap[$token]++;
 | 
						|
            $token = strtok($splitChars);
 | 
						|
        }
 | 
						|
 | 
						|
        return $tokenMap;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * For the given entity, Generate an array of term data details.
 | 
						|
     * Is the raw term data, not instances of SearchTerm models.
 | 
						|
     *
 | 
						|
     * @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
 | 
						|
     */
 | 
						|
    protected function entityToTermDataArray(Entity $entity): array
 | 
						|
    {
 | 
						|
        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
 | 
						|
        $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());
 | 
						|
 | 
						|
        if ($entity instanceof Page) {
 | 
						|
            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
 | 
						|
        } else {
 | 
						|
            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);
 | 
						|
        }
 | 
						|
 | 
						|
        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);
 | 
						|
 | 
						|
        $dataArray = [];
 | 
						|
        $entityId = $entity->id;
 | 
						|
        $entityType = $entity->getMorphClass();
 | 
						|
        foreach ($mergedScoreMap as $term => $score) {
 | 
						|
            $dataArray[] = [
 | 
						|
                'term'        => $term,
 | 
						|
                'score'       => $score,
 | 
						|
                'entity_type' => $entityType,
 | 
						|
                'entity_id'   => $entityId,
 | 
						|
            ];
 | 
						|
        }
 | 
						|
 | 
						|
        return $dataArray;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     * For the given term data arrays, Merge their contents by term
 | 
						|
     * while combining any scores.
 | 
						|
     *
 | 
						|
     * @param array<string, int>[] ...$scoreMaps
 | 
						|
     *
 | 
						|
     * @returns array<string, int>
 | 
						|
     */
 | 
						|
    protected function mergeTermScoreMaps(...$scoreMaps): array
 | 
						|
    {
 | 
						|
        $mergedMap = [];
 | 
						|
 | 
						|
        foreach ($scoreMaps as $scoreMap) {
 | 
						|
            foreach ($scoreMap as $term => $score) {
 | 
						|
                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        return $mergedMap;
 | 
						|
    }
 | 
						|
}
 |