<?php
/**
 * ============================================
 * FLOWBOT DCI - RELEVANCE SCORER v1.0
 * ============================================
 * Calculates content relevance scores based on
 * keyword matching with weighted zones.
 *
 * Features:
 * - Configurable zone weights (title, h1, h2, etc.)
 * - Phrase and word matching
 * - Forced domains bypass
 * - Search engine URL detection
 * - Noindex/nofollow detection
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

use DOMDocument;
use DOMXPath;

/**
 * Scores how relevant an HTML document is to a set of search terms.
 *
 * The document is split into "zones" (title, headings, meta tags, paragraphs,
 * ...). Every occurrence of a search term inside a zone adds that zone's
 * weight to the total score. A document is considered relevant when its score
 * reaches the configured threshold, unless the URL belongs to a forced domain
 * (always relevant), to a search engine (never relevant) or the page carries a
 * noindex/nofollow robots meta tag (never relevant).
 *
 * Matching is a case-insensitive substring match: the term "car" also matches
 * inside "carpet". Multi-word terms are matched both as a whole phrase and as
 * individual words longer than two characters.
 */
class RelevanceScorer
{
    public const VERSION = '1.0';

    /**
     * Score reported when there is nothing to match against, i.e. when the
     * caller supplied no usable search terms: everything is relevant.
     */
    private const NO_TERMS_SCORE = 999.0;

    /**
     * Default weights for different content zones
     */
    private array $weights = [
        'title' => 10,
        'h1' => 8,
        'h2' => 5,
        'h3' => 3,
        'meta_description' => 6,
        'meta_keywords' => 4,
        'paragraph' => 3,
        'list_item' => 2,
        'anchor_text' => 2,
        'strong' => 2,
        'emphasis' => 1,
    ];

    /**
     * Minimum relevance score to consider content relevant
     */
    private float $threshold = 2.0;

    /**
     * Domains that bypass relevance scoring (lower-case, no leading dot)
     */
    private array $forcedDomains = [];

    /**
     * Search engine domains to always reject
     */
    private array $searchEngineDomains = [
        'google.com', 'bing.com', 'yahoo.com', 'duckduckgo.com',
        'baidu.com', 'yandex.com', 'ask.com', 'aol.com'
    ];

    /**
     * Configure weights for content zones.
     *
     * The given weights are merged over the current ones, so only the zones
     * that should change need to be passed.
     *
     * @param array $weights Map of zone name => numeric weight
     * @return self
     */
    public function setWeights(array $weights): self
    {
        $this->weights = array_merge($this->weights, $weights);
        return $this;
    }

    /**
     * Set minimum relevance threshold.
     *
     * @param float $threshold Minimum score; negative values are clamped to 0
     * @return self
     */
    public function setThreshold(float $threshold): self
    {
        $this->threshold = max(0.0, $threshold);
        return $this;
    }

    /**
     * Set forced domains (bypass relevance scoring).
     *
     * Entries are trimmed and lower-cased; a leading "*." or "." is removed.
     * Empty and non-string entries are dropped, because an empty domain would
     * otherwise match every host.
     *
     * @param array $domains List of domain names such as "example.com"
     * @return self
     */
    public function setForcedDomains(array $domains): self
    {
        $normalized = [];
        foreach ($domains as $domain) {
            if (!is_string($domain)) {
                continue;
            }

            $domain = trim(ltrim(strtolower(trim($domain)), '*'), '.');
            if ($domain !== '') {
                $normalized[] = $domain;
            }
        }

        $this->forcedDomains = array_values(array_unique($normalized));
        return $this;
    }

    /**
     * Get current weights configuration
     */
    public function getWeights(): array
    {
        return $this->weights;
    }

    /**
     * Get current threshold
     */
    public function getThreshold(): float
    {
        return $this->threshold;
    }

    /**
     * Calculate relevance score for a DOM document.
     *
     * The score is the sum over all zones of
     * (number of term occurrences in the zone) * (zone weight), rounded to
     * two decimals. When no usable search term is supplied (empty list, or
     * only blank entries) the fixed score 999.0 is returned.
     *
     * @param DOMDocument $dom The parsed HTML document
     * @param array $searchTerms Array of search terms to match
     * @return float The calculated relevance score
     */
    public function calculateScore(DOMDocument $dom, array $searchTerms): float
    {
        $terms = $this->normalizeTerms($searchTerms);
        if ($terms === []) {
            // No search terms means everything is relevant
            return self::NO_TERMS_SCORE;
        }

        $score = 0.0;
        foreach ($this->extractZones(new DOMXPath($dom)) as $zoneName => $content) {
            $weight = $this->weights[$zoneName] ?? 1;
            $score += $this->scoreContent($content, $terms) * $weight;
        }

        return round($score, 2);
    }

    /**
     * Check if content is relevant based on threshold.
     *
     * Checks run in this order: forced domain (relevant), search engine URL
     * (not relevant), noindex/nofollow robots meta (not relevant), and
     * finally the score compared against the threshold.
     *
     * @param DOMDocument $dom The parsed HTML document
     * @param string $url The URL being checked
     * @param array $searchTerms Array of search terms
     * @return bool True if relevant, false otherwise
     */
    public function isRelevant(DOMDocument $dom, string $url, array $searchTerms): bool
    {
        // Check forced domains first
        if ($this->isForcedDomain($url)) {
            return true;
        }

        // Reject search engine URLs
        if ($this->isSearchEngineUrl($url)) {
            return false;
        }

        // Check for noindex/nofollow
        if ($this->hasNoindexNofollow($dom)) {
            return false;
        }

        // Calculate score and compare to threshold
        return $this->calculateScore($dom, $searchTerms) >= $this->threshold;
    }

    /**
     * Get detailed relevance analysis.
     *
     * Zone scores and the total use the same occurrence-based scoring as
     * calculateScore(), so 'total_score' always equals what calculateScore()
     * returns for the same input.
     *
     * @param DOMDocument $dom The parsed HTML document
     * @param array $searchTerms Array of search terms
     * @return array Keys: 'total_score' (float), 'threshold' (float),
     *               'is_relevant' (bool), 'zones' (zone name => weight,
     *               matches, score, content_preview) and 'matched_terms'
     *               (sequential list of distinct matched terms)
     */
    public function analyze(DOMDocument $dom, array $searchTerms): array
    {
        $terms = $this->normalizeTerms($searchTerms);
        $zones = $this->extractZones(new DOMXPath($dom));

        $analysis = [
            'total_score' => 0.0,
            'threshold' => $this->threshold,
            'is_relevant' => false,
            'zones' => [],
            'matched_terms' => [],
        ];

        foreach ($zones as $zoneName => $content) {
            $weight = $this->weights[$zoneName] ?? 1;
            $matches = $this->findMatches($content, $terms);
            $zoneScore = $this->scoreContent($content, $terms) * $weight;

            $analysis['zones'][$zoneName] = [
                'weight' => $weight,
                'matches' => $matches,
                'score' => $zoneScore,
                'content_preview' => mb_substr($content, 0, 200),
            ];

            $analysis['total_score'] += $zoneScore;
            // array_unique() keeps the original keys; re-index so the result
            // stays a sequential list (and JSON-encodes as an array).
            $analysis['matched_terms'] = array_values(array_unique(array_merge(
                $analysis['matched_terms'],
                $matches
            )));
        }

        $analysis['total_score'] = $terms === []
            ? self::NO_TERMS_SCORE
            : round($analysis['total_score'], 2);
        $analysis['is_relevant'] = $analysis['total_score'] >= $this->threshold;

        return $analysis;
    }

    /**
     * Extract lower-cased text content from the different zones of the document.
     *
     * The key order of the returned array is the order in which zones are
     * reported by analyze().
     *
     * @return array Map of zone name => lower-cased text ('' when absent)
     */
    private function extractZones(DOMXPath $xpath): array
    {
        $firstMeta = function (string $name) use ($xpath): string {
            $values = $this->metaContents($xpath, $name);
            return mb_strtolower(trim($values[0] ?? ''));
        };

        return [
            // Only the first <title> element is considered.
            'title' => $this->collectText($xpath, '//title', 1),
            'meta_description' => $firstMeta('description'),
            'meta_keywords' => $firstMeta('keywords'),
            'h1' => $this->collectText($xpath, '//h1'),
            'h2' => $this->collectText($xpath, '//h2'),
            'h3' => $this->collectText($xpath, '//h3'),
            // Bulk zones are capped to keep memory use bounded on huge pages.
            'paragraph' => $this->collectText($xpath, '//p', 50),
            'list_item' => $this->collectText($xpath, '//li', 30),
            'anchor_text' => $this->collectText($xpath, '//a', 50),
            'strong' => $this->collectText($xpath, '//strong | //b'),
            'emphasis' => $this->collectText($xpath, '//em | //i'),
        ];
    }

    /**
     * Concatenate the text of the nodes matched by an XPath query.
     *
     * @param DOMXPath $xpath XPath evaluator bound to the document
     * @param string $query XPath expression selecting the nodes
     * @param int|null $limit Maximum number of nodes to read, null for all
     * @return string Space-separated, trimmed, lower-cased text
     */
    private function collectText(DOMXPath $xpath, string $query, ?int $limit = null): string
    {
        $nodes = $xpath->query($query);
        if ($nodes === false) {
            return '';
        }

        $parts = [];
        foreach ($nodes as $node) {
            if ($limit !== null && count($parts) >= $limit) {
                break;
            }
            $parts[] = $node->textContent;
        }

        return mb_strtolower(trim(implode(' ', $parts)));
    }

    /**
     * Return the content attribute of every <meta> tag with the given name.
     *
     * The name comparison ignores ASCII case, so <meta name="ROBOTS"> and
     * <meta name="Description"> are found as well.
     *
     * @param DOMXPath $xpath XPath evaluator bound to the document
     * @param string $name Lower-case meta name (internal constant, not user input)
     * @return array List of raw content attribute values, in document order
     */
    private function metaContents(DOMXPath $xpath, string $name): array
    {
        $nodes = $xpath->query(sprintf(
            '//meta[translate(@name, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")="%s"]/@content',
            $name
        ));

        $values = [];
        if ($nodes !== false) {
            foreach ($nodes as $node) {
                $values[] = (string) $node->nodeValue;
            }
        }

        return $values;
    }

    /**
     * Normalize search terms for matching.
     *
     * Each term contributes its full lower-cased phrase plus every individual
     * word longer than two characters. Blank and non-scalar entries are
     * dropped: an empty needle would make mb_substr_count() throw and would
     * "match" every zone in mb_strpos().
     *
     * @param array $terms Raw search terms
     * @return array Sequential list of distinct, non-empty, lower-cased terms
     */
    private function normalizeTerms(array $terms): array
    {
        $normalized = [];
        foreach ($terms as $term) {
            if (!is_string($term) && !is_int($term) && !is_float($term)) {
                continue;
            }

            $phrase = mb_strtolower(trim((string) $term));
            if ($phrase === '') {
                continue;
            }

            // Add the full phrase
            $normalized[] = $phrase;

            // Also add individual words, ignoring very short ones
            $words = preg_split('/\s+/', $phrase, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($words as $word) {
                if (mb_strlen($word) > 2) {
                    $normalized[] = $word;
                }
            }
        }

        return array_values(array_unique($normalized));
    }

    /**
     * Score content based on term matches.
     *
     * @param string $content Lower-cased zone text
     * @param array $terms Normalized terms
     * @return float Total number of term occurrences in the content
     */
    private function scoreContent(string $content, array $terms): float
    {
        // Strict comparison: the text "0" is real content, not "empty".
        if ($content === '' || $terms === []) {
            return 0.0;
        }

        $occurrences = 0;
        foreach ($terms as $term) {
            if ($term === '') {
                continue; // mb_substr_count() rejects an empty needle
            }
            $occurrences += mb_substr_count($content, $term);
        }

        return (float) $occurrences;
    }

    /**
     * Find all matching terms in content.
     *
     * @param string $content Lower-cased zone text
     * @param array $terms Normalized terms
     * @return array List of the terms that occur at least once
     */
    private function findMatches(string $content, array $terms): array
    {
        $matches = [];
        foreach ($terms as $term) {
            if ($term !== '' && mb_strpos($content, $term) !== false) {
                $matches[] = $term;
            }
        }
        return $matches;
    }

    /**
     * Check if URL is from a forced domain (the domain itself or a subdomain).
     */
    private function isForcedDomain(string $url): bool
    {
        return $this->hostMatchesAny($this->extractHost($url), $this->forcedDomains);
    }

    /**
     * Check if URL is from a search engine (the domain itself or a subdomain).
     *
     * Country-specific variants such as "google.com.br" are only rejected
     * when they are listed explicitly.
     */
    private function isSearchEngineUrl(string $url): bool
    {
        return $this->hostMatchesAny($this->extractHost($url), $this->searchEngineDomains);
    }

    /**
     * Extract the lower-cased host of a URL.
     *
     * @return string The host without a trailing dot, or '' when the URL has
     *                no parseable host (e.g. a URL without a scheme)
     */
    private function extractHost(string $url): string
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return '';
        }

        return rtrim(strtolower($host), '.');
    }

    /**
     * Tell whether a host equals one of the domains or is a subdomain of it.
     *
     * Matching is done on label boundaries instead of plain substrings, so
     * "climbing.com" is not mistaken for "bing.com" and
     * "example.com.evil.org" is not mistaken for "example.com".
     *
     * @param string $host Lower-cased host name
     * @param array $domains Lower-cased domain names
     */
    private function hostMatchesAny(string $host, array $domains): bool
    {
        if ($host === '') {
            return false;
        }

        foreach ($domains as $domain) {
            if (!is_string($domain) || $domain === '') {
                continue;
            }
            if ($host === $domain) {
                return true;
            }

            $suffix = '.' . $domain;
            if (strlen($host) > strlen($suffix) && substr($host, -strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check if document has a noindex or nofollow robots meta tag.
     *
     * Every <meta name="robots"> tag is inspected, not only the first one.
     */
    private function hasNoindexNofollow(DOMDocument $dom): bool
    {
        foreach ($this->metaContents(new DOMXPath($dom), 'robots') as $content) {
            $content = strtolower($content);
            if (strpos($content, 'noindex') !== false || strpos($content, 'nofollow') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Static factory method for quick scoring.
     *
     * Parses the HTML and returns calculateScore() for it. An empty or
     * whitespace-only document is scored as an empty document instead of
     * being handed to the parser, which rejects empty input.
     *
     * NOTE: DOMDocument::loadHTML() assumes ISO-8859-1 when the markup does
     * not declare a charset, so undeclared UTF-8 pages may not match
     * non-ASCII terms.
     *
     * @param string $html Raw HTML markup
     * @param array $searchTerms Array of search terms to match
     * @param array $options Optional 'weights' (array), 'threshold' (numeric)
     *                       and 'forcedDomains' (array); the last two do not
     *                       influence the returned score
     * @return float The calculated relevance score
     */
    public static function score(string $html, array $searchTerms, array $options = []): float
    {
        $dom = new DOMDocument();

        if (trim($html) !== '') {
            // Collect parser diagnostics internally instead of silencing with "@".
            $previous = libxml_use_internal_errors(true);
            try {
                $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
            } finally {
                libxml_clear_errors();
                libxml_use_internal_errors($previous);
            }
        }

        $scorer = new self();

        if (isset($options['weights']) && is_array($options['weights'])) {
            $scorer->setWeights($options['weights']);
        }
        if (isset($options['threshold']) && is_numeric($options['threshold'])) {
            $scorer->setThreshold((float) $options['threshold']);
        }
        if (isset($options['forcedDomains']) && is_array($options['forcedDomains'])) {
            $scorer->setForcedDomains($options['forcedDomains']);
        }

        return $scorer->calculateScore($dom, $searchTerms);
    }
}
