<?php
/**
 * ============================================
 * FLOWBOT DCI - DUPLICATE DETECTOR v1.1
 * ============================================
 * Detects duplicate URLs and content to prevent
 * redundant processing and storage.
 *
 * Features:
 * - URL normalization
 * - Content hash comparison
 * - Title similarity detection
 * - Cross-job deduplication
 * - Database-backed tracking
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

use FlowbotDCI\Core\Database;
use PDO;

/**
 * Detects duplicate URLs and duplicate content.
 *
 * URL and content fingerprints are MD5 hashes of a normalized form of the
 * input. Fingerprints are kept in two bounded in-memory caches and, when a
 * database connection has been supplied through setDatabase(), in the
 * crawler_content_hashes / crawler_seen_links tables.
 *
 * Every database-backed method degrades gracefully: without a connection,
 * or when the driver reports an error, it returns false / null / [] and
 * (for driver errors) writes a line to the PHP error log.
 */
class DuplicateDetector
{
    public const VERSION = '1.1'; // Security fix: SQL injection prevention

    /**
     * Maximum number of values bound into a single IN (...) list.
     */
    private const BATCH_CHUNK_SIZE = 500;

    /**
     * Longest input (in bytes) levenshtein() accepts on PHP < 8.0.
     */
    private const LEGACY_LEVENSHTEIN_LIMIT = 255;

    /**
     * MySQL driver error code for a duplicate-key violation.
     */
    private const MYSQL_DUPLICATE_ENTRY = 1062;

    /**
     * Ports that are implied by the scheme and therefore dropped from
     * normalized URLs.
     */
    private const DEFAULT_PORTS = [
        'http' => 80,
        'https' => 443,
    ];

    private ?Database $database = null;
    private ?PDO $pdo = null;

    /**
     * In-memory caches for fast lookups (hash => true, oldest entry first)
     */
    private array $urlCache = [];
    private array $contentCache = [];
    private int $cacheMaxSize = 10000;

    /**
     * SECURITY FIX: Whitelist of allowed table names to prevent SQL injection
     */
    private const ALLOWED_TABLES = [
        'pinfeeds',
        'crawler_content_hashes',
        'crawler_seen_links',
        'crawler_activity_log',
    ];

    /**
     * URL parameters to strip during normalization
     */
    private array $trackingParams = [
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'fbclid', 'gclid', 'msclkid', 'dclid',
        'mc_eid', 'mc_cid',
        'ref', 'referrer', 'source',
        '_ga', '_gl',
        'affiliate', 'partner',
    ];

    /**
     * Set database connection
     *
     * @param Database $database Wrapper whose getConnection() result is used
     *                           for all queries issued by this class.
     */
    public function setDatabase(Database $database): self
    {
        $this->database = $database;
        $this->pdo = $database->getConnection();
        return $this;
    }

    /**
     * Add custom tracking parameters to strip
     *
     * @param array $params Query parameter names (matched case-sensitively).
     */
    public function addTrackingParams(array $params): self
    {
        // De-duplicate so repeated calls do not grow the list indefinitely.
        $this->trackingParams = array_values(
            array_unique(array_merge($this->trackingParams, $params))
        );
        return $this;
    }

    /**
     * Change the maximum number of entries kept in each in-memory cache.
     *
     * Entries beyond the new limit are evicted immediately, oldest first.
     *
     * @param int $size New per-cache limit; must be at least 1.
     *
     * @throws \InvalidArgumentException When $size is smaller than 1.
     */
    public function setCacheMaxSize(int $size): self
    {
        if ($size < 1) {
            throw new \InvalidArgumentException('Cache size must be at least 1');
        }

        $this->cacheMaxSize = $size;
        $this->evictOverflow($this->urlCache);
        $this->evictOverflow($this->contentCache);

        return $this;
    }

    /**
     * Normalize a URL for comparison
     *
     * - Lowercases scheme and host (path and query keep their case)
     * - Removes tracking parameters
     * - Normalizes protocol (http is upgraded to https)
     * - Strips a leading "www." from the host
     * - Keeps an explicit port unless it is the default for the original scheme
     * - Removes trailing slashes from the path
     * - Sorts query parameters
     * - Removes fragments and user-info
     *
     * Input that parse_url() cannot split into at least a host is returned
     * trimmed but otherwise untouched.
     *
     * @param string $url Raw URL.
     *
     * @return string Normalized URL.
     */
    public function normalizeUrl(string $url): string
    {
        $url = trim($url);

        $parts = parse_url($url);
        if (!is_array($parts) || !isset($parts['host'])) {
            return $url;
        }

        // Normalize protocol
        $originalScheme = strtolower($parts['scheme'] ?? 'https');
        $scheme = $originalScheme === 'http' ? 'https' : $originalScheme;

        // Normalize host
        $host = strtolower($parts['host']);
        $host = preg_replace('/^www\./', '', $host) ?? $host; // Remove www

        // A non-default port identifies a different origin, so it must be
        // part of the normalized form; the default port is redundant.
        $port = '';
        if (isset($parts['port'])) {
            $defaultPort = self::DEFAULT_PORTS[$originalScheme] ?? null;
            if ($parts['port'] !== $defaultPort) {
                $port = ':' . $parts['port'];
            }
        }

        // Normalize path
        $path = rtrim($parts['path'] ?? '/', '/'); // Remove trailing slash
        if ($path === '') {
            $path = '/';
        }

        // Parse and filter query parameters
        $query = '';
        if (!empty($parts['query'])) {
            parse_str($parts['query'], $queryParams);

            // Remove tracking parameters
            foreach ($this->trackingParams as $param) {
                unset($queryParams[$param]);
            }

            // Sort and rebuild query string
            if (!empty($queryParams)) {
                ksort($queryParams);
                $query = '?' . http_build_query($queryParams);
            }
        }

        // Reconstruct URL without fragment
        return "{$scheme}://{$host}{$port}{$path}{$query}";
    }

    /**
     * Generate hash for a URL
     *
     * @return string MD5 hex digest of the normalized URL.
     */
    public function hashUrl(string $url): string
    {
        return md5($this->normalizeUrl($url));
    }

    /**
     * Generate hash for content
     *
     * The content is reduced to its text before hashing: tags are stripped,
     * the text is lowercased, whitespace runs are collapsed to one space and
     * surrounding whitespace is removed.
     *
     * @return string MD5 hex digest of the normalized text.
     */
    public function hashContent(string $content): string
    {
        // Strip markup first so whitespace that only separated tags is
        // collapsed and trimmed like any other whitespace.
        $text = strtolower(strip_tags($content));

        $collapsed = preg_replace('/\s+/', ' ', $text);
        if ($collapsed === null) {
            // preg_replace() signals an internal PCRE failure with null.
            $collapsed = $text;
        }

        return md5(trim($collapsed));
    }

    /**
     * Check if URL has been seen before (in memory cache)
     */
    public function isUrlSeen(string $url): bool
    {
        return isset($this->urlCache[$this->hashUrl($url)]);
    }

    /**
     * Mark URL as seen (in memory cache)
     */
    public function markUrlSeen(string $url): void
    {
        $this->remember($this->urlCache, $this->hashUrl($url));
    }

    /**
     * Check if URL exists in database
     * SECURITY FIX: Table name validated against whitelist to prevent SQL injection
     *
     * The URL is compared as given (not normalized) against the table's
     * `link` column.
     *
     * @param string $url   URL to look up.
     * @param string $table One of ALLOWED_TABLES; anything else is rejected.
     *
     * @return bool True when a matching row exists; false when none exists,
     *              no connection is set, the table is not allowed, or the
     *              query fails.
     */
    public function existsInDatabase(string $url, string $table = 'pinfeeds'): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        // SECURITY: Validate table name against whitelist
        if (!in_array($table, self::ALLOWED_TABLES, true)) {
            error_log("DuplicateDetector::existsInDatabase SECURITY: Invalid table name attempted: " . $table);
            return false;
        }

        return $this->rowExists(
            "SELECT 1 FROM {$table} WHERE link = ? LIMIT 1",
            [$url],
            'existsInDatabase'
        );
    }

    /**
     * Check if URL hash exists in content hashes table
     *
     * @return bool True when a row with the URL's hash exists.
     */
    public function existsInContentHashes(string $url): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        return $this->rowExists(
            "SELECT 1 FROM crawler_content_hashes WHERE url_hash = ? LIMIT 1",
            [$this->hashUrl($url)],
            'existsInContentHashes'
        );
    }

    /**
     * Check for duplicate content by hash
     *
     * Looks in the memory cache first and then, if a connection is set, in
     * crawler_content_hashes. A database hit is added to the memory cache.
     */
    public function isDuplicateContent(string $content): bool
    {
        $contentHash = $this->hashContent($content);

        // Check memory cache first
        if (isset($this->contentCache[$contentHash])) {
            return true;
        }

        // Check database
        $found = $this->rowExists(
            "SELECT 1 FROM crawler_content_hashes WHERE content_hash = ? LIMIT 1",
            [$contentHash],
            'isDuplicateContent'
        );
        if ($found) {
            $this->remember($this->contentCache, $contentHash);
        }

        return $found;
    }

    /**
     * Record URL and content hashes in database
     *
     * Inserts a row into crawler_content_hashes; when the insert hits an
     * existing key the row's last_seen and occurrence_count are updated
     * instead. On success both memory caches are updated as well.
     *
     * @param string      $url     URL the content was fetched from.
     * @param string      $content Content to fingerprint.
     * @param string|null $title   Optional title; null or '' stores no title hash.
     * @param string|null $jobId   Optional job identifier stored as first_job_id.
     *
     * @return bool True when the statement ran; false without a connection
     *              or on a database error.
     */
    public function recordContent(
        string $url,
        string $content,
        ?string $title = null,
        ?string $jobId = null
    ): bool {
        if ($this->pdo === null) {
            return false;
        }

        $urlHash = $this->hashUrl($url);
        $contentHash = $this->hashContent($content);
        // Explicit comparison so a literal title "0" is still hashed.
        $titleHash = ($title !== null && $title !== '') ? $this->hashContent($title) : null;

        try {
            $this->runStatement(
                "
                INSERT INTO crawler_content_hashes
                    (url_hash, original_url, content_hash, title_hash, first_job_id)
                VALUES
                    (?, ?, ?, ?, ?)
                ON DUPLICATE KEY UPDATE
                    last_seen = NOW(),
                    occurrence_count = occurrence_count + 1
            ",
                [$urlHash, $url, $contentHash, $titleHash, $jobId]
            );
        } catch (\Exception $e) {
            error_log("DuplicateDetector::recordContent error: " . $e->getMessage());
            return false;
        }

        // Update memory caches
        $this->remember($this->urlCache, $urlHash);
        $this->remember($this->contentCache, $contentHash);

        return true;
    }

    /**
     * Record URL as seen for a specific job/process
     *
     * @return bool True when a new row was inserted; false when the link was
     *              already recorded, no connection is set, or the insert
     *              failed for another reason (which is logged).
     */
    public function recordSeenLink(string $processId, string $url, int $depth = 0): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        try {
            $this->runStatement(
                "
                INSERT INTO crawler_seen_links (process_id, link, depth)
                VALUES (?, ?, ?)
            ",
                [$processId, $url, $depth]
            );
            return true;
        } catch (\Exception $e) {
            // Duplicate key is expected, not an error
            if (!$this->isDuplicateKeyError($e)) {
                error_log("DuplicateDetector::recordSeenLink error: " . $e->getMessage());
            }
            return false;
        }
    }

    /**
     * Check if link was already seen in this process
     */
    public function wasSeenInProcess(string $processId, string $url): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        return $this->rowExists(
            "SELECT 1 FROM crawler_seen_links WHERE process_id = ? AND link = ? LIMIT 1",
            [$processId, $url],
            'wasSeenInProcess'
        );
    }

    /**
     * Batch check multiple URLs against pinfeeds
     *
     * @param array $urls URLs to look up; keys are ignored and repeated
     *                    values are queried once.
     *
     * @return array Values of pinfeeds.link that matched one of $urls;
     *               [] without a connection, for empty input, or on error.
     */
    public function batchCheckPinfeeds(array $urls): array
    {
        if ($this->pdo === null || empty($urls)) {
            return [];
        }

        // Positional placeholders need a 0-based list: caller-supplied keys
        // (string keys, gaps left by array_filter) would break binding.
        $lookup = array_values(array_unique($urls));

        try {
            $found = [];
            // Chunk so that very large inputs stay within the driver's
            // placeholder limit.
            foreach (array_chunk($lookup, self::BATCH_CHUNK_SIZE) as $chunk) {
                $placeholders = implode(',', array_fill(0, count($chunk), '?'));
                $stmt = $this->runStatement(
                    "SELECT link FROM pinfeeds WHERE link IN ({$placeholders})",
                    $chunk
                );
                foreach ($stmt->fetchAll(\PDO::FETCH_COLUMN) as $link) {
                    $found[] = $link;
                }
            }
            return $found;
        } catch (\Exception $e) {
            error_log("DuplicateDetector::batchCheckPinfeeds error: " . $e->getMessage());
            return [];
        }
    }

    /**
     * Clear seen links for a specific process
     *
     * @return bool True when the DELETE ran (even if it matched no rows).
     */
    public function clearProcessLinks(string $processId): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        try {
            $this->runStatement(
                "DELETE FROM crawler_seen_links WHERE process_id = ?",
                [$processId]
            );
            return true;
        } catch (\Exception $e) {
            error_log("DuplicateDetector::clearProcessLinks error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Get duplicate statistics
     *
     * @return array Always contains url_cache_size, content_cache_size and
     *               cache_max_size. With a connection it also contains
     *               total_content_hashes and duplicate_content_count, or
     *               database_error when a query failed.
     */
    public function getStats(): array
    {
        $stats = [
            'url_cache_size' => count($this->urlCache),
            'content_cache_size' => count($this->contentCache),
            'cache_max_size' => $this->cacheMaxSize,
        ];

        if ($this->pdo === null) {
            return $stats;
        }

        try {
            $stats['total_content_hashes'] = (int) $this
                ->runStatement("SELECT COUNT(*) FROM crawler_content_hashes")
                ->fetchColumn();

            $stats['duplicate_content_count'] = (int) $this
                ->runStatement("SELECT COUNT(*) FROM crawler_content_hashes WHERE occurrence_count > 1")
                ->fetchColumn();
        } catch (\Exception $e) {
            $stats['database_error'] = $e->getMessage();
        }

        return $stats;
    }

    /**
     * Clear in-memory caches
     */
    public function clearCache(): void
    {
        $this->urlCache = [];
        $this->contentCache = [];
    }

    /**
     * Get canonical URL for a content hash
     *
     * @param string $contentHash Hash as produced by hashContent().
     *
     * @return string|null The stored canonical URL, or null when none is
     *                     stored, no connection is set, or the query fails.
     */
    public function getCanonicalUrl(string $contentHash): ?string
    {
        if ($this->pdo === null) {
            return null;
        }

        try {
            $stmt = $this->runStatement(
                "SELECT canonical_url FROM crawler_content_hashes
                 WHERE content_hash = ? AND canonical_url IS NOT NULL
                 LIMIT 1",
                [$contentHash]
            );
            $result = $stmt->fetchColumn();
            return $result !== false ? (string) $result : null;
        } catch (\Exception $e) {
            error_log("DuplicateDetector::getCanonicalUrl error: " . $e->getMessage());
            return null;
        }
    }

    /**
     * Set canonical URL for content
     *
     * @param string $url          URL whose record should be updated
     *                             (matched by its normalized hash).
     * @param string $canonicalUrl Canonical URL to store.
     *
     * @return bool True when the driver reports at least one affected row.
     */
    public function setCanonicalUrl(string $url, string $canonicalUrl): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        try {
            $stmt = $this->runStatement(
                "UPDATE crawler_content_hashes SET canonical_url = ? WHERE url_hash = ?",
                [$canonicalUrl, $this->hashUrl($url)]
            );
            return $stmt->rowCount() > 0;
        } catch (\Exception $e) {
            error_log("DuplicateDetector::setCanonicalUrl error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Calculate similarity between two strings (Levenshtein-based)
     *
     * Both strings are trimmed and lowercased, then compared byte-wise.
     *
     * @return float Score between 0.0 (nothing in common) and 1.0 (equal).
     */
    public function calculateSimilarity(string $str1, string $str2): float
    {
        $str1 = strtolower(trim($str1));
        $str2 = strtolower(trim($str2));

        if ($str1 === $str2) {
            return 1.0;
        }

        $len1 = strlen($str1);
        $len2 = strlen($str2);
        $maxLen = max($len1, $len2);

        $withinLegacyLimit = $len1 <= self::LEGACY_LEVENSHTEIN_LIMIT
            && $len2 <= self::LEGACY_LEVENSHTEIN_LIMIT;

        if (PHP_VERSION_ID >= 80000 || $withinLegacyLimit) {
            $score = 1.0 - (levenshtein($str1, $str2) / $maxLen);
        } else {
            // Before PHP 8.0 levenshtein() returns -1 for longer inputs,
            // which would yield a score above 1.0; use similar_text() there.
            similar_text($str1, $str2, $percent);
            $score = $percent / 100.0;
        }

        return max(0.0, min(1.0, $score));
    }

    /**
     * Check if two titles are similar (above threshold)
     *
     * @param float $threshold Minimum calculateSimilarity() score (inclusive).
     */
    public function areTitlesSimilar(string $title1, string $title2, float $threshold = 0.85): bool
    {
        return $this->calculateSimilarity($title1, $title2) >= $threshold;
    }

    /**
     * Add a hash to one of the in-memory caches and enforce the size limit.
     */
    private function remember(array &$cache, string $hash): void
    {
        $cache[$hash] = true;
        $this->evictOverflow($cache);
    }

    /**
     * Drop the oldest entries of a cache until it fits cacheMaxSize.
     */
    private function evictOverflow(array &$cache): void
    {
        while (count($cache) > $this->cacheMaxSize) {
            unset($cache[array_key_first($cache)]);
        }
    }

    /**
     * Run a "SELECT 1 ... LIMIT 1" style query and report whether it
     * produced a row. Failures are logged under the given method name.
     */
    private function rowExists(string $sql, array $params, string $method): bool
    {
        if ($this->pdo === null) {
            return false;
        }

        try {
            return $this->runStatement($sql, $params)->fetchColumn() !== false;
        } catch (\Exception $e) {
            error_log("DuplicateDetector::{$method} error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Prepare and execute a statement with positional parameters.
     *
     * @throws \RuntimeException When there is no connection, or when
     *                           prepare()/execute() return false instead of
     *                           throwing (PDO not in exception mode).
     */
    private function runStatement(string $sql, array $params = []): \PDOStatement
    {
        if ($this->pdo === null) {
            throw new \RuntimeException('No database connection');
        }

        $stmt = $this->pdo->prepare($sql);
        if ($stmt === false) {
            throw $this->driverFailure($this->pdo->errorInfo());
        }
        if ($stmt->execute($params) === false) {
            throw $this->driverFailure($stmt->errorInfo());
        }

        return $stmt;
    }

    /**
     * Build an exception from a PDO errorInfo() array, carrying the driver
     * message and the driver-specific error code.
     */
    private function driverFailure(array $errorInfo): \RuntimeException
    {
        return new \RuntimeException(
            (string) ($errorInfo[2] ?? 'Unknown database error'),
            (int) ($errorInfo[1] ?? 0)
        );
    }

    /**
     * Tell whether an exception raised by runStatement() reports a
     * duplicate-key violation, by driver error code or by message text.
     */
    private function isDuplicateKeyError(\Exception $e): bool
    {
        $driverCode = $e instanceof \PDOException
            ? ($e->errorInfo[1] ?? null)
            : $e->getCode();

        return (int) $driverCode === self::MYSQL_DUPLICATE_ENTRY
            || strpos($e->getMessage(), 'Duplicate') !== false;
    }
}
