<?php
/**
 * ============================================
 * FLOWBOT DCI - ROBOTS HANDLER v1.0
 * ============================================
 * Parses and validates robots.txt files.
 *
 * Features:
 * - robots.txt parsing
 * - User-agent matching
 * - Allow/Disallow rule checking
 * - Crawl-delay extraction
 * - Sitemap detection
 * - Caching support
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

use FlowbotDCI\Core\Database;
use PDO;

/**
 * Fetches, caches and evaluates robots.txt rules for the configured crawler.
 *
 * Rule sets are cached per host in memory for the lifetime of the instance
 * and, when a database connection has been supplied via setDatabase(), in
 * the crawler_domain_stats table for $cacheTtl seconds.
 *
 * Evaluation follows the Robots Exclusion Protocol (RFC 9309):
 * - the group naming our user agent is used when one exists, otherwise the
 *   "*" group;
 * - among matching Allow/Disallow patterns the longest one wins, and a tie
 *   is resolved in favour of Allow.
 */
class RobotsHandler
{
    public const VERSION = '1.0';

    /**
     * Maximum number of bytes read from a robots.txt response.
     * RFC 9309 requires parsers to handle at least 500 KiB; anything beyond
     * that is ignored so a hostile server cannot exhaust memory.
     */
    private const MAX_ROBOTS_BYTES = 512000;

    private ?Database $database = null;
    private ?PDO $pdo = null;

    /**
     * User agent to identify as
     */
    private string $userAgent = 'FlowbotDCI';

    /**
     * Cache TTL in seconds (24 hours default). Applies to the database cache only.
     */
    private int $cacheTtl = 86400;

    /**
     * In-memory cache: host => parsed rule set (see parse() for the shape)
     */
    private array $cache = [];

    /**
     * Set database connection for caching
     */
    public function setDatabase(Database $database): self
    {
        $this->database = $database;
        $this->pdo = $database->getConnection();
        return $this;
    }

    /**
     * Set user agent (sent when fetching and used to select the robots.txt group)
     */
    public function setUserAgent(string $userAgent): self
    {
        $this->userAgent = $userAgent;
        return $this;
    }

    /**
     * Set cache TTL in seconds; negative values are clamped to 0
     */
    public function setCacheTtl(int $seconds): self
    {
        $this->cacheTtl = max(0, $seconds);
        return $this;
    }

    /**
     * Check if URL is allowed by robots.txt
     *
     * @param string $url Absolute URL to test
     * @return bool False only when a Disallow rule wins; URLs that cannot be
     *              parsed (or have no host) are allowed by default
     */
    public function isAllowed(string $url): bool
    {
        $location = $this->resolveRobotsLocation($url);
        if ($location === null) {
            return true; // Invalid URL, allow by default
        }

        $rules = $this->getRules($location['robots_url'], $location['host']);

        return $this->checkRules($rules, $location['target']);
    }

    /**
     * Get crawl delay for domain
     *
     * @return int Delay in whole seconds, 0 when none is declared or the URL is invalid
     */
    public function getCrawlDelay(string $url): int
    {
        $location = $this->resolveRobotsLocation($url);
        if ($location === null) {
            return 0;
        }

        $rules = $this->getRules($location['robots_url'], $location['host']);
        return $rules['crawl_delay'] ?? 0;
    }

    /**
     * Get sitemaps from robots.txt
     *
     * @return string[] Sitemap URLs, empty when none are declared or the URL is invalid
     */
    public function getSitemaps(string $url): array
    {
        $location = $this->resolveRobotsLocation($url);
        if ($location === null) {
            return [];
        }

        $rules = $this->getRules($location['robots_url'], $location['host']);
        return $rules['sitemaps'] ?? [];
    }

    /**
     * Split a URL into the pieces needed to look up and evaluate robots.txt.
     *
     * @return array{host: string, robots_url: string, target: string}|null
     *         host       - cache key (bare host, without port)
     *         robots_url - robots.txt location on the same scheme/host/port
     *         target     - path plus query string, the value rules are matched against
     *         Null when the URL cannot be parsed or has no host.
     */
    private function resolveRobotsLocation(string $url): ?array
    {
        $parts = parse_url($url);
        if (!$parts || !isset($parts['host'])) {
            return null;
        }

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'];

        // robots.txt is scoped to scheme + host + port, so a non-default port
        // must be kept when building its URL.
        $authority = isset($parts['port']) ? "{$host}:{$parts['port']}" : $host;

        // Patterns such as "/*?session=" match against the query string as well.
        $target = $parts['path'] ?? '/';
        if (isset($parts['query']) && $parts['query'] !== '') {
            $target .= '?' . $parts['query'];
        }

        return [
            'host' => $host,
            'robots_url' => "{$scheme}://{$authority}/robots.txt",
            'target' => $target,
        ];
    }

    /**
     * Get rules for a domain: memory cache, then database cache, then network
     */
    private function getRules(string $robotsUrl, string $host): array
    {
        // Check memory cache
        if (isset($this->cache[$host])) {
            return $this->cache[$host];
        }

        // Check database cache
        $rules = $this->loadFromDatabase($host);
        if ($rules !== null) {
            $this->cache[$host] = $rules;
            return $rules;
        }

        // Fetch and parse
        $content = $this->fetchRobotsTxt($robotsUrl);
        $rules = $this->parse($content);

        // Save to cache
        $this->saveToDatabase($host, $content, $rules);
        $this->cache[$host] = $rules;

        return $rules;
    }

    /**
     * Fetch robots.txt content
     *
     * @return string Response body (truncated to MAX_ROBOTS_BYTES), or '' when
     *                the request fails for any reason
     */
    private function fetchRobotsTxt(string $url): string
    {
        $context = stream_context_create([
            'http' => [
                'timeout' => 5,
                'user_agent' => $this->userAgent,
                'follow_location' => true,
                'max_redirects' => 3,
            ],
            // NOTE(review): peer verification is disabled, so a network attacker
            // can substitute the robots.txt body. Kept as-is because enabling it
            // changes which hosts can be fetched - confirm this is intentional.
            'ssl' => [
                'verify_peer' => false,
                'verify_peer_name' => false,
            ],
        ]);

        // Failures are deliberately silenced: an unreachable robots.txt is
        // treated the same as an empty one.
        $content = @file_get_contents($url, false, $context, 0, self::MAX_ROBOTS_BYTES);
        return $content !== false ? $content : '';
    }

    /**
     * Parse robots.txt content
     *
     * Consecutive User-agent lines form one record; the Allow/Disallow/
     * Crawl-delay lines that follow apply to every agent named in it.
     * Text after "#" is a comment. Sitemap lines are global.
     *
     * @param string $content Raw robots.txt body
     * @return array{allow: string[], disallow: string[], crawl_delay: int, sitemaps: string[]}
     *         allow/disallow hold PCRE patterns produced by patternToRegex().
     */
    public function parse(string $content): array
    {
        $rules = [
            'allow' => [],
            'disallow' => [],
            'crawl_delay' => 0,
            'sitemaps' => [],
        ];

        if ($content === '') {
            return $rules;
        }

        // A UTF-8 byte-order mark would otherwise become part of the first directive name.
        if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
            $content = substr($content, 3);
        }

        $emptyGroup = ['seen' => false, 'allow' => [], 'disallow' => [], 'crawl_delay' => null];
        $groups = [
            'specific' => $emptyGroup, // records naming our user agent
            'wildcard' => $emptyGroup, // records for "*"
        ];

        $ourAgent = strtolower(trim($this->userAgent));
        $activeGroups = [];    // group name => true, for the record being read
        $readingAgents = false; // true while inside a run of User-agent lines

        $lines = preg_split('/\r\n|\r|\n/', $content) ?: [];

        foreach ($lines as $line) {
            // Comments may start anywhere on the line.
            $commentPos = strpos($line, '#');
            if ($commentPos !== false) {
                $line = substr($line, 0, $commentPos);
            }

            $line = trim($line);
            if ($line === '' || strpos($line, ':') === false) {
                continue;
            }

            [$directive, $value] = array_map('trim', explode(':', $line, 2));
            $directive = strtolower($directive);

            switch ($directive) {
                case 'user-agent':
                    // The first User-agent line after a rule starts a new record.
                    if (!$readingAgents) {
                        $activeGroups = [];
                        $readingAgents = true;
                    }

                    $token = strtolower($value);
                    if ($token === '*') {
                        $activeGroups['wildcard'] = true;
                        $groups['wildcard']['seen'] = true;
                    } elseif ($this->agentMatches($token, $ourAgent)) {
                        $activeGroups['specific'] = true;
                        $groups['specific']['seen'] = true;
                    }
                    break;

                case 'allow':
                case 'disallow':
                    $readingAgents = false;
                    // An empty value places no restriction, so there is nothing to record.
                    if ($value === '') {
                        break;
                    }
                    $regex = $this->patternToRegex($value);
                    foreach (array_keys($activeGroups) as $groupName) {
                        $groups[$groupName][$directive][] = $regex;
                    }
                    break;

                case 'crawl-delay':
                    $readingAgents = false;
                    $delay = $this->parseCrawlDelay($value);
                    foreach (array_keys($activeGroups) as $groupName) {
                        $groups[$groupName]['crawl_delay'] = $delay;
                    }
                    break;

                case 'sitemap':
                    if ($value !== '' && filter_var($value, FILTER_VALIDATE_URL)) {
                        $rules['sitemaps'][] = $value;
                    }
                    break;
            }
        }

        // A group that names us replaces the wildcard group entirely, even when
        // it is empty (an empty group means "no restrictions for this agent").
        $selected = $groups['specific']['seen'] ? $groups['specific'] : $groups['wildcard'];
        $rules['allow'] = $selected['allow'];
        $rules['disallow'] = $selected['disallow'];

        // Crawl-delay is a politeness hint rather than an access rule, so fall
        // back to the wildcard value when our own group does not declare one.
        $rules['crawl_delay'] = $groups['specific']['crawl_delay']
            ?? $groups['wildcard']['crawl_delay']
            ?? 0;

        return $rules;
    }

    /**
     * Decide whether a User-agent value from robots.txt refers to this crawler.
     *
     * Matching is a case-insensitive substring test in either direction, so
     * both "FlowbotDCI" and a full "FlowbotDCI/1.0 (+url)" identity match.
     * Both arguments must already be lower-cased.
     */
    private function agentMatches(string $token, string $ourAgent): bool
    {
        // An empty needle would match everything; never treat it as a match.
        if ($token === '' || $ourAgent === '') {
            return false;
        }

        return strpos($token, $ourAgent) !== false
            || strpos($ourAgent, $token) !== false;
    }

    /**
     * Convert a Crawl-delay value to whole seconds.
     *
     * Fractional delays are rounded up so the crawler never waits less than
     * requested; negative or non-numeric values yield 0.
     */
    private function parseCrawlDelay(string $value): int
    {
        $seconds = ceil((float) $value);
        if ($seconds <= 0.0) {
            return 0;
        }

        // Casting an out-of-range float to int is undefined, so clamp first.
        return $seconds >= (float) PHP_INT_MAX ? PHP_INT_MAX : (int) $seconds;
    }

    /**
     * Convert robots.txt pattern to regex
     *
     * "*" becomes ".*", a trailing "$" becomes an end anchor, everything else
     * is matched literally from the start of the path. Matching stays
     * case-insensitive.
     */
    private function patternToRegex(string $pattern): string
    {
        // Escape regex special characters except * and $
        $pattern = preg_quote($pattern, '/');

        // Convert * to .*
        $pattern = str_replace('\*', '.*', $pattern);

        // Convert $ at end to end anchor
        if (substr($pattern, -2) === '\$') {
            $pattern = substr($pattern, 0, -2) . '$';
        }

        return '/^' . $pattern . '/i';
    }

    /**
     * Recover the length of the original robots.txt pattern from a regex
     * built by patternToRegex(); used to rank competing rules.
     */
    private function patternLength(string $regex): int
    {
        // Drop the '/^' prefix and '/i' suffix added by patternToRegex().
        $body = substr($regex, 2, -2);

        // Every escaped character ("\x") and every wildcard (".*") stands for
        // exactly one character of the original pattern.
        $collapsed = preg_replace('/\\\\.|\.\*/s', '_', $body);

        return strlen($collapsed ?? $body);
    }

    /**
     * Length of the most specific pattern in $patterns that matches $path,
     * or -1 when none match.
     */
    private function longestMatch(array $patterns, string $path): int
    {
        $best = -1;
        foreach ($patterns as $regex) {
            if (preg_match($regex, $path) === 1) {
                $best = max($best, $this->patternLength($regex));
            }
        }

        return $best;
    }

    /**
     * Check if path is allowed by rules
     *
     * The longest matching pattern decides; when an Allow and a Disallow of
     * equal length both match, Allow wins. No match at all means allowed.
     */
    private function checkRules(array $rules, string $path): bool
    {
        $allowLength = $this->longestMatch($rules['allow'] ?? [], $path);
        $disallowLength = $this->longestMatch($rules['disallow'] ?? [], $path);

        return $allowLength >= $disallowLength;
    }

    /**
     * Load rules from database cache
     *
     * @return array|null Parsed rules, or null on a cache miss, an expired
     *                    entry, a missing connection or a database error
     */
    private function loadFromDatabase(string $host): ?array
    {
        if (!$this->pdo) {
            return null;
        }

        try {
            $stmt = $this->pdo->prepare("
                SELECT robots_txt_cached, crawl_delay
                FROM crawler_domain_stats
                WHERE domain = ?
                  AND robots_txt_expires > NOW()
                LIMIT 1
            ");
            $stmt->execute([$host]);
            // Explicit mode so the lookup does not depend on the connection's default.
            $row = $stmt->fetch(PDO::FETCH_ASSOC);

            // An empty cached body counts as a miss, so hosts whose robots.txt
            // was empty or unreachable are fetched again by the next process.
            if (!$row || empty($row['robots_txt_cached'])) {
                return null;
            }

            $rules = $this->parse($row['robots_txt_cached']);
            // The stored column takes precedence over the freshly parsed value.
            $rules['crawl_delay'] = (int) $row['crawl_delay'];

            return $rules;

        } catch (\Exception $e) {
            // Caching is best-effort: fall through to a network fetch.
            return null;
        }
    }

    /**
     * Save rules to database cache (no-op without a connection)
     */
    private function saveToDatabase(string $host, string $content, array $rules): void
    {
        if (!$this->pdo) {
            return;
        }

        try {
            $expires = date('Y-m-d H:i:s', time() + $this->cacheTtl);

            $stmt = $this->pdo->prepare("
                INSERT INTO crawler_domain_stats (domain, robots_txt_cached, robots_txt_expires, crawl_delay)
                VALUES (?, ?, ?, ?)
                ON DUPLICATE KEY UPDATE
                    robots_txt_cached = VALUES(robots_txt_cached),
                    robots_txt_expires = VALUES(robots_txt_expires),
                    crawl_delay = VALUES(crawl_delay)
            ");
            $stmt->execute([
                $host,
                $content,
                $expires,
                $rules['crawl_delay'],
            ]);
        } catch (\Exception $e) {
            // Ignore cache errors
        }
    }

    /**
     * Clear cache for a domain (memory and, when connected, database)
     */
    public function clearCache(string $host): void
    {
        unset($this->cache[$host]);

        if ($this->pdo) {
            try {
                $stmt = $this->pdo->prepare("
                    UPDATE crawler_domain_stats
                    SET robots_txt_cached = NULL, robots_txt_expires = NULL
                    WHERE domain = ?
                ");
                $stmt->execute([$host]);
            } catch (\Exception $e) {
                // Ignore
            }
        }
    }

    /**
     * Clear the in-memory cache for all domains.
     * Database entries are left untouched; use clearCache() per host for those.
     */
    public function clearAllCache(): void
    {
        $this->cache = [];
    }
}
