<?php
/**
 * FLOWB0T NEXUS - Main Crawler Engine
 * Handles crawling orchestration, multi-phase processing, worker management
 *
 * @package Flowb0t\Engine
 * @version 1.0.0
 */

namespace Flowb0t\Engine;

use Flowb0t\Core\Database;
use Flowb0t\Core\Logger;

class CrawlerEngine {
    private Database $db;
    private Logger $logger;
    private array $config;
    private string $jobUuid;
    private int $jobId;
    private bool $isRunning = false;
    private array $stats = [];

    // Processing phases
    const PHASE_SEED_COLLECTION = 0;
    const PHASE_SEARCH_EXPANSION = 1;
    const PHASE_CRAWL_FAST = 2;
    const PHASE_CRAWL_MEDIUM = 3;
    const PHASE_CRAWL_SLOW = 4;
    const PHASE_IMPORT = 5;

    // Phase configurations
    private array $phases = [
        0 => ['name' => 'Seed Collection',    'concurrent' => 1,   'timeout' => 30],
        1 => ['name' => 'Search Expansion',   'concurrent' => 50,  'timeout' => 15],
        2 => ['name' => 'Fast Crawl',         'concurrent' => 500, 'timeout' => 3],
        3 => ['name' => 'Medium Crawl',       'concurrent' => 200, 'timeout' => 8],
        4 => ['name' => 'Slow Crawl',         'concurrent' => 50,  'timeout' => 15],
        5 => ['name' => 'Import to Database', 'concurrent' => 100, 'timeout' => 5],
    ];

    // User agents for rotation
    private array $userAgents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    ];

    /**
     * Build an engine bound to one crawl job.
     *
     * Obtains the shared Database instance, creates a Logger for the job and
     * loads the job row; loadJobConfig() throws when the UUID is unknown.
     *
     * @param string $jobUuid Value matched against nexus_jobs.job_uuid
     * @throws \Exception When no job row exists for the UUID
     */
    public function __construct(string $jobUuid) {
        $this->jobUuid = $jobUuid;

        $this->db = Database::getInstance();
        $this->logger = new Logger($jobUuid);

        $this->loadJobConfig();
    }

    /**
     * (Re)read the job row and cache it.
     *
     * The whole nexus_jobs row becomes $this->config and its `id` column is
     * stored as the integer $this->jobId.
     *
     * @throws \Exception When no row matches the job UUID
     */
    private function loadJobConfig(): void {
        $row = $this->db->fetchOne(
            "SELECT * FROM nexus_jobs WHERE job_uuid = ?",
            [$this->jobUuid]
        );

        if ($row) {
            $this->config = $row;
            $this->jobId = (int)$row['id'];
            return;
        }

        throw new \Exception("Job not found: {$this->jobUuid}");
    }

    /**
     * Run the job from the first phase.
     *
     * Marks the job as running, records started_at, then executes the phases
     * in order: seed collection, search expansion (only when search terms are
     * configured), the three crawl phases, and import (only when auto_import
     * is truthy).
     *
     * The job is marked completed only if the engine is still running at the
     * end; when pause() or cancel() cleared the running flag, the status they
     * wrote is left untouched (same rule resume() applies).
     *
     * @throws \Throwable Re-throws whatever aborted the run, after logging it
     *                    and marking the job as failed
     */
    public function start(): void {
        $this->isRunning = true;
        $this->logger->info('ENGINE', 'Starting crawler engine', ['job_uuid' => $this->jobUuid]);

        // Update job status
        $this->updateJobStatus('running');
        $this->db->update('nexus_jobs', [
            'started_at' => date('Y-m-d H:i:s')
        ], 'id = ?', [$this->jobId]);

        try {
            // Phase 0: Collect seeds
            $this->runPhase(self::PHASE_SEED_COLLECTION);

            // Phase 1: Expand with search
            $searchTerms = json_decode($this->config['search_terms'] ?? '[]', true);
            if (!empty($searchTerms)) {
                $this->runPhase(self::PHASE_SEARCH_EXPANSION);
            }

            // Phases 2-4: Crawl with cascading timeouts
            $this->runPhase(self::PHASE_CRAWL_FAST);
            $this->runPhase(self::PHASE_CRAWL_MEDIUM);
            $this->runPhase(self::PHASE_CRAWL_SLOW);

            // Phase 5: Import results
            if (!empty($this->config['auto_import'])) {
                $this->runPhase(self::PHASE_IMPORT);
            }

            // Do not overwrite a 'paused' / 'cancelled' status written while
            // the phases were running.
            if ($this->isRunning) {
                $this->updateJobStatus('completed');
                $this->logger->info('ENGINE', 'Crawler completed successfully');
            }

        } catch (\Throwable $e) {
            // \Throwable (not just \Exception) so that engine errors such as
            // TypeError do not leave the job stuck in 'running'.
            $this->logger->critical('ENGINE', 'Fatal error', ['error' => $e->getMessage()]);
            $this->updateJobStatus('failed');
            throw $e;
        }
    }

    /**
     * Stop processing and flag the job as paused.
     *
     * Clears the running flag (checked by the phase loops), writes the
     * 'paused' status and stamps paused_at with the current time.
     */
    public function pause(): void {
        $this->isRunning = false;
        $this->updateJobStatus('paused');

        $pauseFields = ['paused_at' => date('Y-m-d H:i:s')];
        $this->db->update('nexus_jobs', $pauseFields, 'id = ?', [$this->jobId]);

        $this->logger->info('ENGINE', 'Job paused');
    }

    /**
     * Continue a job from the phase stored in nexus_jobs.current_phase.
     *
     * Reloads the job row, then runs every remaining phase up to the import
     * phase. Optional phases obey the same gates start() uses: search
     * expansion needs configured search terms and import needs a truthy
     * auto_import. The job is marked completed only if the engine is still
     * running once the loop ends.
     *
     * @throws \Throwable Re-throws whatever aborted the run, after logging it
     *                    and marking the job as failed
     */
    public function resume(): void {
        $this->isRunning = true;
        $this->updateJobStatus('running');
        $this->logger->info('ENGINE', 'Job resumed');

        try {
            // Reload config and continue
            $this->loadJobConfig();
            $firstPhase = max(
                self::PHASE_SEED_COLLECTION,
                (int)($this->config['current_phase'] ?? self::PHASE_SEED_COLLECTION)
            );

            // Continue from current phase
            for ($phase = $firstPhase; $phase <= self::PHASE_IMPORT; $phase++) {
                if (!$this->isRunning) break;
                if (!$this->isPhaseEnabledForResume($phase)) continue;
                $this->runPhase($phase);
            }

            if ($this->isRunning) {
                $this->updateJobStatus('completed');
            }
        } catch (\Throwable $e) {
            // Mirror start(): never leave a crashed job in 'running'.
            $this->logger->critical('ENGINE', 'Fatal error', ['error' => $e->getMessage()]);
            $this->updateJobStatus('failed');
            throw $e;
        }
    }

    /**
     * Tell whether an optional phase should run for the current job config.
     *
     * @param int $phase One of the PHASE_* constants
     * @return bool False for search expansion without search terms and for
     *              import without auto_import; true for every other phase
     */
    private function isPhaseEnabledForResume(int $phase): bool {
        if ($phase === self::PHASE_SEARCH_EXPANSION) {
            $terms = json_decode($this->config['search_terms'] ?? '[]', true);
            return !empty($terms);
        }

        if ($phase === self::PHASE_IMPORT) {
            return !empty($this->config['auto_import']);
        }

        return true;
    }

    /**
     * Abort the job.
     *
     * Clears the running flag and writes the 'cancelled' status;
     * updateJobStatus() also stamps completed_at for that status.
     */
    public function cancel(): void {
        $this->isRunning = false;

        $this->updateJobStatus('cancelled');

        $this->logger->info('ENGINE', 'Job cancelled');
    }

    /**
     * Wipe all progress of the job and run it again from scratch.
     *
     * Deletes the job's rows from the queue, results and discovered-links
     * tables, resets counters/timestamps/status on the job row, reloads the
     * config and finally calls start().
     */
    public function restart(): void {
        // Per-job working tables that must be emptied
        $workTables = ['nexus_queue', 'nexus_results', 'nexus_discovered_links'];
        foreach ($workTables as $table) {
            $this->db->delete($table, 'job_id = ?', [$this->jobId]);
        }

        // Counters back to zero, timestamps cleared, status back to pending
        $zeroedCounters = array_fill_keys([
            'progress_percent',
            'current_phase',
            'total_seeds',
            'total_queued',
            'total_processed',
            'total_successful',
            'total_failed',
            'total_skipped',
            'total_imported',
            'total_duplicates',
        ], 0);
        $clearedTimestamps = array_fill_keys(['started_at', 'paused_at', 'completed_at'], null);

        $resetRow = $zeroedCounters + $clearedTimestamps + ['status' => 'pending'];
        $this->db->update('nexus_jobs', $resetRow, 'id = ?', [$this->jobId]);

        $this->logger->info('ENGINE', 'Job restarted');

        $this->loadJobConfig();
        $this->start();
    }

    /**
     * Execute one phase, unless the engine has been stopped.
     *
     * Stores the phase number in nexus_jobs.current_phase, logs the phase
     * name and dispatches to the matching handler. Phase numbers without a
     * handler are recorded and logged but otherwise do nothing.
     *
     * @param int $phase One of the PHASE_* constants
     */
    private function runPhase(int $phase): void {
        if (!$this->isRunning) {
            return;
        }

        $this->db->update('nexus_jobs', ['current_phase' => $phase], 'id = ?', [$this->jobId]);

        $label = $this->phases[$phase]['name'];
        $this->logger->info('ENGINE', "Starting phase: {$label}", ['phase' => $phase]);

        $crawlPhases = [self::PHASE_CRAWL_FAST, self::PHASE_CRAWL_MEDIUM, self::PHASE_CRAWL_SLOW];

        if ($phase === self::PHASE_SEED_COLLECTION) {
            $this->collectSeeds();
        } elseif ($phase === self::PHASE_SEARCH_EXPANSION) {
            $this->expandWithSearch();
        } elseif (in_array($phase, $crawlPhases, true)) {
            $this->crawlPhase($phase);
        } elseif ($phase === self::PHASE_IMPORT) {
            $this->importResults();
        }
    }

    /**
     * Phase 0: queue the seed URLs of the job.
     *
     * Seeds come from two config columns: `direct_urls` (queued as given) and
     * `target_domains` (each domain expands to a fixed set of entry points).
     * The number of URLs accepted by addToQueue() is stored in
     * nexus_jobs.total_seeds.
     */
    private function collectSeeds(): void {
        $added = 0;

        // Add direct URLs
        foreach ($this->decodeSeedList('direct_urls') as $url) {
            if ($this->addToQueue($url, 'seed', null, 0)) {
                $added++;
            }
        }

        // Add target domains
        foreach ($this->decodeSeedList('target_domains') as $domain) {
            // Add multiple entry points for each domain
            $entryPoints = [
                "https://{$domain}",
                "https://www.{$domain}",
                "https://{$domain}/sitemap.xml",
                "https://{$domain}/feed",
                "https://{$domain}/rss"
            ];

            foreach ($entryPoints as $url) {
                if ($this->addToQueue($url, 'seed', null, 0)) {
                    $added++;
                }
            }
        }

        $this->db->update('nexus_jobs', [
            'total_seeds' => $added
        ], 'id = ?', [$this->jobId]);

        $this->logger->info('ENGINE', 'Seeds collected', ['count' => $added]);
    }

    /**
     * Decode a JSON list column of the job config into clean strings.
     *
     * Malformed JSON, JSON that is not a list, and non-string entries are
     * dropped instead of reaching foreach/trim(), where they would raise
     * warnings or a TypeError.
     *
     * @param string $key Config column name
     * @return string[] Trimmed, non-empty entries
     */
    private function decodeSeedList(string $key): array {
        $raw = $this->config[$key] ?? '[]';
        $decoded = is_array($raw) ? $raw : json_decode((string)$raw, true);
        if (!is_array($decoded)) {
            return [];
        }

        $clean = [];
        foreach ($decoded as $entry) {
            if (!is_string($entry)) {
                continue;
            }
            $entry = trim($entry);
            if ($entry !== '') {
                $clean[] = $entry;
            }
        }

        return $clean;
    }

    /**
     * Phase 1: grow the queue from search-engine results.
     *
     * For every configured search term and search type, asks the Bing
     * provider for up to `search_pages` result pages (default 10) and queues
     * each returned URL with source type 'search'. Sleeps 0.5s after every
     * page request and refreshes the progress counters after every term.
     *
     * Config values that do not decode to a list are treated as "no terms" /
     * the default type list, and non-string entries are skipped, so a
     * malformed job row cannot abort the phase with a TypeError.
     */
    private function expandWithSearch(): void {
        $searchTerms = json_decode($this->config['search_terms'] ?? '[]', true);
        if (!is_array($searchTerms)) {
            $searchTerms = [];
        }

        $searchTypes = json_decode($this->config['search_types'] ?? '["web"]', true);
        if (!is_array($searchTypes)) {
            $searchTypes = ['web'];
        }

        $maxPages = (int)($this->config['search_pages'] ?? 10);

        require_once __DIR__ . '/SearchProviders/BingProvider.php';
        $searchProvider = new SearchProviders\BingProvider();

        $totalUrls = 0;

        foreach ($searchTerms as $term) {
            if (!$this->isRunning) break;

            if (!is_string($term)) continue;
            $term = trim($term);
            if (empty($term)) continue;

            foreach ($searchTypes as $type) {
                for ($page = 0; $page < $maxPages; $page++) {
                    // Leave both inner loops at once when the job was stopped
                    if (!$this->isRunning) break 2;

                    $results = $searchProvider->search($term, $type, $page);
                    if (!is_array($results)) {
                        $results = [];
                    }

                    foreach ($results as $url) {
                        if (is_string($url) && $this->addToQueue($url, 'search', $term, 0)) {
                            $totalUrls++;
                        }
                    }

                    // Delay between search pages
                    usleep(500000); // 0.5s
                }
            }

            $this->updateProgress();
        }

        $this->logger->info('ENGINE', 'Search expansion completed', [
            'total_urls' => $totalUrls
        ]);
    }

    /**
     * Phases 2-4: drain the pending queue using this phase's timeout.
     *
     * Repeatedly takes a batch of pending queue rows (highest priority
     * first), marks them as processing, fetches them in parallel and hands
     * every response to processResult(). The loop ends when no eligible row
     * is left or the engine is stopped.
     *
     * The retry ceiling grows with the phase (fast: retry_count < 1, medium:
     * < 2, slow: < 3). A URL that processResult() put back to 'pending' after
     * a failure is therefore left for the next, slower phase instead of being
     * retried immediately with the same short timeout.
     *
     * @param int $phase PHASE_CRAWL_FAST, PHASE_CRAWL_MEDIUM or PHASE_CRAWL_SLOW
     */
    private function crawlPhase(int $phase): void {
        $phaseConfig = $this->phases[$phase];
        $batchSize = min($phaseConfig['concurrent'], 100);
        $timeout = $phaseConfig['timeout'];
        $retryCeiling = max(1, min(3, $phase - self::PHASE_CRAWL_FAST + 1));

        $this->logger->info('ENGINE', "Crawl phase started", [
            'phase' => $phase,
            'name' => $phaseConfig['name'],
            'concurrent' => $batchSize,
            'timeout' => $timeout
        ]);

        $processed = 0;
        $cleanupsDone = 0;

        while ($this->isRunning) {
            // Get batch of pending URLs that this phase is allowed to try
            $batch = $this->db->fetchAll(
                "SELECT * FROM nexus_queue
                 WHERE job_id = ? AND status = 'pending' AND retry_count < ?
                 ORDER BY priority DESC, scheduled_for ASC
                 LIMIT ?",
                [$this->jobId, $retryCeiling, $batchSize]
            );

            if (empty($batch)) {
                break; // No more URLs to process
            }

            // Mark as processing
            $ids = array_column($batch, 'id');
            $placeholders = implode(',', array_fill(0, count($ids), '?'));
            $this->db->query(
                "UPDATE nexus_queue SET status = 'processing', started_at = NOW() WHERE id IN ({$placeholders})",
                $ids
            );

            // Process batch with cURL multi
            $results = $this->fetchBatch($batch, $phaseConfig['concurrent'], $timeout);

            // Process results
            foreach ($results as $urlHash => $result) {
                $this->processResult($urlHash, $result, $phase);
                $processed++;
            }

            // A row without any fetch result would otherwise stay in
            // 'processing' forever; close it out as failed.
            foreach ($batch as $item) {
                if (!array_key_exists($item['url_hash'], $results)) {
                    $this->db->update('nexus_queue', [
                        'status' => 'failed',
                        'last_error' => 'No response received',
                        'completed_at' => date('Y-m-d H:i:s')
                    ], 'id = ?', [$item['id']]);
                }
            }

            // Update progress
            $this->updateProgress();

            // Memory management: once per 1000 processed results, regardless
            // of how the batch sizes happen to add up.
            if (intdiv($processed, 1000) > $cleanupsDone) {
                $cleanupsDone = intdiv($processed, 1000);
                gc_collect_cycles();
                $this->logger->debug('ENGINE', 'Memory cleanup', [
                    'processed' => $processed,
                    'memory' => Logger::formatBytes(memory_get_usage(true))
                ]);
            }
        }

        $this->logger->info('ENGINE', "Crawl phase completed", [
            'phase' => $phase,
            'processed' => $processed
        ]);
    }

    /**
     * Fetch a batch of queue rows in parallel with cURL multi.
     *
     * Every row gets its own easy handle (redirects followed, at most 5;
     * random user agent; url_hash stored as CURLOPT_PRIVATE so the response
     * can be matched back to its row). The method returns once every transfer
     * has finished or the multi handle reports an error.
     *
     * Compared with a plain exec/select loop this version also:
     *  - reports rows whose handle could not be created instead of crashing,
     *  - stops when curl_multi_exec() reports an error rather than spinning
     *    on it, and reports the unfinished rows as failed,
     *  - sleeps briefly when curl_multi_select() returns -1 to avoid a busy
     *    loop.
     *
     * @param array $urls       Queue rows; each needs 'url' and 'url_hash'
     * @param int   $concurrent Value passed to CURLMOPT_MAXCONNECTS
     * @param int   $timeout    Total timeout per request in seconds; the
     *                          connect timeout is min(5, $timeout)
     * @return array Map of url_hash => ['html', 'http_code', 'final_url',
     *               'content_type', 'response_time' (ms), 'content_length',
     *               'error', 'error_code']
     */
    private function fetchBatch(array $urls, int $concurrent, int $timeout): array {
        $mh = curl_multi_init();
        $handles = [];
        $results = [];

        // Set reasonable limits
        curl_multi_setopt($mh, CURLMOPT_MAXCONNECTS, $concurrent);

        foreach ($urls as $item) {
            $ch = curl_init($item['url']);
            if ($ch === false) {
                $results[$item['url_hash']] = $this->buildFailedFetch(
                    (string)$item['url'],
                    'Could not initialise cURL handle',
                    CURLE_FAILED_INIT
                );
                continue;
            }

            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS      => 5,
                CURLOPT_ENCODING       => 'gzip,deflate',
                CURLOPT_CONNECTTIMEOUT => min(5, $timeout),
                CURLOPT_TIMEOUT        => $timeout,
                CURLOPT_USERAGENT      => $this->getRandomUserAgent(),
                // NOTE(review): TLS verification is disabled, so responses can
                // be spoofed by a man-in-the-middle. Kept as-is because
                // enabling it changes which sites can be crawled - confirm
                // this is intentional.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_SSL_VERIFYHOST => false,
                CURLOPT_HTTPHEADER     => [
                    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language: en-US,en;q=0.5',
                    'Connection: keep-alive',
                    'Upgrade-Insecure-Requests: 1'
                ],
                CURLOPT_PRIVATE        => $item['url_hash']
            ]);

            curl_multi_add_handle($mh, $ch);
            $handles[$item['url_hash']] = $ch;
        }

        // Execute all requests
        $running = 0;
        $status = CURLM_OK;
        do {
            $status = curl_multi_exec($mh, $running);
            $multiFailed = $status > CURLM_OK;
            if ($multiFailed) {
                $this->logger->warning('ENGINE', 'cURL multi error', ['status' => $status]);
            }

            // Process completed handles
            while ($info = curl_multi_info_read($mh)) {
                $ch = $info['handle'];
                $urlHash = curl_getinfo($ch, CURLINFO_PRIVATE);

                $results[$urlHash] = [
                    'html'          => curl_multi_getcontent($ch),
                    'http_code'     => curl_getinfo($ch, CURLINFO_HTTP_CODE),
                    'final_url'     => curl_getinfo($ch, CURLINFO_EFFECTIVE_URL),
                    'content_type'  => curl_getinfo($ch, CURLINFO_CONTENT_TYPE),
                    'response_time' => curl_getinfo($ch, CURLINFO_TOTAL_TIME) * 1000,
                    'content_length'=> curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD),
                    'error'         => curl_error($ch),
                    // Fall back to the per-transfer result code reported by
                    // the multi interface when the handle has no errno.
                    'error_code'    => curl_errno($ch) ?: (int)$info['result']
                ];

                curl_multi_remove_handle($mh, $ch);
                curl_close($ch);
                unset($handles[$urlHash]);
            }

            if ($multiFailed) {
                break; // The multi handle is unusable; do not loop on the error
            }

            if ($running > 0 && curl_multi_select($mh, 0.1) === -1) {
                usleep(10000); // select() could not wait; avoid a busy loop
            }
        } while ($running > 0);

        // Anything still registered never completed: report it and release it
        foreach ($handles as $urlHash => $ch) {
            $results[$urlHash] = $this->buildFailedFetch(
                (string)curl_getinfo($ch, CURLINFO_EFFECTIVE_URL),
                'cURL multi transfer aborted',
                (int)$status
            );
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        curl_multi_close($mh);
        return $results;
    }

    /**
     * Build a result entry for a transfer that produced no response.
     *
     * http_code is 0, which processResult() treats as a failed request.
     *
     * @param string $url       URL that was requested
     * @param string $message   Text stored as the 'error' field
     * @param int    $errorCode Value stored as the 'error_code' field
     * @return array Entry with the same keys as a regular fetch result
     */
    private function buildFailedFetch(string $url, string $message, int $errorCode): array {
        return [
            'html'          => '',
            'http_code'     => 0,
            'final_url'     => $url,
            'content_type'  => null,
            'response_time' => 0,
            'content_length'=> 0,
            'error'         => $message,
            'error_code'    => $errorCode
        ];
    }

    /**
     * Turn one fetch result into queue/result table updates.
     *
     * Outcomes, in order of evaluation:
     *  - transport failure (cURL error or HTTP code 0): in the fast phase the
     *    row goes back to 'pending' with retry_count + 1 until the third
     *    failed attempt, after which it is marked 'failed'; in the other
     *    crawl phases it is marked 'failed' directly;
     *  - HTTP status outside 200-399: 'skipped';
     *  - relevance below the job's relevance_threshold: 'skipped';
     *  - otherwise the extracted page is stored in nexus_results, the row is
     *    marked 'completed' and, while depth < max_depth, its internal links
     *    are queued via discoverLinks().
     *
     * @param string $urlHash url_hash of the queue row the result belongs to
     * @param array  $result  Entry produced by fetchBatch()
     * @param int    $phase   Crawl phase that produced the result
     */
    private function processResult(string $urlHash, array $result, int $phase): void {
        $queueItem = $this->db->fetchOne(
            "SELECT * FROM nexus_queue WHERE job_id = ? AND url_hash = ?",
            [$this->jobId, $urlHash]
        );

        if (!$queueItem) return;

        // Check if request failed
        if ($result['error_code'] > 0 || $result['http_code'] === 0) {
            $attempts = (int)$queueItem['retry_count'] + 1;

            // crawlPhase() never selects rows whose retry_count has reached 3,
            // so a row re-queued at that point would stay 'pending' forever
            // and the job could never report 100% progress.
            $canRetry = $phase === self::PHASE_CRAWL_FAST && $attempts < 3;

            if ($canRetry) {
                $this->db->update('nexus_queue', [
                    'status' => 'pending',
                    'retry_count' => $attempts,
                    'last_error' => $result['error'],
                    'completed_at' => date('Y-m-d H:i:s')
                ], 'id = ?', [$queueItem['id']]);
            } elseif ($phase === self::PHASE_CRAWL_FAST) {
                // Retries exhausted in the fast phase
                $this->db->update('nexus_queue', [
                    'status' => 'failed',
                    'retry_count' => $attempts,
                    'last_error' => $result['error'],
                    'completed_at' => date('Y-m-d H:i:s')
                ], 'id = ?', [$queueItem['id']]);
            } else {
                $this->db->update('nexus_queue', [
                    'status' => 'failed',
                    'last_error' => $result['error'],
                    'completed_at' => date('Y-m-d H:i:s')
                ], 'id = ?', [$queueItem['id']]);
            }
            return;
        }

        // Check HTTP status
        if ($result['http_code'] < 200 || $result['http_code'] >= 400) {
            $this->db->update('nexus_queue', [
                'status' => 'skipped',
                'last_error' => "HTTP {$result['http_code']}",
                'completed_at' => date('Y-m-d H:i:s')
            ], 'id = ?', [$queueItem['id']]);
            return;
        }

        // Extract content; the casts keep a null body/URL from hitting the
        // string-typed parameters of extractContent().
        $extracted = $this->extractContent((string)$result['html'], (string)$result['final_url']);

        // Calculate relevance score
        $relevanceScore = $this->calculateRelevance($extracted, $queueItem['search_term']);

        // Check relevance threshold
        if ($relevanceScore < $this->config['relevance_threshold']) {
            $this->db->update('nexus_queue', [
                'status' => 'skipped',
                'last_error' => "Low relevance: {$relevanceScore}",
                'completed_at' => date('Y-m-d H:i:s')
            ], 'id = ?', [$queueItem['id']]);
            return;
        }

        // Save result
        $this->db->insertIgnore('nexus_results', [
            'job_id' => $this->jobId,
            'queue_id' => $queueItem['id'],
            'url_hash' => $urlHash,
            'url' => $queueItem['url'],
            'http_status' => $result['http_code'],
            'content_type' => $result['content_type'],
            'content_length' => $result['content_length'],
            'response_time_ms' => (int)$result['response_time'],
            'final_url' => $result['final_url'],
            'title' => $extracted['title'],
            'description' => $extracted['description'],
            'content' => $extracted['content'],
            'content_hash' => hash('sha256', $extracted['content'] ?? ''),
            'thumbnail' => $extracted['thumbnail'],
            'author' => $extracted['author'],
            'publish_date' => $extracted['publish_date'],
            'language' => $extracted['language'],
            'word_count' => $extracted['word_count'],
            'relevance_score' => $relevanceScore,
            'internal_links' => count($extracted['internal_links']),
            'external_links' => count($extracted['external_links']),
            'image_links' => count($extracted['images'])
        ]);

        // Mark queue item as completed
        $this->db->update('nexus_queue', [
            'status' => 'completed',
            'completed_at' => date('Y-m-d H:i:s')
        ], 'id = ?', [$queueItem['id']]);

        // Discover new links if depth allows
        if ($queueItem['depth'] < $this->config['max_depth']) {
            $this->discoverLinks($urlHash, $extracted, $queueItem);
        }
    }

    /**
     * Extract metadata, text and links from an HTML document.
     *
     * Reads title, meta description, og:image, meta author and the first
     * parseable publish date; strips script/style/nav/footer/header/aside/
     * form elements; takes the text of the first article/main/.content/
     * .post-content/body element; then collects links and image sources from
     * what is left of the document.
     *
     * Links are resolved against $url (scheme, host, port and directory of
     * the page). Links with a non-HTTP scheme (mailto:, tel:, javascript:,
     * ...) and pure fragments are ignored. A link is "internal" when its host
     * equals the page host or is a subdomain of it, ignoring a leading
     * "www."; everything else is "external".
     *
     * @param string $html Raw response body
     * @param string $url  Final URL of the page, used as the base for links
     * @return array Keys: title, description, content, thumbnail, author,
     *               publish_date (Y-m-d H:i:s or null), language (always
     *               'en'), word_count, internal_links, external_links, images
     */
    private function extractContent(string $html, string $url): array {
        $result = [
            'title' => '',
            'description' => '',
            'content' => '',
            'thumbnail' => '',
            'author' => '',
            'publish_date' => null,
            'language' => 'en',
            'word_count' => 0,
            'internal_links' => [],
            'external_links' => [],
            'images' => []
        ];

        if (empty($html)) return $result;

        // Suppress HTML parsing errors, then restore the caller's libxml
        // error mode so this method does not change global state.
        $previousErrorMode = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        // The XML declaration prefix makes libxml read the markup as UTF-8
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $xpath = new \DOMXPath($dom);
        $pageHost = parse_url($url, PHP_URL_HOST);
        $baseHost = is_string($pageHost) ? strtolower($pageHost) : '';

        // Extract title
        $titleNodes = $xpath->query('//title');
        if ($titleNodes->length > 0) {
            $result['title'] = trim($titleNodes->item(0)->textContent);
        }

        // Extract meta description
        $descNodes = $xpath->query('//meta[@name="description"]/@content');
        if ($descNodes->length > 0) {
            $result['description'] = trim($descNodes->item(0)->nodeValue);
        }

        // Extract OG image
        $ogImage = $xpath->query('//meta[@property="og:image"]/@content');
        if ($ogImage->length > 0) {
            $result['thumbnail'] = $ogImage->item(0)->nodeValue;
        }

        // Extract author
        $authorNodes = $xpath->query('//meta[@name="author"]/@content');
        if ($authorNodes->length > 0) {
            $result['author'] = trim($authorNodes->item(0)->nodeValue);
        }

        // Extract publish date: first selector that yields a parseable value
        $dateSelectors = [
            '//meta[@property="article:published_time"]/@content',
            '//meta[@name="date"]/@content',
            '//time/@datetime'
        ];
        foreach ($dateSelectors as $selector) {
            $dateNodes = $xpath->query($selector);
            if ($dateNodes->length > 0) {
                $timestamp = strtotime($dateNodes->item(0)->nodeValue);
                if ($timestamp) {
                    $result['publish_date'] = date('Y-m-d H:i:s', $timestamp);
                    break;
                }
            }
        }

        // Extract main content (remove scripts, styles, nav, footer)
        $removeSelectors = ['//script', '//style', '//nav', '//footer', '//header', '//aside', '//form'];
        foreach ($removeSelectors as $selector) {
            foreach ($xpath->query($selector) as $node) {
                if ($node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }

        // Get content from article or main or body
        $contentSelectors = ['//article', '//main', '//*[@class="content"]', '//*[@class="post-content"]', '//body'];
        foreach ($contentSelectors as $selector) {
            $contentNodes = $xpath->query($selector);
            if ($contentNodes->length > 0) {
                $result['content'] = trim($contentNodes->item(0)->textContent);
                break;
            }
        }

        // Clean content
        $result['content'] = preg_replace('/\s+/', ' ', $result['content']) ?? '';
        $result['word_count'] = str_word_count($result['content']);

        // Extract links
        foreach ($xpath->query('//a[@href]') as $link) {
            $absolute = $this->resolveLinkUrl(trim($link->getAttribute('href')), $url);
            if ($absolute === null) {
                continue;
            }

            $linkHost = parse_url($absolute, PHP_URL_HOST);
            if (is_string($linkHost) && $this->isSameSiteHost($linkHost, $baseHost)) {
                $result['internal_links'][] = $absolute;
            } else {
                $result['external_links'][] = $absolute;
            }
        }

        // Extract images
        foreach ($xpath->query('//img[@src]') as $img) {
            $src = $img->getAttribute('src');
            if (!empty($src)) {
                $result['images'][] = $src;
            }
        }

        return $result;
    }

    /**
     * Resolve an href found on a page into an absolute http(s) URL.
     *
     * @param string $href    Raw href attribute value (already trimmed)
     * @param string $pageUrl URL of the page the link was found on
     * @return string|null Absolute URL, or null for empty hrefs, pure
     *                     fragments, non-HTTP schemes, and relative links on
     *                     a page URL without a host
     */
    private function resolveLinkUrl(string $href, string $pageUrl): ?string {
        if ($href === '' || $href[0] === '#') {
            return null;
        }

        // Already absolute: keep http(s), drop mailto:, tel:, javascript:, ...
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $match)) {
            $linkScheme = strtolower($match[1]);
            return ($linkScheme === 'http' || $linkScheme === 'https') ? $href : null;
        }

        $base = parse_url($pageUrl);
        if (!is_array($base) || empty($base['host'])) {
            return null;
        }

        $scheme = strtolower($base['scheme'] ?? 'https');

        // Protocol-relative link: inherit the page's scheme
        if (strpos($href, '//') === 0) {
            return $scheme . ':' . $href;
        }

        $origin = $scheme . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        $pagePath = $base['path'] ?? '/';

        if ($href[0] === '/') {
            return $origin . $href;
        }

        if ($href[0] === '?') {
            return $origin . $pagePath . $href;
        }

        // Relative to the directory of the page: everything up to and
        // including the last slash of the page path.
        $directory = substr($pagePath, 0, (int)strrpos($pagePath, '/') + 1);
        if ($directory === '' || $directory === false) {
            $directory = '/';
        }

        return $origin . $this->collapseDotSegments($directory . $href);
    }

    /**
     * Remove "." and ".." segments from the path part of a path[?query].
     *
     * @param string $pathAndQuery Absolute path, optionally followed by a
     *                             query string and/or fragment
     * @return string The same string with dot segments resolved; ".." never
     *                climbs above the root
     */
    private function collapseDotSegments(string $pathAndQuery): string {
        $cut = strcspn($pathAndQuery, '?#');
        $path = (string)substr($pathAndQuery, 0, $cut);
        $tail = (string)substr($pathAndQuery, $cut);

        $kept = [];
        foreach (explode('/', $path) as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Index 0 is the empty segment in front of the leading slash
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }

        $collapsed = implode('/', $kept);
        if ($collapsed === '') {
            $collapsed = '/';
        }

        return $collapsed . $tail;
    }

    /**
     * Tell whether a link host belongs to the same site as the page host.
     *
     * A leading "www." is ignored on both sides; the link host must then be
     * equal to the page host or end with "." followed by the page host.
     *
     * @param string $host     Host of the link
     * @param string $baseHost Host of the page ('' when unknown)
     * @return bool
     */
    private function isSameSiteHost(string $host, string $baseHost): bool {
        $withoutWww = static function (string $value): string {
            $value = strtolower($value);
            return strpos($value, 'www.') === 0 ? (string)substr($value, 4) : $value;
        };

        $site = $withoutWww($baseHost);
        if ($site === '') {
            return false;
        }

        $candidate = $withoutWww($host);
        if ($candidate === $site) {
            return true;
        }

        $suffix = '.' . $site;
        return substr($candidate, -strlen($suffix)) === $suffix;
    }

    /**
     * Score how well an extracted page matches a search term (0-10).
     *
     * The search term is lower-cased and split on whitespace; empty pieces
     * are discarded, so repeated spaces can no longer produce an empty
     * needle (substr_count() rejects one on PHP 8). For each word:
     *  - +4 when it occurs in the title,
     *  - +3 when it occurs in the description,
     *  - +0.5 per occurrence in the content, capped at 5 per word.
     * The whole phrase occurring in the title adds another 5. The total is
     * capped at 10.
     *
     * @param array       $extracted  Result of extractContent(); the 'title',
     *                                'description' and 'content' keys are read
     * @param string|null $searchTerm Search term the URL was found with
     * @return float 5.0 when there is no usable search term, otherwise the
     *               capped score
     */
    private function calculateRelevance(array $extracted, ?string $searchTerm): float {
        if (empty($searchTerm)) {
            return 5.0; // Default score for non-search URLs
        }

        $terms = preg_split('/\s+/', strtolower($searchTerm), -1, PREG_SPLIT_NO_EMPTY);
        if (empty($terms)) {
            return 5.0; // Whitespace-only term: nothing to match against
        }
        $phrase = implode(' ', $terms);

        $title = strtolower((string)($extracted['title'] ?? ''));
        $desc = strtolower((string)($extracted['description'] ?? ''));
        $content = strtolower((string)($extracted['content'] ?? ''));

        $score = 0.0;
        foreach ($terms as $term) {
            // Title matching (weight: 4)
            if (strpos($title, $term) !== false) {
                $score += 4;
            }

            // Description matching (weight: 3)
            if (strpos($desc, $term) !== false) {
                $score += 3;
            }

            // Content matching (0.5 per occurrence, capped at 5 per word)
            $score += min(substr_count($content, $term) * 0.5, 5);
        }

        // Exact phrase match bonus
        if (strpos($title, $phrase) !== false) {
            $score += 5;
        }

        return (float)min($score, 10); // Cap at 10
    }

    /**
     * Record and queue the internal links found on a crawled page.
     *
     * Considers at most the first 50 internal links. Each one is written to
     * nexus_discovered_links and offered to addToQueue() one level deeper
     * than the source page; links the queue accepted get added_to_queue = 1.
     *
     * @param string $sourceUrlHash url_hash of the page the links came from
     * @param array  $extracted     Result of extractContent()
     * @param array  $queueItem     Queue row of the source page ('depth' and
     *                              'search_term' are read)
     */
    private function discoverLinks(string $sourceUrlHash, array $extracted, array $queueItem): void {
        $perPageCap = 50;
        $childDepth = $queueItem['depth'] + 1;
        $queuedCount = 0;

        // Only add internal links (focused crawling)
        $candidates = array_slice($extracted['internal_links'], 0, $perPageCap);

        foreach ($candidates as $candidateUrl) {
            $candidateHash = hash('sha256', $this->normalizeUrl($candidateUrl));

            // Save to discovered links
            $this->db->insertIgnore('nexus_discovered_links', [
                'job_id' => $this->jobId,
                'source_url_hash' => $sourceUrlHash,
                'target_url_hash' => $candidateHash,
                'target_url' => $candidateUrl,
                'link_type' => 'internal',
                'anchor_text' => null,
                'added_to_queue' => 0
            ]);

            // Add to queue
            $wasQueued = $this->addToQueue($candidateUrl, 'discovered', $queueItem['search_term'], $childDepth);
            if (!$wasQueued) {
                continue;
            }

            $queuedCount++;

            // Mark as added
            $this->db->update(
                'nexus_discovered_links',
                ['added_to_queue' => 1],
                'job_id = ? AND target_url_hash = ?',
                [$this->jobId, $candidateHash]
            );
        }

        if ($queuedCount === 0) {
            return;
        }

        $this->logger->debug('ENGINE', 'Discovered new links', [
            'count' => $queuedCount,
            'depth' => $childDepth
        ]);
    }

    /**
     * Phase 5: copy pending crawl results into the pinfeeds table.
     *
     * Works through nexus_results rows with import_status 'pending' in
     * batches of 100, most relevant first. A result whose URL already exists
     * in pinfeeds.link is marked 'skipped'; otherwise a pinfeeds row is
     * inserted (with a generated author name when the page had none) and the
     * result is marked 'imported'. A result whose import throws is logged
     * and marked 'failed'.
     *
     * The job totals are cumulative: the counts of this run are added to the
     * totals loaded with the job config, so resuming an interrupted import
     * does not discard what earlier runs imported.
     */
    private function importResults(): void {
        $this->logger->info('ENGINE', 'Starting import phase');

        $batchSize = 100;
        $imported = 0;
        $duplicates = 0;

        // Totals already stored for this job (0 for a fresh or restarted job)
        $baseImported = (int)($this->config['total_imported'] ?? 0);
        $baseDuplicates = (int)($this->config['total_duplicates'] ?? 0);

        while ($this->isRunning) {
            // Get batch of pending results
            $results = $this->db->fetchAll(
                "SELECT * FROM nexus_results
                 WHERE job_id = ? AND import_status = 'pending'
                 ORDER BY relevance_score DESC
                 LIMIT ?",
                [$this->jobId, $batchSize]
            );

            if (empty($results)) {
                break;
            }

            foreach ($results as $result) {
                if (!$this->isRunning) break;

                try {
                    // Check for duplicate in pinfeeds
                    $exists = $this->db->exists('pinfeeds', 'link = ?', [$result['url']]);

                    if ($exists) {
                        $duplicates++;
                        $this->db->update('nexus_results', [
                            'import_status' => 'skipped',
                            'imported_at' => date('Y-m-d H:i:s')
                        ], 'id = ?', [$result['id']]);
                        continue;
                    }

                    // Generate random author if needed
                    $author = $result['author'] ?: $this->getRandomAuthor();

                    // Insert to pinfeeds
                    $pinfeedId = $this->db->insert('pinfeeds', [
                        'title' => $result['title'] ?: 'Untitled',
                        'description' => $result['description'] ?: '',
                        'link' => $result['url'],
                        'thumbnail_url' => $result['thumbnail'] ?: '',
                        'source_domain' => parse_url($result['url'], PHP_URL_HOST),
                        'pubDate' => $result['publish_date'] ?: date('Y-m-d H:i:s'),
                        'author' => $author,
                        'guid' => $result['url_hash']
                    ]);

                    // Update result
                    $this->db->update('nexus_results', [
                        'import_status' => 'imported',
                        'pinfeeds_id' => $pinfeedId,
                        'imported_at' => date('Y-m-d H:i:s'),
                        'imported_to' => json_encode(['pinfeeds' => $pinfeedId])
                    ], 'id = ?', [$result['id']]);

                    $imported++;

                } catch (\Exception $e) {
                    $this->logger->error('ENGINE', 'Import failed', [
                        'url' => $result['url'],
                        'error' => $e->getMessage()
                    ]);

                    // Taking the row out of 'pending' also keeps the outer
                    // loop from fetching it again.
                    $this->db->update('nexus_results', [
                        'import_status' => 'failed'
                    ], 'id = ?', [$result['id']]);
                }
            }

            // Update job stats (previous totals + this run)
            $this->db->update('nexus_jobs', [
                'total_imported' => $baseImported + $imported,
                'total_duplicates' => $baseDuplicates + $duplicates
            ], 'id = ?', [$this->jobId]);
        }

        // Keep the cached job row in step with what was written
        $this->config['total_imported'] = $baseImported + $imported;
        $this->config['total_duplicates'] = $baseDuplicates + $duplicates;

        $this->logger->info('ENGINE', 'Import completed', [
            'imported' => $imported,
            'duplicates' => $duplicates
        ]);
    }

    /**
     * Queue a URL for crawling if it is new, valid and allowed.
     *
     * The URL is normalized first; its SHA-256 hash identifies it within the
     * job. Nothing is queued (and false is returned) when the URL fails
     * isValidUrl(), has no host, is rejected by isDomainAllowed(), is already
     * queued for this job, when the job has reached max_pages queue rows, or
     * when the insert throws.
     *
     * @param string      $url        URL to queue
     * @param string      $sourceType Stored as source_type ('seed', 'search',
     *                                'discovered')
     * @param string|null $searchTerm Search term that led to the URL, if any
     * @param int         $depth      Link depth; priority is max(1, 10 - depth)
     * @return bool True only when a new queue row was inserted
     */
    private function addToQueue(string $url, string $sourceType, ?string $searchTerm, int $depth): bool {
        $url = $this->normalizeUrl($url);
        if (!$this->isValidUrl($url)) return false;

        $urlHash = hash('sha256', $url);
        $domain = parse_url($url, PHP_URL_HOST);
        if (!is_string($domain) || $domain === '') return false;

        // Check domain filters
        if (!$this->isDomainAllowed($domain)) return false;

        // insertIgnore() silently drops duplicates, so detect them up front;
        // otherwise callers would count a URL that was never added.
        $alreadyQueued = $this->db->exists(
            'nexus_queue',
            'job_id = ? AND url_hash = ?',
            [$this->jobId, $urlHash]
        );
        if ($alreadyQueued) return false;

        // Check max pages limit
        $currentCount = $this->db->count('nexus_queue', 'job_id = ?', [$this->jobId]);
        if ($currentCount >= $this->config['max_pages']) return false;

        try {
            $this->db->insertIgnore('nexus_queue', [
                'job_id' => $this->jobId,
                'url_hash' => $urlHash,
                'url' => $url,
                'domain' => $domain,
                'source_type' => $sourceType,
                'search_term' => $searchTerm,
                'depth' => $depth,
                'priority' => max(1, 10 - $depth)
            ]);

            return true;
        } catch (\Exception $e) {
            // Best effort: one URL that cannot be stored must not stop the
            // crawl, but leave a trace instead of failing silently.
            $this->logger->warning('ENGINE', 'Could not queue URL', [
                'url' => $url,
                'error' => $e->getMessage()
            ]);
            return false;
        }
    }

    /**
     * Normalize a URL so that equivalent URLs produce the same queue hash.
     *
     * Trims whitespace, drops the fragment, removes every utm_* query
     * parameter wherever it appears in the query string (and empty
     * parameters), drops a query string that ends up empty, and strips
     * trailing slashes.
     *
     * @param string $url Raw URL
     * @return string Normalized URL
     */
    private function normalizeUrl(string $url): string {
        $url = trim($url);

        // Remove fragment
        $fragmentPos = strpos($url, '#');
        if ($fragmentPos !== false) {
            $url = (string)substr($url, 0, $fragmentPos);
        }

        // Remove UTM params anywhere in the query string
        $queryPos = strpos($url, '?');
        if ($queryPos !== false) {
            $base = (string)substr($url, 0, $queryPos);
            $query = (string)substr($url, $queryPos + 1);

            $kept = [];
            foreach (explode('&', $query) as $pair) {
                if ($pair === '' || stripos($pair, 'utm_') === 0) {
                    continue;
                }
                $kept[] = $pair;
            }

            // No parameters left: drop the "?" as well
            $url = empty($kept) ? $base : $base . '?' . implode('&', $kept);
        }

        return rtrim($url, '/'); // Remove trailing slash
    }

    /**
     * Decide whether a URL is worth crawling.
     *
     * Requires a syntactically valid http(s) URL of at most 2048 characters.
     * Admin/login areas and non-content file types are rejected based on the
     * URL *path* only, so host names such as "www.css-tricks.com" or query
     * strings that merely contain ".js" or "tel:" no longer disqualify a URL.
     *
     * @param string $url Normalized URL
     * @return bool True when the URL may be queued
     */
    private function isValidUrl(string $url): bool {
        if (empty($url)) return false;
        if (strlen($url) > 2048) return false;
        if (!filter_var($url, FILTER_VALIDATE_URL)) return false;
        // Only http/https; this also rules out javascript:, mailto: and tel:
        if (!preg_match('/^https?:\/\//i', $url)) return false;

        $path = strtolower((string)parse_url($url, PHP_URL_PATH));

        // Exclude common non-content areas
        $excludedPathParts = ['/wp-admin/', '/wp-login', '/admin/', '/login'];
        foreach ($excludedPathParts as $part) {
            if (strpos($path, $part) !== false) {
                return false;
            }
        }

        // Exclude non-content file types ("docx" is listed because the old
        // ".doc" substring test rejected it as well)
        $excludedExtensions = [
            'css', 'js', 'json', 'xml', 'pdf', 'doc', 'docx',
            'zip', 'exe', 'dmg', 'apk'
        ];
        $extension = pathinfo($path, PATHINFO_EXTENSION);

        return !in_array($extension, $excludedExtensions, true);
    }

    /**
     * Apply the job's domain blocklist and allowlist to a host.
     *
     * A host is rejected when it matches an entry of `blocked_domains`.
     * If `allowed_domains` has entries, the host must match one of them;
     * with an empty allowlist every non-blocked host is accepted.
     *
     * Matching is on domain boundaries: an entry containing a dot matches
     * the host itself and its subdomains ("x.com" matches "x.com" and
     * "api.x.com" but not "netflix.com"). An entry without a dot keeps the
     * previous keyword behaviour and matches any host containing it.
     *
     * @param string $domain Host name of the URL
     * @return bool True when the host may be crawled
     */
    private function isDomainAllowed(string $domain): bool {
        $host = strtolower(trim($domain));

        // Check blocklist
        foreach ($this->decodeDomainList('blocked_domains') as $blockedDomain) {
            if ($this->hostMatchesDomain($host, $blockedDomain)) {
                return false;
            }
        }

        // Check allowlist (if set)
        $allowed = $this->decodeDomainList('allowed_domains');
        if (empty($allowed)) {
            return true;
        }

        foreach ($allowed as $allowedDomain) {
            if ($this->hostMatchesDomain($host, $allowedDomain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decode a JSON domain-list column of the job config.
     *
     * @param string $key Config column name
     * @return string[] Lower-cased entries without surrounding whitespace or
     *                  dots; invalid JSON, non-string and empty entries are
     *                  dropped
     */
    private function decodeDomainList(string $key): array {
        $raw = $this->config[$key] ?? '[]';
        $decoded = is_array($raw) ? $raw : json_decode((string)$raw, true);
        if (!is_array($decoded)) {
            return [];
        }

        $domains = [];
        foreach ($decoded as $entry) {
            if (!is_string($entry)) {
                continue;
            }
            $entry = strtolower(trim($entry, " \t\n\r\0\x0B."));
            if ($entry !== '') {
                $domains[] = $entry;
            }
        }

        return $domains;
    }

    /**
     * Match a host against one list entry.
     *
     * @param string $host  Lower-cased host name
     * @param string $entry Lower-cased, non-empty list entry
     * @return bool
     */
    private function hostMatchesDomain(string $host, string $entry): bool {
        // Keyword entry (no dot): substring match, as before
        if (strpos($entry, '.') === false) {
            return strpos($host, $entry) !== false;
        }

        if ($host === $entry) {
            return true;
        }

        $suffix = '.' . $entry;
        return substr($host, -strlen($suffix)) === $suffix;
    }

    /**
     * Write a new status to the job row.
     *
     * The terminal statuses 'completed', 'failed' and 'cancelled' also stamp
     * completed_at with the current time.
     *
     * @param string $status New value for nexus_jobs.status
     */
    private function updateJobStatus(string $status): void {
        $isTerminal = $status === 'completed'
            || $status === 'failed'
            || $status === 'cancelled';

        $fields = ['status' => $status];
        if ($isTerminal) {
            $fields['completed_at'] = date('Y-m-d H:i:s');
        }

        $this->db->update('nexus_jobs', $fields, 'id = ?', [$this->jobId]);
    }

    /**
     * Recompute the job's progress counters from the queue table.
     *
     * Counts the job's queue rows per status in one query. "Processed" is
     * completed + failed + skipped, and the percentage is processed / total
     * rounded to two decimals, with the total floored at 1 so an empty queue
     * does not divide by zero.
     */
    private function updateProgress(): void {
        $counts = $this->db->fetchOne(
            "SELECT
                COUNT(*) as total,
                SUM(status = 'completed') as completed,
                SUM(status = 'failed') as failed,
                SUM(status = 'skipped') as skipped,
                SUM(status = 'pending') as pending,
                SUM(status = 'processing') as processing
             FROM nexus_queue WHERE job_id = ?",
            [$this->jobId]
        );

        $successful = (int)$counts['completed'];
        $failed = (int)$counts['failed'];
        $skipped = (int)$counts['skipped'];

        $queued = max((int)$counts['total'], 1);
        $done = $successful + $failed + $skipped;

        $this->db->update('nexus_jobs', [
            'progress_percent' => round(($done / $queued) * 100, 2),
            'total_queued' => $queued,
            'total_processed' => $done,
            'total_successful' => $successful,
            'total_failed' => $failed,
            'total_skipped' => $skipped
        ], 'id = ?', [$this->jobId]);
    }

    /**
     * Pick one of the configured user-agent strings at random.
     *
     * @return string An entry of $this->userAgents
     */
    private function getRandomUserAgent(): string {
        $pool = $this->userAgents;
        $pick = array_rand($pool);

        return $pool[$pick];
    }

    /**
     * Make up an author name for results that have none.
     *
     * @return string "<first name> <last name>", each part drawn at random
     *                from a fixed list of ten names
     */
    private function getRandomAuthor(): string {
        $givenNames = ['John', 'Jane', 'Michael', 'Sarah', 'David', 'Emily', 'James', 'Emma', 'Robert', 'Olivia'];
        $familyNames = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez'];

        $first = $givenNames[array_rand($givenNames)];
        $last = $familyNames[array_rand($familyNames)];

        return $first . ' ' . $last;
    }

    /**
     * Report the job's current state, freshly read from the database.
     *
     * @return array Keys: job_uuid, status, progress (float), current_phase
     *               (phase name, or 'Unknown' for an unmapped phase number),
     *               stats (integer counters), started_at, completed_at
     * @throws \Exception When the job row no longer exists
     */
    public function getStatus(): array {
        $this->loadJobConfig();
        $job = $this->config;

        // Output key => job column
        $counterColumns = [
            'queued' => 'total_queued',
            'processed' => 'total_processed',
            'successful' => 'total_successful',
            'failed' => 'total_failed',
            'skipped' => 'total_skipped',
            'imported' => 'total_imported',
            'duplicates' => 'total_duplicates',
        ];

        $counters = [];
        foreach ($counterColumns as $label => $column) {
            $counters[$label] = (int)$job[$column];
        }

        $phaseName = $this->phases[$job['current_phase']]['name'] ?? 'Unknown';

        return [
            'job_uuid' => $this->jobUuid,
            'status' => $job['status'],
            'progress' => (float)$job['progress_percent'],
            'current_phase' => $phaseName,
            'stats' => $counters,
            'started_at' => $job['started_at'],
            'completed_at' => $job['completed_at']
        ];
    }
}
