-- ===========================================
-- FLOWBOT DCI v6.0 - CRAWLER TABLES MIGRATION
-- ===========================================
-- Deep Crawler database structure
--
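-- Tables:
-- - crawler_jobs: Crawler job definitions and status
-- - crawler_urls: URLs discovered during crawling
-- - crawler_search_results: Search engine results collected for a job
--
-- Views: crawler_job_stats, crawler_domain_stats
-- Procedures: cleanup_old_crawler_jobs, get_crawler_job_summary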
--
-- Run with: mysql -u user -p database < v6_crawler_tables.sql

-- ============================================
-- Table: crawler_jobs
-- Stores crawler job definitions and status
-- ============================================
-- One row per crawl job. The id is supplied by the writer (there is no
-- auto-increment); status defaults to 'pending' and every progress counter
-- defaults to 0. updated_at is set to the current timestamp whenever the row
-- is modified.
CREATE TABLE IF NOT EXISTS crawler_jobs (
    -- Identity and job definition
    id              VARCHAR(50) NOT NULL COMMENT 'Unique job identifier (UUID)',
    type            ENUM('deep', 'search', 'sitemap') NOT NULL COMMENT 'Job type',
    start_url       TEXT COMMENT 'Starting URL for deep crawl',
    search_term     VARCHAR(500) COMMENT 'Search query for search mode',
    search_engines  JSON COMMENT 'Search engines used',
    config          JSON COMMENT 'Job configuration (maxPages, maxDepth, etc)',

    -- Lifecycle state and progress counters
    status          ENUM('pending', 'running', 'paused', 'completed', 'failed', 'cancelled') DEFAULT 'pending' COMMENT 'Job status',
    visited_count   INT UNSIGNED DEFAULT 0 COMMENT 'Number of URLs visited',
    found_count     INT UNSIGNED DEFAULT 0 COMMENT 'URLs matching criteria found',
    processed_count INT UNSIGNED DEFAULT 0 COMMENT 'URLs processed to pinfeeds',
    error_count     INT UNSIGNED DEFAULT 0 COMMENT 'Number of errors',
    current_depth   INT UNSIGNED DEFAULT 0 COMMENT 'Current crawl depth',
    error_message   TEXT COMMENT 'Last error message if failed',

    -- Timestamps (started_at / completed_at stay NULL until a writer sets them)
    started_at      DATETIME COMMENT 'When job started',
    completed_at    DATETIME COMMENT 'When job completed',
    created_at      DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'When job was created',
    updated_at      DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Last update',

    PRIMARY KEY (id),
    KEY idx_status (status),
    KEY idx_type (type),
    KEY idx_created (created_at)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4
COLLATE = utf8mb4_unicode_ci
COMMENT = 'Crawler job definitions and progress tracking';

-- ============================================
-- Table: crawler_urls
-- Stores URLs discovered during crawling
-- ============================================
-- One row per URL discovered by a job. Rows are removed automatically when the
-- owning crawler_jobs row is deleted (ON DELETE CASCADE on job_id).
-- url_hash is a stored generated column holding SHA2(url, 256); it is never
-- written directly. parent_url_id has no foreign key declared in this table.
CREATE TABLE IF NOT EXISTS crawler_urls (
    id            INT UNSIGNED NOT NULL AUTO_INCREMENT,
    job_id        VARCHAR(50) NOT NULL COMMENT 'Reference to crawler_jobs',

    -- The URL itself and values derived from it
    url           TEXT NOT NULL COMMENT 'Discovered URL',
    url_hash      VARCHAR(64) GENERATED ALWAYS AS (SHA2(url, 256)) STORED COMMENT 'URL hash for fast lookups',
    domain        VARCHAR(255) COMMENT 'URL domain',
    depth         INT UNSIGNED DEFAULT 0 COMMENT 'Discovery depth level',

    -- Processing state and fetch results
    status        ENUM('pending', 'visiting', 'visited', 'found', 'processed', 'error', 'skipped') DEFAULT 'pending' COMMENT 'URL status',
    found_term    BOOLEAN DEFAULT FALSE COMMENT 'Whether search term was found',
    title         VARCHAR(500) COMMENT 'Page title if extracted',
    http_code     SMALLINT UNSIGNED COMMENT 'HTTP response code',
    response_time FLOAT COMMENT 'Response time in seconds',
    content_type  VARCHAR(100) COMMENT 'Response content type',
    links_found   INT UNSIGNED DEFAULT 0 COMMENT 'Number of links found on page',
    error_message TEXT COMMENT 'Error message if failed',

    -- Provenance
    source        VARCHAR(50) COMMENT 'How URL was discovered (crawl, search_engine_name, sitemap)',
    parent_url_id INT UNSIGNED COMMENT 'Parent URL that linked to this one',
    discovered_at DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'When URL was discovered',
    visited_at    DATETIME COMMENT 'When URL was visited',

    PRIMARY KEY (id),
    KEY idx_job_id (job_id),
    KEY idx_status (status),
    KEY idx_depth (depth),
    KEY idx_url_hash (url_hash),
    KEY idx_domain (domain),
    KEY idx_job_status (job_id, status),
    FOREIGN KEY (job_id) REFERENCES crawler_jobs (id) ON DELETE CASCADE
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4
COLLATE = utf8mb4_unicode_ci
COMMENT = 'URLs discovered and processed by crawler';

-- ============================================
-- Table: crawler_search_results
-- Stores search engine results
-- ============================================
-- One row per result returned by a search engine for a job. Rows are removed
-- automatically when the owning crawler_jobs row is deleted (ON DELETE CASCADE).
-- url_hash is a stored generated column holding SHA2(url, 256).
CREATE TABLE IF NOT EXISTS crawler_search_results (
    id            INT UNSIGNED NOT NULL AUTO_INCREMENT,
    job_id        VARCHAR(50) NOT NULL COMMENT 'Reference to crawler_jobs',
    search_engine VARCHAR(50) NOT NULL COMMENT 'Search engine name',

    -- The result as returned by the engine
    url           TEXT NOT NULL COMMENT 'Result URL',
    url_hash      VARCHAR(64) GENERATED ALWAYS AS (SHA2(url, 256)) STORED COMMENT 'URL hash',
    title         VARCHAR(500) COMMENT 'Result title',
    snippet       TEXT COMMENT 'Result snippet/description',
    position      INT UNSIGNED COMMENT 'Position in search results',

    -- Bookkeeping
    processed     BOOLEAN DEFAULT FALSE COMMENT 'Whether URL was processed',
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    KEY idx_job_id (job_id),
    KEY idx_engine (search_engine),
    KEY idx_url_hash (url_hash),
    KEY idx_processed (processed),
    FOREIGN KEY (job_id) REFERENCES crawler_jobs (id) ON DELETE CASCADE
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4
COLLATE = utf8mb4_unicode_ci
COMMENT = 'Search engine results';

-- ============================================
-- Useful Views
-- ============================================

-- View: crawler_job_stats
-- Quick statistics for each crawler job
-- Per-job roll-up: the job's own columns plus aggregates over its crawler_urls rows.
--   * LEFT JOIN keeps jobs that have no URLs; their counts come out as 0.
--   * crawler_urls is the only joined table and u.id is its primary key, so
--     each URL row appears once per job and a plain COUNT is sufficient.
--   * duration_seconds runs from started_at to completed_at, or to NOW() while
--     completed_at is NULL; it is NULL when started_at is NULL.
--   * The non-aggregated j.* columns are determined by the primary key j.id.
--     NOTE(review): a server enforcing ONLY_FULL_GROUP_BY without
--     functional-dependency detection would reject this - confirm target engine.
CREATE OR REPLACE VIEW crawler_job_stats AS
SELECT
    j.id,
    j.type,
    j.status,
    j.start_url,
    j.search_term,
    j.visited_count,
    j.found_count,
    j.processed_count,
    j.error_count,
    j.current_depth,
    COUNT(u.id)                                          AS total_urls,
    COUNT(CASE WHEN u.status = 'visited' THEN u.id END)  AS visited_urls,
    COUNT(CASE WHEN u.status = 'pending' THEN u.id END)  AS pending_urls,
    COUNT(CASE WHEN u.found_term = 1 THEN u.id END)      AS found_urls,
    COUNT(DISTINCT u.domain)                             AS unique_domains,
    TIMESTAMPDIFF(
        SECOND,
        j.started_at,
        COALESCE(j.completed_at, NOW())
    )                                                    AS duration_seconds,
    j.created_at,
    j.started_at,
    j.completed_at
FROM crawler_jobs AS j
LEFT JOIN crawler_urls AS u
    ON u.job_id = j.id
GROUP BY j.id;

-- View: crawler_domain_stats
-- Statistics by domain for a job
-- One row per (job_id, domain) pair found in crawler_urls.
-- URLs whose domain is NULL are excluded before grouping.
-- avg_response_time averages only the rows that have a response_time.
CREATE OR REPLACE VIEW crawler_domain_stats AS
SELECT
    u.job_id,
    u.domain,
    COUNT(*)                                              AS total_urls,
    SUM(CASE WHEN u.status = 'visited' THEN 1 ELSE 0 END) AS visited,
    SUM(CASE WHEN u.status = 'error' THEN 1 ELSE 0 END)   AS errors,
    SUM(CASE WHEN u.found_term = 1 THEN 1 ELSE 0 END)     AS found,
    AVG(u.response_time)                                  AS avg_response_time,
    MAX(u.depth)                                          AS max_depth
FROM crawler_urls AS u
WHERE u.domain IS NOT NULL
GROUP BY
    u.job_id,
    u.domain;

-- ============================================
-- Sample Data (optional; commented out by default - uncomment only for local testing)
-- ============================================

-- INSERT INTO crawler_jobs (id, type, start_url, status, config)
-- VALUES ('test-job-001', 'deep', 'https://example.com', 'pending',
--         '{"maxPages": 100, "maxDepth": 3, "sameDomainOnly": true}');

-- ============================================
-- Stored procedures (optional): job cleanup and job summary
-- ============================================

DELIMITER //

-- Procedure to clean up old completed jobs
-- Deletes finished jobs (status 'completed', 'failed' or 'cancelled') whose
-- created_at is more than days_old days before NOW(). Rows in crawler_urls and
-- crawler_search_results that reference a deleted job are removed by their
-- ON DELETE CASCADE foreign keys.
--
-- Parameters:
--   days_old  minimum age in days; 0 deletes every finished job created before now.
--             NULL makes the cutoff NULL, so nothing is deleted.
-- Errors:
--   SQLSTATE '45000' when days_old is negative.
CREATE PROCEDURE IF NOT EXISTS cleanup_old_crawler_jobs(IN days_old INT)
BEGIN
    -- A negative interval would push the cutoff into the future and wipe every
    -- finished job (and, via cascade, all of its URLs), so refuse it outright.
    IF days_old < 0 THEN
        SIGNAL SQLSTATE '45000'
            SET MESSAGE_TEXT = 'cleanup_old_crawler_jobs: days_old must be >= 0';
    END IF;

    DELETE FROM crawler_jobs
    WHERE status IN ('completed', 'failed', 'cancelled')
      AND created_at < DATE_SUB(NOW(), INTERVAL days_old DAY);
END //

-- Procedure to get job summary
-- Returns the crawler_jobs row for p_job_id followed by four counts taken from
-- that job's crawler_urls rows: total_urls, visited_urls, found_urls and
-- unique_domains. The result set is empty when no job has that id.
--
-- Parameters:
--   p_job_id  id of the job to summarise.
CREATE PROCEDURE IF NOT EXISTS get_crawler_job_summary(IN p_job_id VARCHAR(50))
BEGIN
    SELECT
        j.*,
        s.total_urls,
        s.visited_urls,
        s.found_urls,
        s.unique_domains
    FROM crawler_jobs AS j
    -- An aggregate without GROUP BY always yields exactly one row, so the
    -- cross join neither drops nor duplicates the job row.
    CROSS JOIN (
        SELECT
            COUNT(*)                                       AS total_urls,
            COUNT(CASE WHEN status = 'visited' THEN 1 END) AS visited_urls,
            COUNT(CASE WHEN found_term = 1 THEN 1 END)     AS found_urls,
            COUNT(DISTINCT domain)                         AS unique_domains
        FROM crawler_urls
        WHERE job_id = p_job_id
    ) AS s
    WHERE j.id = p_job_id;
END //

DELIMITER ;

-- ============================================
-- Migration complete message
-- ============================================
-- Emits a one-row, one-column result set so the person running the script sees it finished.
SELECT
    'Flowb0t DCI v6.0 Crawler tables created successfully!' AS message;
