<?php
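/**
 * Test script for Deep Crawler v7.0
 * Tests Yahoo search functionality (the primary engine): builds the search
 * URL, fetches the results page, parses it and filters excluded domains.
 * The Bing fallback is not exercised by this script.
 */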

// Report and display every error level while the test runs, and allow the
// script at most 60 seconds of execution time.
set_time_limit(60);
error_reporting(E_ALL);
ini_set('display_errors', '1');

echo "=== Deep Crawler v7.0 Test ===\n\n";

// Browser User-Agent strings; the first entry is the one sent with the
// search request below.
$userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
];

// Domains whose results are filtered out of the parsed result list
// (social networks first, then search engines).
$excludedDomains = [
    'facebook.com',
    'twitter.com',
    'instagram.com',
    'linkedin.com',
    'bing.com',
    'google.com',
    'yahoo.com',
];

/**
 * Compose the Yahoo search URL for a query.
 *
 * @param string $query  Search terms; sent as the `p` parameter and
 *                       URL-encoded by http_build_query().
 * @param int    $offset Result offset. When greater than zero it is sent
 *                       as the `b` parameter with 1 added; otherwise `b`
 *                       is omitted.
 * @return string The https://search.yahoo.com/search URL with query string.
 */
function buildYahooSearchUrl(string $query, int $offset = 0): string {
    $queryString = $offset > 0
        ? http_build_query(['p' => $query, 'b' => $offset + 1])
        : http_build_query(['p' => $query]);

    return 'https://search.yahoo.com/search?' . $queryString;
}

/**
 * Extract organic results from a Yahoo search results page.
 *
 * Yahoo wraps result links in redirect URLs on r.search.yahoo.com; the
 * real destination is carried URL-encoded in the `RU=` path segment.
 * Each matching anchor is decoded, Yahoo's own pages are dropped, and
 * duplicates (by destination URL) are removed, keeping the first one.
 *
 * @param string $html Raw HTML of the results page.
 * @return array List of ['url' => string, 'title' => string] entries in
 *               document order. Titles are entity-decoded and trimmed.
 */
function parseYahooResults(string $html): array {
    $results = [];
    $seen = [];

    // Anchor pointing at the Yahoo redirector, followed by its first text
    // node (optionally wrapped in a leading <b> or <span>).
    $anchorPattern = '/<a[^>]*href="(https:\/\/r\.search\.yahoo\.com[^"]+)"[^>]*>(?:<b>|<span[^>]*>)?([^<]+)/is';
    if (!preg_match_all($anchorPattern, $html, $matches, PREG_SET_ORDER)) {
        return $results;
    }

    foreach ($matches as $match) {
        // The destination is percent-encoded, so it contains no literal "/"
        // and ends at the next path segment of the redirect URL.
        if (!preg_match('/RU=([^\/]+)/', $match[1], $ru)) {
            continue;
        }

        $realUrl = urldecode($ru[1]);

        // A target without a host is not a usable absolute URL.
        $host = parse_url($realUrl, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            continue;
        }

        // Skip Yahoo's own pages. Match on the host only, so third-party
        // URLs that merely mention "yahoo.com" in their path or query are
        // kept. The second alternative covers regional search hosts.
        if (preg_match('/(?:^|\.)(?:yahoo\.com|search\.yahoo\.[a-z.]+)$/i', $host)) {
            continue;
        }

        // The captured text often carries surrounding whitespace/newlines
        // from the markup; trim so padding alone cannot pass the length check.
        $title = trim(html_entity_decode(strip_tags($match[2]), ENT_QUOTES, 'UTF-8'));
        if (strlen($title) <= 3 || isset($seen[$realUrl])) {
            continue;
        }

        $seen[$realUrl] = true;
        $results[] = [
            'url' => $realUrl,
            'title' => $title,
        ];
    }

    return $results;
}

/**
 * Check whether a URL belongs to one of the excluded domains.
 *
 * A URL is excluded when its host equals an excluded domain or is a
 * subdomain of it (e.g. "www.bing.com" for "bing.com"). Matching is
 * case-insensitive and done on whole domain labels, so "plumbing.com" is
 * NOT treated as "bing.com".
 *
 * @param string $url             URL to test.
 * @param array  $excludedDomains Bare domain names, e.g. 'facebook.com'.
 * @return bool True when the URL is excluded, or when no host can be
 *              parsed from it (such URLs are treated as unusable).
 */
function isExcludedDomain(string $url, array $excludedDomains): bool {
    $host = parse_url($url, PHP_URL_HOST);
    if (!$host) return true;

    // Normalize: hosts are case-insensitive and may carry a trailing root dot.
    $host = strtolower(rtrim($host, '.'));

    foreach ($excludedDomains as $excluded) {
        $excluded = strtolower(trim((string) $excluded, '.'));
        if ($excluded === '') {
            continue; // a blank entry must not match every host
        }

        if ($host === $excluded) {
            return true;
        }

        // Subdomain match: the host must end with ".<excluded>".
        $suffix = '.' . $excluded;
        if (substr($host, -strlen($suffix)) === $suffix) {
            return true;
        }
    }
    return false;
}

// ---------------------------------------------------------------------
// Step 1: fetch the Yahoo results page for a fixed test query.
// ---------------------------------------------------------------------
$query = 'Batman the Movie';
echo "1. Testing Yahoo search for: \"$query\"\n";

$url = buildYahooSearchUrl($query);
echo "   URL: $url\n";

$ch = curl_init();
if ($ch === false) {
    echo "   Error: could not initialise cURL\n";
    exit(1);
}

curl_setopt_array($ch, [
    CURLOPT_URL => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT => 15,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT => $userAgents[0],
    // Keep TLS verification on: with it disabled, any man-in-the-middle
    // could serve the "search results" this test then parses and prints.
    CURLOPT_SSL_VERIFYPEER => true,
    CURLOPT_SSL_VERIFYHOST => 2,
    CURLOPT_HTTPHEADER => [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: en-US,en;q=0.5',
        'Connection: keep-alive',
    ],
    // Sends the Accept-Encoding header and transparently decompresses the
    // body, so no manual Accept-Encoding header is needed above.
    CURLOPT_ENCODING => 'gzip, deflate',
]);

$html = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$error = curl_error($ch);
curl_close($ch);

echo "   HTTP Code: $httpCode\n";

if ($error) {
    echo "   Error: $error\n";
    exit(1);
}

if ($httpCode !== 200 || empty($html)) {
    echo "   Failed to fetch search page\n";
    exit(1);
}

echo "   HTML Length: " . strlen($html) . " bytes\n";

// The raw response is dumped here (at most once) when something looks off.
$debugFile = 'debug_yahoo_response.html';
$debugSaved = false;

// Sanity check: a genuine results page references yahoo.com somewhere.
if (strpos($html, 'yahoo.com') === false) {
    echo "   Warning: Response doesn't look like Yahoo search page\n";
    $debugSaved = file_put_contents($debugFile, $html) !== false;
    echo $debugSaved
        ? "   Saved response to $debugFile\n"
        : "   Could not write $debugFile\n";
}

// ---------------------------------------------------------------------
// Step 2: parse the page, filter excluded domains, print the top hits.
// ---------------------------------------------------------------------
echo "\n2. Parsing search results...\n";
$results = parseYahooResults($html);

if (empty($results)) {
    echo "   No results found in HTML\n";
    // Hints for diagnosing why the parser found nothing.
    echo "   Checking for r.search.yahoo.com: " . (strpos($html, 'r.search.yahoo.com') !== false ? 'Found' : 'Not found') . "\n";
    echo "   Checking for algo class: " . (strpos($html, 'algo') !== false ? 'Found' : 'Not found') . "\n";

    // Save for debugging, unless the same content was already written above.
    if (!$debugSaved) {
        $debugSaved = file_put_contents($debugFile, $html) !== false;
    }
    echo $debugSaved
        ? "   Saved response to $debugFile\n"
        : "   Could not write $debugFile\n";
} else {
    echo "   Found " . count($results) . " results:\n";

    $filtered = [];
    foreach ($results as $result) {
        if (!isExcludedDomain($result['url'], $excludedDomains)) {
            $filtered[] = $result;
        }
    }

    echo "   After filtering: " . count($filtered) . " results\n\n";

    // Truncate by characters when mbstring is available so a multi-byte
    // UTF-8 character is never cut in half; fall back to bytes otherwise.
    $maxTitleLength = 60;
    $hasMb = function_exists('mb_substr');

    foreach (array_slice($filtered, 0, 10) as $i => $result) {
        $num = $i + 1;
        $title = $result['title'];
        $titleLength = $hasMb ? mb_strlen($title, 'UTF-8') : strlen($title);

        // Only add the ellipsis when the title was actually shortened.
        if ($titleLength > $maxTitleLength) {
            $title = ($hasMb
                ? mb_substr($title, 0, $maxTitleLength, 'UTF-8')
                : substr($title, 0, $maxTitleLength)) . '...';
        }

        echo "   $num. $title\n";
        echo "      URL: " . $result['url'] . "\n";
    }
}

echo "\n=== Test Complete ===\n";
