<?php
/**
 * ===========================================
 * FLOWBOT DCI - WEB SCRAPER SERVICE v2.4
 * ===========================================
 * Handles parallel HTTP requests using cURL multi
 * With enhanced anti-bot evasion, smart delays, and social media support
 *
 * v2.4 Changes (PERF-005/PERF-006):
 * - DNS cache timeout (default 120s) to avoid repeated lookups
 * - TCP Keep-Alive support for connection reuse
 * - DNS pre-fetch functionality for batch processing warmup
 * - Configurable DNS and Keep-Alive settings
 *
 * v2.3 Changes:
 * - Smart DomainRateLimiter integration for 429 handling
 * - Retry-After header parsing and respect
 * - Aggressive backoff for problematic domains
 * - Domain cooldown periods
 * - Auto-recovery when domain responds well
 *
 * v2.2 Changes:
 * - 30+ User-Agents for better rotation
 * - Dynamic headers per request (randomized)
 * - Smart delays per domain based on response codes
 * - Retry strategies with different fingerprints for 403 errors
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

class WebScraper
{
    private bool $sslVerify;

    // BACK-003: Circuit breaker for failing domains
    private ?CircuitBreaker $circuitBreaker = null;

    // v2.3: Smart domain rate limiter for 429 handling
    private ?DomainRateLimiter $rateLimiter = null;

    // v2.3: Response headers storage for Retry-After parsing
    private array $responseHeaders = [];

    // v2.2: Expanded User-Agent pool (30+ browsers) for better anti-bot evasion
    private array $userAgents = [
        // Chrome Windows (5 variants)
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        // Chrome Mac (4 variants)
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        // Chrome Linux (2 variants)
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        // Firefox Windows (4 variants)
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:119.0) Gecko/20100101 Firefox/119.0',
        // Firefox Mac (2 variants)
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
        // Firefox Linux (2 variants)
        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
        // Safari Mac (4 variants)
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        // Edge Windows (4 variants)
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.0.0',
        // Mobile iOS Safari (4 variants)
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
        // Mobile Android Chrome (4 variants)
        'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
        // Googlebot (for sites that allow bots)
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Googlebot/2.1 (+http://www.google.com/bot.html)',
    ];

    // v2.2: Accept header variants for randomization
    private array $acceptHeaders = [
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    ];

    // v2.2: Accept-Language variants
    private array $languages = [
        'en-US,en;q=0.9',
        'en-US,en;q=0.9,es;q=0.8',
        'en-GB,en;q=0.9,en-US;q=0.8',
        'en-US,en;q=0.9,pt;q=0.8',
        'en,en-US;q=0.9',
        'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    ];

    // v2.2: Sec-Ch-Ua variants (Chrome client hints)
    private array $secChUaVariants = [
        '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        '"Chromium";v="120", "Not_A Brand";v="24", "Google Chrome";v="120"',
        '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        '"Microsoft Edge";v="120", "Chromium";v="120", "Not_A Brand";v="24"',
    ];

    // v2.2: Platform variants
    private array $platforms = [
        '"Windows"',
        '"macOS"',
        '"Linux"',
    ];

    // v2.2: Smart delays per domain (adaptive based on response codes)
    private array $domainDelays = [];

    // Rate limiting: max concurrent requests per domain
    private int $maxPerDomain = 3;

    // PERF-005: DNS cache timeout in seconds (default: 120s = 2 minutes)
    private int $dnsCacheTimeout = 120;

    // PERF-005: Keep-Alive settings
    private bool $tcpKeepAlive = true;
    private int $tcpKeepIdle = 60;    // Seconds before sending keep-alive probes
    private int $tcpKeepInterval = 30; // Interval between keep-alive probes

    // PERF-006: Pre-fetched DNS domains (already resolved)
    private array $prefetchedDomains = [];

    // Tracking per domain
    private array $domainCounts = [];

    // Cookie file for session persistence
    private string $cookieFile;

    // SEC-008: Private cookie directory path
    private string $cookieDir;

    /**
     * Prepare the scraper and its private cookie jar (SEC-008).
     *
     * Cookies are kept in a "temp/.cookies" directory two levels above this
     * file's directory instead of the shared system temp directory, in a file
     * whose name carries a random suffix.
     *
     * @param bool $sslVerify Whether cURL should verify TLS peers and host names
     */
    public function __construct(bool $sslVerify = true)
    {
        $this->sslVerify = $sslVerify;
        $this->cookieDir = dirname(__DIR__, 2) . '/temp/.cookies';

        // Owner-only directory; creation is best-effort (failures are suppressed).
        if (is_dir($this->cookieDir) === false) {
            @mkdir($this->cookieDir, 0700, true);
        }

        // 16 random bytes (32 hex characters) make the jar name unguessable.
        $suffix = bin2hex(random_bytes(16));
        $this->cookieFile = $this->cookieDir . '/session_' . $suffix . '.txt';

        // Keep cookie jars out of version control.
        $ignoreFile = $this->cookieDir . '/.gitignore';
        if (file_exists($ignoreFile) === false) {
            @file_put_contents($ignoreFile, "*\n!.gitignore\n");
        }
    }

    /**
     * Destructor: removes this instance's cookie file by delegating to
     * cleanupCookieFile(), which also purges session files older than one hour.
     */
    public function __destruct()
    {
        $this->cleanupCookieFile();
    }

    /**
     * SEC-008: Remove this instance's cookie jar and purge stale jars.
     *
     * Deletes the current session file (if present), then removes every
     * session_*.txt file in the cookie directory whose modification time is
     * more than one hour old. Uninitialised properties are skipped, so the
     * method can run on a partially constructed instance (e.g. from the
     * destructor) and may be called repeatedly.
     */
    private function cleanupCookieFile(): void
    {
        if (isset($this->cookieFile) && file_exists($this->cookieFile)) {
            @unlink($this->cookieFile);
        }

        if (!isset($this->cookieDir) || !is_dir($this->cookieDir)) {
            return;
        }

        $cutoff = time() - 3600; // 1 hour
        foreach (glob($this->cookieDir . '/session_*.txt') ?: [] as $file) {
            // Another worker may delete the file between glob() and this stat.
            // filemtime() then returns false (with a warning), and false
            // compares as "older than cutoff" - so check it explicitly.
            $modifiedAt = @filemtime($file);
            if ($modifiedAt !== false && $modifiedAt < $cutoff) {
                @unlink($file);
            }
        }
    }

    /**
     * Pick one User-Agent string at random from the rotation pool.
     *
     * @return string A User-Agent header value
     */
    private function getRandomUserAgent(): string
    {
        $pool = $this->userAgents;
        $key = array_rand($pool);

        return $pool[$key];
    }

    /**
     * v2.2: Pick a random User-Agent belonging to one browser family.
     *
     * Families are addressed by numeric position in $this->userAgents, so the
     * ranges below must be kept in step with the order and size of the groups
     * in that pool. An unknown $type falls back to the Chrome family.
     *
     * @param string $type One of chrome, firefox, safari, edge, mobile, googlebot
     * @return string A User-Agent header value
     */
    private function getUserAgentByType(string $type): string
    {
        $chrome = range(0, 10);
        $families = [
            'chrome'    => $chrome,
            'firefox'   => range(11, 18),
            'safari'    => range(19, 22),
            'edge'      => range(23, 26),
            'mobile'    => range(27, 34),
            'googlebot' => range(35, 36),
        ];

        $candidates = isset($families[$type]) ? $families[$type] : $chrome;
        $slot = $candidates[array_rand($candidates)];

        // Guard against a pool shorter than the ranges assume.
        if (isset($this->userAgents[$slot])) {
            return $this->userAgents[$slot];
        }

        return $this->userAgents[0];
    }

    /**
     * v2.2: Build a randomized, browser-like request header list.
     *
     * Accept, Accept-Language, Sec-Fetch-Site and Cache-Control values, plus
     * the optional Referer, Sec-Ch-Ua* and DNT headers, are drawn at random on
     * every call so consecutive requests do not share one header fingerprint.
     *
     * @param string $url      Target URL; its host is used to build Referer candidates
     * @param bool   $isMobile Whether Sec-Ch-Ua-Mobile should advertise a mobile client
     * @return array List of "Name: value" header lines
     */
    private function getHeaders(string $url, bool $isMobile = false): array
    {
        $parts = parse_url($url);
        $referer = $this->generateReferer($parts['host'] ?? '');

        $accept = $this->acceptHeaders[array_rand($this->acceptHeaders)];
        $language = $this->languages[array_rand($this->languages)];

        $headers = [
            'Accept: ' . $accept,
            'Accept-Language: ' . $language,
            'Accept-Encoding: gzip, deflate, br',
            'Connection: keep-alive',
            'Upgrade-Insecure-Requests: 1',
        ];

        // Referer goes out on roughly 80% of requests, which looks more natural.
        if (rand(0, 100) > 20) {
            $headers[] = 'Referer: ' . $referer;
        }

        // Chrome client hints accompany roughly 70% of requests.
        if (rand(0, 100) > 30) {
            $brand = $this->secChUaVariants[array_rand($this->secChUaVariants)];
            $mobileFlag = $isMobile ? '?1' : '?0';
            $platform = $this->platforms[array_rand($this->platforms)];
            array_push(
                $headers,
                'Sec-Ch-Ua: ' . $brand,
                'Sec-Ch-Ua-Mobile: ' . $mobileFlag,
                'Sec-Ch-Ua-Platform: ' . $platform
            );
        }

        // Sec-Fetch headers; only the Site value varies.
        $fetchSite = rand(0, 1) === 0 ? 'same-origin' : 'none';
        array_push(
            $headers,
            'Sec-Fetch-Dest: document',
            'Sec-Fetch-Mode: navigate',
            'Sec-Fetch-Site: ' . $fetchSite,
            'Sec-Fetch-User: ?1'
        );

        // Cache-Control directive varies per request.
        $cacheDirectives = ['no-cache', 'max-age=0', 'no-store'];
        $headers[] = 'Cache-Control: ' . $cacheDirectives[array_rand($cacheDirectives)];

        // DNT appears on roughly 30% of requests.
        if (rand(0, 100) > 70) {
            $headers[] = 'DNT: 1';
        }

        return $headers;
    }

    /**
     * v2.2: Choose a plausible Referer URL for a request to $host.
     *
     * Candidates are the site's own origin (with and without a trailing
     * slash), Google's home page, a Google search for the host and DuckDuckGo.
     * The list also holds an empty entry; when that one is drawn the site's
     * origin is returned, so the result is never an empty string.
     *
     * @param string $host Host name of the target URL
     * @return string Referer URL
     */
    private function generateReferer(string $host): string
    {
        $origin = 'https://' . $host;

        $candidates = [
            $origin,
            $origin . '/',
            'https://www.google.com/',
            'https://www.google.com/search?q=' . urlencode($host),
            'https://duckduckgo.com/',
            '',
        ];

        $picked = $candidates[array_rand($candidates)];
        if ($picked === '') {
            return $origin;
        }

        return $picked;
    }

    /**
     * v2.2: Compute the delay to apply before hitting a domain.
     *
     * Starts from the domain's current adaptive delay (100 ms when the domain
     * has no recorded delay), applies a random jitter of up to +/-30% and
     * never returns less than 50 ms.
     *
     * @param string $domain Domain key used in $this->domainDelays
     * @return int Delay in milliseconds
     */
    private function getSmartDelay(string $domain): int
    {
        $base = isset($this->domainDelays[$domain]) ? $this->domainDelays[$domain] : 100;

        // Whole-percent jitter in [-30, 30], truncated to an integer offset.
        $percent = rand(-30, 30);
        $offset = (int)($base * ($percent / 100));
        $candidate = $base + $offset;

        return $candidate < 50 ? 50 : $candidate;
    }

    /**
     * v2.2: Adapt a domain's base delay to the HTTP status it just returned.
     *
     * 403 and 429 raise the delay by 50% (capped at 5000 ms); 200 lowers it by
     * 10% (floored at 50 ms); every other status leaves it untouched. A domain
     * without a recorded delay is treated as being at 100 ms.
     *
     * @param string $domain   Domain key used in $this->domainDelays
     * @param int    $httpCode HTTP status code of the completed request
     */
    private function adjustDomainDelay(string $domain, int $httpCode): void
    {
        $previous = $this->domainDelays[$domain] ?? 100;

        switch ($httpCode) {
            case 403:
            case 429:
                $raised = (int)($previous * 1.5);
                $this->domainDelays[$domain] = $raised > 5000 ? 5000 : $raised;
                break;

            case 200:
                $lowered = (int)($previous * 0.9);
                $this->domainDelays[$domain] = $lowered < 50 ? 50 : $lowered;
                break;
        }
    }

    /**
     * v2.2: Check if domain needs special handling (social media, protected sites)
     *
     * The URL's host must be one of the listed domains or a subdomain of one.
     * Comparison is case-insensitive and anchored on label boundaries, so
     * look-alikes that merely contain a listed name (e.g. "dropbox.com"
     * contains "x.com", "internet.me" contains "t.me") do not match.
     *
     * @param string $url URL to classify
     * @return bool True when the host belongs to a listed social-media domain;
     *              false otherwise, including when the URL has no parsable host
     */
    private function needsSpecialHandling(string $url): bool
    {
        $socialDomains = [
            'twitter.com', 'x.com', 'facebook.com', 'fb.com', 'fb.watch',
            'instagram.com', 'tiktok.com', 't.me', 'telegram.org',
        ];

        // parse_url() yields null when the host is absent and false for
        // seriously malformed URLs; neither may reach the string functions.
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        // Host names are case-insensitive and may carry a trailing root dot.
        $host = strtolower(rtrim($host, '.'));

        foreach ($socialDomains as $domain) {
            if ($host === $domain) {
                return true;
            }

            $suffix = '.' . $domain;
            if (substr($host, -strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;
    }

    /**
     * Set the maximum number of concurrent requests allowed per domain.
     *
     * @param int $max Desired limit; values below 1 are raised to 1
     * @return self This instance, for method chaining
     */
    public function setMaxPerDomain(int $max): self
    {
        $this->maxPerDomain = $max < 1 ? 1 : $max;

        return $this;
    }

    /**
     * BACK-003: Set circuit breaker for failing domains
     *
     * Once set, fetchBatch() asks the breaker whether each URL's host is
     * allowed before fetching and reports request successes and failures
     * back to it.
     *
     * @param CircuitBreaker $cb Breaker instance to consult and update
     * @return self This instance, for method chaining
     */
    public function setCircuitBreaker(CircuitBreaker $cb): self
    {
        $this->circuitBreaker = $cb;
        return $this;
    }

    /**
     * BACK-003: Get circuit breaker instance
     *
     * @return CircuitBreaker|null The configured breaker, or null when none has been set
     */
    public function getCircuitBreaker(): ?CircuitBreaker
    {
        return $this->circuitBreaker;
    }

    /**
     * v2.3: Set domain rate limiter for smart 429 handling
     *
     * Once set, fetchBatch() skips URLs whose domain is in cooldown, applies
     * the limiter's per-domain delay and concurrency allowance, records
     * successes and 429 responses, and saves the limiter state after each batch.
     *
     * @param DomainRateLimiter $limiter Limiter instance to consult and update
     * @return self This instance, for method chaining
     */
    public function setRateLimiter(DomainRateLimiter $limiter): self
    {
        $this->rateLimiter = $limiter;
        return $this;
    }

    /**
     * v2.3: Get domain rate limiter instance
     *
     * @return DomainRateLimiter|null The configured limiter, or null when none has been set
     */
    public function getRateLimiter(): ?DomainRateLimiter
    {
        return $this->rateLimiter;
    }

    /**
     * v2.3: Parse response headers from cURL
     *
     * Splits a raw header buffer into a lower-cased name => value map and
     * stores it in $this->responseHeaders[$url], replacing any earlier entry.
     *
     * When redirects are followed the buffer holds the header block of every
     * hop, so each HTTP status line resets the map: only headers of the last
     * response are kept. Both CRLF and bare LF (or CR) line endings are
     * accepted. When a header name repeats within one response, the last
     * value wins.
     *
     * @param string $headerString Raw header text as accumulated by the header callback
     * @param string $url          URL used as the storage key
     */
    private function parseResponseHeaders(string $headerString, string $url): void
    {
        $parsed = [];

        // preg_split() returns false only on a PCRE failure; treat as "no lines".
        $lines = preg_split('/\r\n|\n|\r/', $headerString) ?: [];

        foreach ($lines as $line) {
            // A status line marks the start of a new response (redirect hop).
            if (strncasecmp($line, 'HTTP/', 5) === 0) {
                $parsed = [];
                continue;
            }

            $colon = strpos($line, ':');
            if ($colon === false) {
                continue; // blank separator or malformed line
            }

            $name = strtolower(trim(substr($line, 0, $colon)));
            $parsed[$name] = trim(substr($line, $colon + 1));
        }

        $this->responseHeaders[$url] = $parsed;
    }

    /**
     * v2.3: Resolve the Retry-After header recorded for a URL into seconds.
     *
     * A numeric value is taken as a number of seconds; anything else is handed
     * to strtotime() and converted to the time remaining from now. The result
     * is never less than 1.
     *
     * @param string $url URL whose parsed headers should be consulted
     * @return int|null Seconds to wait, or null when the header is absent or unparsable
     */
    private function getRetryAfterSeconds(string $url): ?int
    {
        $raw = $this->responseHeaders[$url]['retry-after'] ?? null;
        if ($raw === null) {
            return null;
        }

        if (!is_numeric($raw)) {
            // Not a number of seconds, so try to read it as a date.
            $when = strtotime($raw);
            if ($when === false) {
                return null;
            }
            $seconds = $when - time();
        } else {
            $seconds = (int)$raw;
        }

        return $seconds < 1 ? 1 : $seconds;
    }

    /**
     * Fetch multiple URLs in parallel with rate limiting per domain
     *
     * URLs whose domain is in rate-limiter cooldown, or whose circuit breaker
     * is open, are rejected up front without a network request. The remaining
     * URLs are driven through a cURL multi handle with at most $maxConcurrent
     * transfers in flight and at most $this->maxPerDomain (or the rate
     * limiter's lower allowance) per domain, where "domain" is the last two
     * labels of the URL's host. Responses with status 200-399 and no cURL
     * error count as successes.
     *
     * Side effects: updates per-domain delays, the circuit breaker and the
     * rate limiter (if configured), removes the cookie file when done and
     * saves the rate limiter state.
     *
     * @param array $urls URLs to fetch; duplicates are fetched once
     * @param int $maxConcurrent Maximum concurrent requests (values below 1 are treated as 1)
     * @param int $timeout Timeout in seconds
     * @return array Results indexed by URL; each entry holds the keys
     *               success, html, http_code, response_time and error
     */
    public function fetchBatch(array $urls, int $maxConcurrent, int $timeout): array
    {
        $results = [];

        if (empty($urls)) {
            return $results;
        }

        // All per-request bookkeeping below is keyed by URL, so two in-flight
        // copies of one URL would overwrite each other's state and leak a
        // per-domain slot. Results are keyed by URL anyway, so fetching each
        // distinct URL once loses nothing.
        $urls = array_values(array_unique($urls));

        // A non-positive limit would never admit a request and loop forever.
        $maxConcurrent = max(1, $maxConcurrent);

        // v2.3: Filter out URLs with domains in cooldown (rate limiter)
        if ($this->rateLimiter) {
            foreach ($urls as $i => $url) {
                $domain = $this->rateLimiter->getDomain($url);
                if ($this->rateLimiter->isInCooldown($domain)) {
                    $remaining = $this->rateLimiter->getCooldownRemaining($domain);
                    $results[$url] = [
                        'success'       => false,
                        'html'          => '',
                        'http_code'     => 429,
                        'response_time' => 0,
                        'error'         => "Domain in cooldown for {$remaining}s due to rate limiting (429)",
                    ];
                    unset($urls[$i]);
                }
            }
            $urls = array_values($urls);
        }

        // BACK-003: Filter out URLs with open circuit breakers
        // NOTE(review): the breaker is queried with the full host here but is
        // updated further down with the two-label domain from $getDomain.
        // Confirm CircuitBreaker normalizes its keys; otherwise failures on
        // "www.example.com" are recorded under "example.com" and never block.
        if ($this->circuitBreaker) {
            foreach ($urls as $i => $url) {
                // parse_url() yields null/false when there is no usable host
                $host = parse_url($url, PHP_URL_HOST);
                $domain = is_string($host) ? $host : '';
                if (!$this->circuitBreaker->isAllowed($domain)) {
                    // Circuit is open - reject immediately
                    $results[$url] = [
                        'success'       => false,
                        'html'          => '',
                        'http_code'     => 0,
                        'response_time' => 0,
                        'error'         => 'Circuit breaker open - domain temporarily blocked due to repeated failures',
                    ];
                    unset($urls[$i]);
                }
            }
            $urls = array_values($urls); // Re-index
        }

        if (empty($urls)) {
            return $results;
        }

        // Reset domain counts
        $this->domainCounts = [];

        $mh = curl_multi_init();
        $handles = [];
        $handleDomains = []; // Track domain per handle
        $activeRequests = 0;
        $queue = $urls;
        $deferred = []; // URLs deferred due to rate limiting

        // Helper: get domain (last two host labels) from URL
        $getDomain = function (string $url): string {
            $host = parse_url($url, PHP_URL_HOST);
            if (!is_string($host)) {
                return ''; // no usable host; such URLs share the '' bucket
            }
            $parts = explode('.', $host);
            $count = count($parts);
            return $count >= 2 ? $parts[$count - 2] . '.' . $parts[$count - 1] : $host;
        };

        // Helper: can add URL (rate limiting check)
        // v2.3: Use dynamic max concurrent from rate limiter if available
        $canAddUrl = function (string $url) use ($getDomain): bool {
            $domain = $getDomain($url);
            $maxForDomain = $this->maxPerDomain;

            // v2.3: Check rate limiter for domain-specific limits
            if ($this->rateLimiter) {
                $maxForDomain = min($maxForDomain, $this->rateLimiter->getMaxConcurrent($domain));
            }

            return ($this->domainCounts[$domain] ?? 0) < $maxForDomain;
        };

        // Track start times for response time measurement
        $startTimes = [];

        // v2.3: Storage for response headers (for Retry-After parsing)
        $headerBuffers = [];

        // Function to add a new request
        $addRequest = function (string $url) use (&$mh, &$handles, &$handleDomains, &$activeRequests, &$startTimes, &$headerBuffers, $timeout, $getDomain): void {
            $domain = $getDomain($url);

            // v2.3: Apply smart delay from rate limiter before request
            if ($this->rateLimiter) {
                $delay = $this->rateLimiter->getDelay($domain);
                if ($delay > 0) {
                    usleep($delay * 1000); // Convert ms to microseconds
                }
            }

            $this->domainCounts[$domain] = ($this->domainCounts[$domain] ?? 0) + 1;
            $handleDomains[$url] = $domain;
            $startTimes[$url] = microtime(true);
            $headerBuffers[$url] = '';

            $ch = curl_init($url);

            // PERF-005: Build cURL options with DNS cache and Keep-Alive
            $curlOptions = [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS      => 5,
                CURLOPT_USERAGENT      => $this->getRandomUserAgent(),
                CURLOPT_SSL_VERIFYPEER => $this->sslVerify,
                CURLOPT_SSL_VERIFYHOST => $this->sslVerify ? 2 : 0,
                CURLOPT_TIMEOUT        => $timeout,
                CURLOPT_CONNECTTIMEOUT => min($timeout, 5),
                CURLOPT_PRIVATE        => $url, // lets the completion loop map a handle back to its URL
                CURLOPT_ENCODING       => '',
                CURLOPT_HTTPHEADER     => $this->getHeaders($url),
                CURLOPT_COOKIEJAR      => $this->cookieFile,
                CURLOPT_COOKIEFILE     => $this->cookieFile,
                // PERF-005: DNS cache to avoid repeated lookups
                CURLOPT_DNS_CACHE_TIMEOUT => $this->dnsCacheTimeout,
                // v2.3: Capture headers for Retry-After parsing
                CURLOPT_HEADERFUNCTION => function($curl, $header) use ($url, &$headerBuffers) {
                    $headerBuffers[$url] .= $header;
                    return strlen($header);
                },
            ];

            // PERF-005: Add TCP Keep-Alive settings if enabled
            if ($this->tcpKeepAlive) {
                $curlOptions[CURLOPT_TCP_KEEPALIVE] = 1;
                $curlOptions[CURLOPT_TCP_KEEPIDLE] = $this->tcpKeepIdle;
                $curlOptions[CURLOPT_TCP_KEEPINTVL] = $this->tcpKeepInterval;
            }

            curl_setopt_array($ch, $curlOptions);

            curl_multi_add_handle($mh, $ch);
            $handles[$url] = $ch;
            $activeRequests++;
        };

        // Function to get next URL respecting rate limits.
        // Returns null only after scanning both queues without finding an
        // admissible URL.
        $getNextUrl = function () use (&$queue, &$deferred, $canAddUrl): ?string {
            // Try from main queue first
            foreach ($queue as $i => $url) {
                if ($canAddUrl($url)) {
                    unset($queue[$i]);
                    return $url;
                }
            }
            // Try from deferred queue
            foreach ($deferred as $i => $url) {
                if ($canAddUrl($url)) {
                    unset($deferred[$i]);
                    return $url;
                }
            }
            // If can't add any, move first from queue to deferred
            if (!empty($queue)) {
                $url = array_shift($queue);
                $deferred[] = $url;
            }
            return null;
        };

        // Add initial batch of requests (with a delay to avoid HTTP 429)
        $requestCount = 0;
        while ($activeRequests < $maxConcurrent) {
            $url = $getNextUrl();
            if ($url === null) break;
            $addRequest($url);
            $requestCount++;
            // THROTTLE: short pause every 3 requests so servers are not flooded
            if ($requestCount % 3 === 0) {
                usleep(100000); // 100ms delay every 3 requests
            }
        }

        // Process requests
        $selectTimeout = max(0.3, 0.8 / max(1, $maxConcurrent));
        do {
            curl_multi_exec($mh, $running);
            curl_multi_select($mh, $selectTimeout);

            while ($info = curl_multi_info_read($mh)) {
                if ($info['msg'] === CURLMSG_DONE) {
                    $ch = $info['handle'];
                    $errno = curl_errno($ch);
                    $url = curl_getinfo($ch, CURLINFO_PRIVATE);
                    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

                    // Release the per-domain slot. This must also happen for
                    // the '' bucket (URLs without a host); otherwise those
                    // slots are never freed and later URLs can never start.
                    $domain = $handleDomains[$url] ?? '';
                    if (isset($this->domainCounts[$domain]) && $this->domainCounts[$domain] > 0) {
                        $this->domainCounts[$domain]--;
                    }

                    // Calculate response time
                    $responseTime = isset($startTimes[$url])
                        ? round(microtime(true) - $startTimes[$url], 3)
                        : 0;
                    unset($startTimes[$url]);

                    // v2.2: Adjust domain delay based on response
                    if ($domain) {
                        $this->adjustDomainDelay($domain, $httpCode);
                    }

                    if ($errno === 0 && $httpCode >= 200 && $httpCode < 400) {
                        $html = curl_multi_getcontent($ch);
                        $results[$url] = [
                            'success'       => true,
                            'html'          => $html,
                            'http_code'     => $httpCode,
                            'response_time' => $responseTime,
                            'error'         => null,
                        ];

                        // BACK-003: Record success in circuit breaker
                        if ($this->circuitBreaker && $domain) {
                            $this->circuitBreaker->recordSuccess($domain);
                        }

                        // v2.3: Record success in rate limiter for gradual recovery
                        if ($this->rateLimiter && $domain) {
                            $this->rateLimiter->recordSuccess($domain);
                        }
                    } else {
                        $error = $errno !== 0 ? curl_error($ch) : "HTTP $httpCode";
                        $results[$url] = [
                            'success'       => false,
                            'html'          => '',
                            'http_code'     => $httpCode,
                            'response_time' => $responseTime,
                            'error'         => $error,
                        ];

                        // BACK-003: Record failure in circuit breaker
                        if ($this->circuitBreaker && $domain) {
                            $errorType = $errno !== 0 ? 'curl_error' : "http_{$httpCode}";
                            $this->circuitBreaker->recordFailure($domain, $errorType);
                        }

                        // v2.3: Handle 429 with smart rate limiting
                        if ($httpCode === 429 && $this->rateLimiter && $domain) {
                            // Parse headers for Retry-After
                            $this->parseResponseHeaders($headerBuffers[$url] ?? '', $url);
                            $retryAfter = $this->getRetryAfterSeconds($url);

                            // Record 429 with Retry-After value
                            $this->rateLimiter->record429($domain, $retryAfter);

                            // Log the rate limiting
                            error_log("WebScraper: 429 from {$domain}, Retry-After: " .
                                     ($retryAfter ?? 'not specified') . "s");
                        }
                    }

                    // Clean up header buffer
                    unset($headerBuffers[$url]);

                    curl_multi_remove_handle($mh, $ch);
                    curl_close($ch);
                    unset($handles[$url]);
                    unset($handleDomains[$url]);
                    $activeRequests--;

                    // Add next URL respecting rate limits (with a short delay)
                    $nextUrl = $getNextUrl();
                    if ($nextUrl !== null) {
                        usleep(50000); // 50ms delay before the new request
                        $addRequest($nextUrl);
                    }
                }
            }

            // Re-check deferred queue (with throttle)
            while ($activeRequests < $maxConcurrent && (!empty($queue) || !empty($deferred))) {
                $nextUrl = $getNextUrl();
                if ($nextUrl === null) break;
                usleep(50000); // 50ms delay
                $addRequest($nextUrl);
            }

            // Stall guard: with nothing in flight every per-domain count is
            // zero, so if no queued URL was admissible just now none will
            // become admissible through request completion (e.g. the rate
            // limiter reports a zero concurrency allowance). Stop instead of
            // spinning; the leftovers are reported as "Not processed" below.
            if ($activeRequests === 0 && (!empty($queue) || !empty($deferred))) {
                error_log('WebScraper: ' . (count($queue) + count($deferred)) .
                          ' URL(s) could not be scheduled and were not processed');
                break;
            }

        } while ($running || $activeRequests > 0 || !empty($queue) || !empty($deferred));

        curl_multi_close($mh);

        // Ensure all URLs have a result
        foreach ($urls as $url) {
            if (!isset($results[$url])) {
                $results[$url] = [
                    'success'       => false,
                    'html'          => '',
                    'http_code'     => 0,
                    'response_time' => 0,
                    'error'         => 'Not processed',
                ];
            }
        }

        // SEC-008: Cleanup cookie file securely when done
        $this->cleanupCookieFile();

        // v2.3: Save rate limiter state
        if ($this->rateLimiter) {
            $this->rateLimiter->saveState();
        }

        return $results;
    }

    /**
     * Fetch a single URL by running it through fetchBatch() as a batch of one.
     *
     * @param string $url     URL to fetch
     * @param int    $timeout Timeout in seconds
     * @return array Result with success, html, http_code, response_time, error
     */
    public function fetch(string $url, int $timeout = 10): array
    {
        // Returned only if the batch produced no entry for this URL.
        $fallback = [
            'success'       => false,
            'html'          => '',
            'http_code'     => 0,
            'response_time' => 0,
            'error'         => 'Unknown error'
        ];

        $batch = $this->fetchBatch([$url], 1, $timeout);

        return isset($batch[$url]) ? $batch[$url] : $fallback;
    }

    /**
     * Append a custom User-Agent string to the end of the rotation pool.
     *
     * @param string $userAgent User-Agent header value to add
     * @return self This instance, for method chaining
     */
    public function addUserAgent(string $userAgent): self
    {
        array_push($this->userAgents, $userAgent);

        return $this;
    }

    /**
     * Get all User-Agents in the pool
     *
     * @return array The built-in User-Agent strings followed by any added
     *               through addUserAgent()
     */
    public function getUserAgents(): array
    {
        return $this->userAgents;
    }

    /**
     * v2.2: Fetch URL with retry strategies
     *
     * Tries a sequence of browser fingerprints (chrome, firefox, safari,
     * mobile, edge, googlebot) until one succeeds. A failed attempt is retried
     * with the next fingerprint only when its status code is 403, 429 or 0
     * (no HTTP response); any other failure is returned straight away. Each
     * strategy's delay is slept between attempts, but not after the final one.
     *
     * @param string $url URL to fetch
     * @param int $timeout Timeout in seconds
     * @return array Result with success, html, http_code, response_time, error
     */
    public function fetchWithRetryStrategies(string $url, int $timeout = 15): array
    {
        // Define retry strategies with different fingerprints (delay in ms)
        $strategies = [
            ['type' => 'chrome', 'mobile' => false, 'delay' => 500],
            ['type' => 'firefox', 'mobile' => false, 'delay' => 1000],
            ['type' => 'safari', 'mobile' => false, 'delay' => 1500],
            ['type' => 'mobile', 'mobile' => true, 'delay' => 2000],
            ['type' => 'edge', 'mobile' => false, 'delay' => 2500],
            ['type' => 'googlebot', 'mobile' => false, 'delay' => 1000], // Some sites allow bots
        ];

        // Status codes worth retrying with another fingerprint
        $retryableCodes = [403, 429, 0];
        $lastIndex = count($strategies) - 1;
        $lastResult = null;

        foreach ($strategies as $index => $strategy) {
            $result = $this->fetchWithFingerprint(
                $url,
                $strategy['type'],
                $strategy['mobile'],
                $timeout
            );

            // If successful, return immediately
            if ($result['success']) {
                return $result;
            }

            $lastResult = $result;

            // Anything other than 403/429/0 will not improve with a new fingerprint
            if (!in_array((int)$result['http_code'], $retryableCodes, true)) {
                return $result;
            }

            // Wait before the next attempt; sleeping after the final strategy
            // would only postpone returning the failure.
            if ($index < $lastIndex) {
                usleep($strategy['delay'] * 1000);
            }
        }

        // All strategies failed
        return $lastResult ?? [
            'success' => false,
            'html' => '',
            'http_code' => 0,
            'response_time' => 0,
            'error' => 'All retry strategies failed',
        ];
    }

    /**
     * v2.2: Fetch URL with specific browser fingerprint
     *
     * Performs one blocking cURL request. The User-Agent comes from
     * getUserAgentByType() and the request headers from getHeaders().
     * Redirects are followed (at most 5). After the request the domain
     * delay is adjusted via adjustDomainDelay() using the received HTTP code.
     *
     * A request counts as successful when cURL reports no error and the final
     * HTTP code is in the range 200-399.
     *
     * NOTE(review): the cookie jar path is read from $this->cookieFile as-is;
     * this method neither creates nor cleans up that file — confirm the
     * property is initialised before this method is reached.
     *
     * @param string $url         URL to fetch
     * @param string $browserType Fingerprint name passed to getUserAgentByType(); echoed back as 'strategy'
     * @param bool   $isMobile    Passed to getHeaders() as its second argument
     * @param int    $timeout     Total timeout in seconds; the connect timeout is min($timeout, 5)
     * @return array Result with success, html, http_code, response_time, error, strategy
     */
    private function fetchWithFingerprint(string $url, string $browserType, bool $isMobile, int $timeout): array
    {
        $startTime = microtime(true);

        $ch = curl_init($url);

        // curl_init() returns false when no handle could be created. Passing
        // false to the curl_* calls below would raise a TypeError, so report
        // the failure in the normal result shape instead.
        if ($ch === false) {
            return [
                'success'       => false,
                'html'          => '',
                'http_code'     => 0,
                'response_time' => round(microtime(true) - $startTime, 3),
                'error'         => 'Failed to initialize cURL handle',
                'strategy'      => $browserType,
            ];
        }

        $userAgent = $this->getUserAgentByType($browserType);
        $headers = $this->getHeaders($url, $isMobile);

        // PERF-005: Build cURL options with DNS cache and Keep-Alive
        $curlOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS      => 5,
            CURLOPT_USERAGENT      => $userAgent,
            CURLOPT_SSL_VERIFYPEER => $this->sslVerify,
            CURLOPT_SSL_VERIFYHOST => $this->sslVerify ? 2 : 0,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_CONNECTTIMEOUT => min($timeout, 5),
            // Empty string lets cURL advertise every encoding it supports
            CURLOPT_ENCODING       => '',
            CURLOPT_HTTPHEADER     => $headers,
            CURLOPT_COOKIEJAR      => $this->cookieFile,
            CURLOPT_COOKIEFILE     => $this->cookieFile,
            // PERF-005: DNS cache to avoid repeated lookups
            CURLOPT_DNS_CACHE_TIMEOUT => $this->dnsCacheTimeout,
        ];

        // PERF-005: Add TCP Keep-Alive settings if enabled
        if ($this->tcpKeepAlive) {
            $curlOptions[CURLOPT_TCP_KEEPALIVE] = 1;
            $curlOptions[CURLOPT_TCP_KEEPIDLE] = $this->tcpKeepIdle;
            $curlOptions[CURLOPT_TCP_KEEPINTVL] = $this->tcpKeepInterval;
        }

        curl_setopt_array($ch, $curlOptions);

        // Read error state and status before the handle is closed
        $html = curl_exec($ch);
        $errno = curl_errno($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = $errno !== 0 ? curl_error($ch) : null;

        curl_close($ch);

        $responseTime = round(microtime(true) - $startTime, 3);

        // Adjust domain delay; skipped when no host can be extracted from the URL
        $domain = parse_url($url, PHP_URL_HOST) ?? '';
        if ($domain) {
            $this->adjustDomainDelay($domain, $httpCode);
        }

        if ($errno === 0 && $httpCode >= 200 && $httpCode < 400) {
            return [
                'success'       => true,
                'html'          => $html,
                'http_code'     => $httpCode,
                'response_time' => $responseTime,
                'error'         => null,
                'strategy'      => $browserType,
            ];
        }

        // Failure: prefer the cURL error text, otherwise report the HTTP status
        return [
            'success'       => false,
            'html'          => '',
            'http_code'     => $httpCode,
            'response_time' => $responseTime,
            'error'         => $error ?? "HTTP $httpCode",
            'strategy'      => $browserType,
        ];
    }

    /**
     * v2.2: Expose the current domain delay table (debugging aid).
     *
     * @return array Current contents of the domainDelays property
     */
    public function getDomainDelays(): array
    {
        $delays = $this->domainDelays;

        return $delays;
    }

    /**
     * v2.2: Discard every stored domain delay.
     *
     * @return self This instance, so calls can be chained
     */
    public function resetDomainDelays(): self
    {
        $emptyTable = [];
        $this->domainDelays = $emptyTable;

        return $this;
    }

    /**
     * PERF-006: Pre-fetch DNS for a batch of URLs
     *
     * Opens a connect-only cURL handle (no request is transferred) for the
     * first URL seen of each host that has not been pre-fetched yet, and runs
     * all of them in parallel through cURL multi. Hosts whose connection
     * attempt finishes without a cURL error are remembered in
     * $this->prefetchedDomains.
     *
     * Intended to speed up subsequent requests (an earlier revision of this
     * comment estimated 50-200ms per domain; that figure is not verified here).
     * NOTE(review): every handle is closed before returning, so any benefit
     * presumably comes from resolver caches outside these handles — confirm.
     *
     * Return value, kept as-is for backward compatibility:
     * - empty $urls: an empty list;
     * - no host left to resolve (all already pre-fetched, or no URL had a
     *   parseable host): ALL hosts pre-fetched so far by this instance;
     * - otherwise: only the hosts newly resolved by this call.
     *
     * @param array $urls URLs to pre-fetch DNS for; non-string entries are skipped
     * @param int $timeout Timeout in seconds, used for both connect and total time
     * @return array List of host names (see above for the exact meaning)
     */
    public function prefetchDns(array $urls, int $timeout = 5): array
    {
        if (empty($urls)) {
            return [];
        }

        // Extract unique domains from URLs
        $domains = [];
        foreach ($urls as $url) {
            // parse_url() only accepts strings; ignore anything else instead of failing
            if (!is_string($url)) {
                continue;
            }

            $host = parse_url($url, PHP_URL_HOST);
            if ($host && !isset($domains[$host]) && !isset($this->prefetchedDomains[$host])) {
                $domains[$host] = $url;
            }
        }

        if (empty($domains)) {
            return array_keys($this->prefetchedDomains);
        }

        // Use cURL multi to resolve DNS in parallel
        $mh = curl_multi_init();
        $handles = [];

        foreach ($domains as $host => $url) {
            $ch = curl_init($url);
            if ($ch === false) {
                // No handle could be created for this URL; leave the host unresolved
                continue;
            }

            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_NOBODY         => true,           // HEAD request only
                CURLOPT_CONNECT_ONLY   => true,           // Just connect, don't transfer
                CURLOPT_TIMEOUT        => $timeout,
                CURLOPT_CONNECTTIMEOUT => $timeout,
                CURLOPT_SSL_VERIFYPEER => $this->sslVerify,
                CURLOPT_SSL_VERIFYHOST => $this->sslVerify ? 2 : 0,
                CURLOPT_DNS_CACHE_TIMEOUT => $this->dnsCacheTimeout,
                CURLOPT_PRIVATE        => $host,          // Read back when the transfer completes
            ]);

            curl_multi_add_handle($mh, $ch);
            $handles[] = $ch;
        }

        // Execute all DNS resolutions in parallel
        $running = 0;
        do {
            $status = curl_multi_exec($mh, $running);

            // Stop on a multi-level error instead of looping on a stuck $running
            if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
                break;
            }

            // A select failure (-1) returns immediately; pause briefly to avoid a busy loop
            if ($running > 0 && curl_multi_select($mh, 0.1) === -1) {
                usleep(1000);
            }
        } while ($running > 0);

        // Collect results; 'result' carries the cURL error code of each finished transfer
        $resolved = [];
        while ($info = curl_multi_info_read($mh)) {
            if ($info['msg'] !== CURLMSG_DONE) {
                continue;
            }

            if ($info['result'] === CURLE_OK) {
                $host = curl_getinfo($info['handle'], CURLINFO_PRIVATE);
                $this->prefetchedDomains[$host] = true;
                $resolved[] = $host;
            }
        }

        // Release every handle that was added, whether or not it reported completion
        foreach ($handles as $ch) {
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        curl_multi_close($mh);

        return $resolved;
    }

    /**
     * PERF-006: Tell whether a domain is recorded as pre-fetched.
     *
     * @param string $domain Host name to look up in the pre-fetch table
     * @return bool True when the table holds a non-null entry for $domain
     */
    public function isDnsPrefetched(string $domain): bool
    {
        $entry = $this->prefetchedDomains[$domain] ?? null;

        return $entry !== null;
    }

    /**
     * PERF-006: Forget every domain recorded as pre-fetched.
     *
     * @return self This instance, so calls can be chained
     */
    public function clearDnsPrefetch(): self
    {
        $emptyTable = [];
        $this->prefetchedDomains = $emptyTable;

        return $this;
    }

    /**
     * PERF-005: Configure DNS cache timeout
     *
     * The stored value is later passed to cURL as CURLOPT_DNS_CACHE_TIMEOUT.
     * Values below -1 have no defined meaning and are clamped to -1.
     *
     * @param int $seconds Seconds to cache DNS (0 = disabled, -1 = forever)
     * @return self This instance, so calls can be chained
     */
    public function setDnsCacheTimeout(int $seconds): self
    {
        // libcurl rejects values below -1 for this option, and a rejected option
        // makes curl_setopt_array() skip every option listed after it.
        $this->dnsCacheTimeout = max(-1, $seconds);
        return $this;
    }

    /**
     * PERF-005: Configure TCP Keep-Alive settings
     *
     * Both timing values are raised to at least 1 before being stored. They
     * are stored even when $enable is false.
     *
     * @param bool $enable   Enable/disable Keep-Alive
     * @param int  $idle     Seconds before first keep-alive probe
     * @param int  $interval Seconds between subsequent probes
     * @return self This instance, so calls can be chained
     */
    public function setTcpKeepAlive(bool $enable, int $idle = 60, int $interval = 30): self
    {
        // Anything below one second is raised to 1
        $idleSeconds = $idle < 1 ? 1 : $idle;
        $intervalSeconds = $interval < 1 ? 1 : $interval;

        $this->tcpKeepAlive = $enable;
        $this->tcpKeepIdle = $idleSeconds;
        $this->tcpKeepInterval = $intervalSeconds;

        return $this;
    }
}
