<?php
/**
 * ===========================================
 * FLOWBOT DCI - CRAWL MANAGER v1.0
 * ===========================================
 * BFS (Breadth-First Search) Deep Crawler
 * Crawls websites layer by layer with real-time SSE events
 *
 * Features:
 * - BFS algorithm for layer-by-layer crawling
 * - Depth tracking and limiting
 * - URL queue management
 * - Visited URL deduplication
 * - Search term matching
 * - Real-time SSE event emission
 * - Auto-process integration with UrlProcessor
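 * - Rate limiting integration (limiter is injectable via setRateLimiter(); not yet consulted by this class)
 * - Circuit breaker protection (breaker is injectable via setCircuitBreaker(); not yet consulted by this class)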
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

use FlowbotDCI\Core\Database;

/**
 * Breadth-first crawler that walks a site one depth level at a time and
 * reports progress as event arrays (yielded from startCrawl() and, when a
 * callback is registered, pushed to that callback as well).
 *
 * Typical usage: construct with a scraper and a link extractor, optionally
 * attach a database / event callback, call configure(), then iterate the
 * generator returned by startCrawl().
 */
class CrawlManager
{
    public const VERSION = '1.0';

    // Dependencies
    private WebScraper $scraper;
    private LinkExtractor $extractor;
    // NOTE(review): the rate limiter and circuit breaker are stored by their
    // setters but are not read anywhere in this class.
    private ?DomainRateLimiter $rateLimiter = null;
    private ?CircuitBreaker $circuitBreaker = null;
    private ?Database $database = null;

    // Configuration (see configure() for the accepted ranges)
    private int $maxPages = 100;
    private int $maxDepth = 3;
    private int $maxParallel = 5;
    private bool $sameDomainOnly = true;
    private string $searchTerm = '';
    private array $includePatterns = [];
    private array $excludePatterns = [];
    private bool $respectRobots = true;    // NOTE(review): stored but not consulted in this class
    private bool $autoProcess = true;
    private int $timeout = 10;

    // State
    private array $visited = [];           // Set of fetched URLs: [url => true]
    private array $queue = [];             // BFS queue: [depth => list of URLs]; entries are never removed
    private array $queued = [];            // Set mirroring every URL in $queue, for O(1) membership tests
    private int $visitedCount = 0;         // Total pages visited (successful or failed)
    private int $foundCount = 0;           // Pages where the search term was found
    private int $errorCount = 0;           // Failed requests
    private int $processedCount = 0;       // Matching pages counted for auto-processing
    private string $crawlId = '';          // Unique crawl job ID
    private string $status = 'pending';    // pending, running, stopped, completed, failed
    private float $startTime = 0;
    private string $startUrl = '';         // URL the current crawl was started from
    private ?string $startDomain = null;   // Starting domain for same-domain filtering
    private bool $stopped = false;         // Set by stop() to end the crawl early

    /** @var callable|null Receives ($type, $data) for every event created */
    private $eventCallback = null;

    /**
     * @param WebScraper    $scraper   Fetches batches of URLs
     * @param LinkExtractor $extractor Validates URLs and extracts/filters links
     */
    public function __construct(WebScraper $scraper, LinkExtractor $extractor)
    {
        $this->scraper = $scraper;
        $this->extractor = $extractor;
        $this->crawlId = uniqid('crawl_', true);
    }

    /**
     * Set rate limiter for domain-based throttling
     */
    public function setRateLimiter(DomainRateLimiter $limiter): self
    {
        $this->rateLimiter = $limiter;
        return $this;
    }

    /**
     * Set circuit breaker for failure protection
     */
    public function setCircuitBreaker(CircuitBreaker $breaker): self
    {
        $this->circuitBreaker = $breaker;
        return $this;
    }

    /**
     * Set database for persistence
     */
    public function setDatabase(Database $db): self
    {
        $this->database = $db;
        return $this;
    }

    /**
     * Set event callback for SSE emission
     *
     * The callback is invoked as $callback(string $type, array $data) each
     * time an event is created.
     */
    public function setEventCallback(callable $callback): self
    {
        $this->eventCallback = $callback;
        return $this;
    }

    /**
     * Configure crawler options
     *
     * Every call resets all options; keys that are absent fall back to the
     * defaults shown below. Numeric options are clamped to their range.
     *
     * Recognised keys: maxPages (1-10000, default 100), maxDepth (1-20,
     * default 3), maxParallel (1-50, default 5), sameDomainOnly (default
     * true), searchTerm (trimmed, default ''), includePatterns,
     * excludePatterns (default []), respectRobots (default true),
     * autoProcess (default true), timeout (1-60, default 10).
     *
     * @param array<string, mixed> $options
     */
    public function configure(array $options): self
    {
        $this->maxPages = min(max((int)($options['maxPages'] ?? 100), 1), 10000);
        $this->maxDepth = min(max((int)($options['maxDepth'] ?? 3), 1), 20);
        $this->maxParallel = min(max((int)($options['maxParallel'] ?? 5), 1), 50);
        $this->sameDomainOnly = (bool)($options['sameDomainOnly'] ?? true);
        // Explicit casts: under strict_types a non-string term or a scalar
        // pattern would otherwise raise a TypeError here.
        $this->searchTerm = trim((string)($options['searchTerm'] ?? ''));
        $this->includePatterns = (array)($options['includePatterns'] ?? []);
        $this->excludePatterns = (array)($options['excludePatterns'] ?? []);
        $this->respectRobots = (bool)($options['respectRobots'] ?? true);
        $this->autoProcess = (bool)($options['autoProcess'] ?? true);
        $this->timeout = min(max((int)($options['timeout'] ?? 10), 1), 60);

        return $this;
    }

    /**
     * Start BFS crawl from a URL
     *
     * Resets all crawl state, then processes depth 0..maxDepth in order,
     * stopping early when stop() was called or maxPages was reached.
     *
     * @param string $startUrl Starting URL
     * @return \Generator Yields event arrays (see createEvent()) during crawl
     */
    public function startCrawl(string $startUrl): \Generator
    {
        // Validate start URL
        if (!$this->extractor->isValidUrl($startUrl)) {
            $this->status = 'failed';

            yield $this->createEvent('error', [
                'message' => "Invalid start URL: {$startUrl}",
                'code' => 'INVALID_URL',
            ]);
            return;
        }

        // Initialize state
        $this->visited = [];
        $this->queue = [];
        $this->queued = [];
        $this->visitedCount = 0;
        $this->foundCount = 0;
        $this->errorCount = 0;
        $this->processedCount = 0;
        $this->startTime = microtime(true);
        $this->status = 'running';
        $this->stopped = false;
        $this->startUrl = $startUrl;
        $this->startDomain = $this->extractor->extractDomain($startUrl);

        // Add start URL to queue at depth 0
        $this->addToQueue($startUrl, 0);

        // Emit start event
        yield $this->createEvent('status', [
            'message' => "Starting crawl from {$startUrl}",
            'crawlId' => $this->crawlId,
            'config' => [
                'maxPages' => $this->maxPages,
                'maxDepth' => $this->maxDepth,
                'maxParallel' => $this->maxParallel,
                'sameDomainOnly' => $this->sameDomainOnly,
                // Strict comparison so that a term such as "0" is still shown.
                'searchTerm' => $this->searchTerm !== '' ? $this->searchTerm : '(none)',
            ],
        ]);

        // Save to database if available
        $this->saveCrawlJob();

        // Process each depth level (BFS)
        for ($depth = 0; $depth <= $this->maxDepth; $depth++) {
            if ($this->shouldHalt()) {
                break;
            }

            if (empty($this->queue[$depth])) {
                continue;
            }

            yield $this->createEvent('status', [
                'message' => "Processing depth {$depth}...",
                'depth' => $depth,
                'urlsAtDepth' => count($this->queue[$depth]),
            ]);

            // Process all URLs at current depth
            yield from $this->processLevel($depth);
        }

        // Finalize
        $this->status = $this->stopped ? 'stopped' : 'completed';
        $elapsed = round(microtime(true) - $this->startTime, 2);

        $this->updateCrawlJob();

        // A very fast crawl can round to 0.0 seconds; guard the division so
        // the final event is still emitted instead of a DivisionByZeroError.
        $pagesPerSecond = ($this->visitedCount > 0 && $elapsed > 0)
            ? round($this->visitedCount / $elapsed, 2)
            : 0;

        yield $this->createEvent('done', [
            'message' => "Crawl completed",
            'crawlId' => $this->crawlId,
            'visitedCount' => $this->visitedCount,
            'foundCount' => $this->foundCount,
            'errorCount' => $this->errorCount,
            'processedCount' => $this->processedCount,
            'elapsed' => $elapsed,
            'pagesPerSecond' => $pagesPerSecond,
        ]);
    }

    /**
     * Whether the crawl loop must end: stop() was requested or the page
     * budget is exhausted.
     */
    private function shouldHalt(): bool
    {
        return $this->stopped || $this->visitedCount >= $this->maxPages;
    }

    /**
     * Process all URLs at a specific depth level
     *
     * URLs are fetched in batches of at most maxParallel; each result is
     * recorded as visited and turned into events.
     */
    private function processLevel(int $depth): \Generator
    {
        $batches = array_chunk($this->queue[$depth] ?? [], $this->maxParallel);

        foreach ($batches as $batch) {
            if ($this->shouldHalt()) {
                break;
            }

            // Drop already visited URLs and re-index, so the scraper gets a
            // plain list rather than a sparse array.
            $pending = array_values(
                array_filter($batch, fn($url) => !isset($this->visited[$url]))
            );

            // Never request more pages than the remaining budget; anything
            // beyond it would be fetched only to be discarded below.
            $pending = array_slice($pending, 0, $this->maxPages - $this->visitedCount);

            if ($pending === []) {
                continue;
            }

            // Fetch batch of URLs
            $results = $this->scraper->fetchBatch($pending, $this->maxParallel, $this->timeout);

            foreach ($results as $url => $result) {
                if ($this->shouldHalt()) {
                    break;
                }

                // PHP turns numeric-looking string keys into ints; normalise
                // so the typed helpers below always receive a string.
                $url = (string)$url;

                // Mark as visited
                $this->visited[$url] = true;
                $this->visitedCount++;

                if (!empty($result['success'])) {
                    yield from $this->handleFetchSuccess($url, $depth, $result);
                } else {
                    yield from $this->handleFetchFailure($url, $depth, $result);
                }
            }
        }
    }

    /**
     * Handle one successfully fetched page: emit the visit/found event,
     * persist it, and queue its links for the next depth.
     *
     * @param array<string, mixed> $result Scraper result for $url
     */
    private function handleFetchSuccess(string $url, int $depth, array $result): \Generator
    {
        $html = (string)($result['html'] ?? '');
        $httpCode = (int)($result['http_code'] ?? 200);
        $responseTime = (float)($result['response_time'] ?? 0);

        // Check for search term
        $hasSearchTerm = $this->containsSearchTerm($html);
        if ($hasSearchTerm) {
            $this->foundCount++;
        }

        // Emit visit event
        yield $this->createEvent($hasSearchTerm ? 'found' : 'visit', [
            'message' => $hasSearchTerm
                ? "Found term in: {$url}"
                : "Visited: {$url}",
            'url' => $url,
            'depth' => $depth,
            'httpCode' => $httpCode,
            // The value is multiplied by 1000 for display.
            // NOTE(review): presumably seconds -> milliseconds; confirm with the scraper.
            'responseTime' => round($responseTime * 1000),
            'foundTerm' => $hasSearchTerm,
            'visitedCount' => $this->visitedCount,
            'foundCount' => $this->foundCount,
        ]);

        // Save URL to database
        $this->saveCrawlUrl($url, $depth, $hasSearchTerm ? 'found' : 'visited', $httpCode, $responseTime);

        // Extract and queue new links (for next depth)
        if ($depth < $this->maxDepth) {
            $links = $this->extractAndFilterLinks($html, $url);

            foreach ($links as $link) {
                $this->addToQueue($link, $depth + 1);
            }

            if (!empty($links)) {
                yield $this->createEvent('status', [
                    'message' => "Extracted " . count($links) . " new links from {$url}",
                    'newLinks' => count($links),
                    'queueSize' => $this->getQueueSize(),
                ]);
            }
        }

        // Auto-process if enabled
        if ($this->autoProcess && $hasSearchTerm) {
            $this->processedCount++;
            // Note: Actual processing happens after crawl completes
        }
    }

    /**
     * Handle one failed fetch: count it, emit an error event and persist it.
     *
     * @param array<string, mixed> $result Scraper result for $url
     */
    private function handleFetchFailure(string $url, int $depth, array $result): \Generator
    {
        $this->errorCount++;
        $error = (string)($result['error'] ?? 'Unknown error');
        $httpCode = (int)($result['http_code'] ?? 0);

        yield $this->createEvent('error', [
            'message' => "Failed: {$url} - {$error}",
            'url' => $url,
            'depth' => $depth,
            'error' => $error,
            'httpCode' => $httpCode,
            'visitedCount' => $this->visitedCount,
            'errorCount' => $this->errorCount,
        ]);

        $this->saveCrawlUrl($url, $depth, 'error', $httpCode, 0, $error);
    }

    /**
     * Extract links from HTML and apply filters
     *
     * @return string[] List of links that are neither visited nor queued,
     *                  with duplicates within the page collapsed
     */
    private function extractAndFilterLinks(string $html, string $baseUrl): array
    {
        // Extract all links
        $links = $this->extractor->extractLinks($html, $baseUrl);

        // Filter non-content URLs (images, PDFs, etc.)
        $links = $this->extractor->filterNonContent($links);

        // Filter by domain if same-domain only
        if ($this->sameDomainOnly && $this->startDomain) {
            $links = $this->extractor->filterByDomain($links, $this->startDomain, true);
        }

        // Apply custom patterns
        if (!empty($this->includePatterns) || !empty($this->excludePatterns)) {
            $links = $this->extractor->filterByPattern($links, $this->includePatterns, $this->excludePatterns);
        }

        // Deduplicate against visited
        $links = $this->extractor->deduplicate($links, array_keys($this->visited));

        // Deduplicate against the queue and against repeats within this page,
        // so the reported "newLinks" count matches what actually gets queued.
        $fresh = [];
        $seen = [];
        foreach ($links as $link) {
            if (isset($this->queued[$link]) || isset($seen[$link])) {
                continue;
            }
            $seen[$link] = true;
            $fresh[] = $link;
        }

        return $fresh;
    }

    /**
     * Check if search term exists in HTML
     *
     * With no search term configured every page counts as a match.
     */
    private function containsSearchTerm(string $html): bool
    {
        // Compare against '' explicitly: empty() would also treat the valid
        // search term "0" as "no term" and report every page as a match.
        if ($this->searchTerm === '') {
            return true; // No search term = all pages match
        }

        // Case-insensitive search (byte-wise; case folding covers ASCII only)
        return stripos($html, $this->searchTerm) !== false;
    }

    /**
     * Add URL to BFS queue at specified depth
     */
    private function addToQueue(string $url, int $depth): void
    {
        // Skip if already visited or in queue
        if (isset($this->visited[$url]) || isset($this->queued[$url])) {
            return;
        }

        $this->queue[$depth][] = $url;
        $this->queued[$url] = true;
    }

    /**
     * Check if URL is already in any queue level
     *
     * Uses the $queued set, which is updated together with $queue.
     */
    private function isInQueue(string $url): bool
    {
        return isset($this->queued[$url]);
    }

    /**
     * Get total queue size across all depths
     *
     * Queue entries are not removed once fetched, so this is the number of
     * URLs enqueued so far, not the number still waiting.
     */
    private function getQueueSize(): int
    {
        return (int)array_sum(array_map('count', $this->queue));
    }

    /**
     * Create an event array
     *
     * Also forwards ($type, $data) to the event callback when one is set.
     *
     * @return array{event: string, data: array, timestamp: string, crawlId: string}
     */
    private function createEvent(string $type, array $data): array
    {
        // Take a single clock reading so the seconds and the milliseconds
        // belong to the same instant, and format in UTC because the string
        // carries a 'Z' suffix.
        $now = microtime(true);
        $seconds = (int)floor($now);
        $millis = (int)floor(($now - $seconds) * 1000);

        $event = [
            'event' => $type,
            'data' => $data,
            'timestamp' => gmdate('Y-m-d\TH:i:s', $seconds) . sprintf('.%03dZ', $millis),
            'crawlId' => $this->crawlId,
        ];

        // Call event callback if set
        if ($this->eventCallback !== null) {
            ($this->eventCallback)($type, $data);
        }

        return $event;
    }

    /**
     * Stop the crawler
     *
     * The running crawl notices the flag at its next batch/result check.
     */
    public function stop(): void
    {
        $this->stopped = true;
        $this->status = 'stopped';
    }

    /**
     * Get current crawl statistics
     *
     * @return array<string, mixed> Counters, status, elapsed seconds and the
     *                              main configuration values
     */
    public function getStats(): array
    {
        return [
            'crawlId' => $this->crawlId,
            'status' => $this->status,
            'visitedCount' => $this->visitedCount,
            'foundCount' => $this->foundCount,
            'errorCount' => $this->errorCount,
            'processedCount' => $this->processedCount,
            'queueSize' => $this->getQueueSize(),
            'elapsed' => $this->startTime > 0 ? round(microtime(true) - $this->startTime, 2) : 0,
            'config' => [
                'maxPages' => $this->maxPages,
                'maxDepth' => $this->maxDepth,
                'maxParallel' => $this->maxParallel,
                'searchTerm' => $this->searchTerm,
            ],
        ];
    }

    /**
     * Get all found URLs (where search term was matched)
     *
     * Reads from the crawler_urls table; returns an empty list when no
     * database is attached.
     */
    public function getFoundUrls(): array
    {
        if (!$this->database) {
            return [];
        }

        $stmt = $this->database->prepare("
            SELECT url, depth, http_code, response_time, visited_at
            FROM crawler_urls
            WHERE job_id = ? AND found_term = 1
            ORDER BY depth ASC, visited_at ASC
        ");
        $stmt->execute([$this->crawlId]);

        return $stmt->fetchAll(\PDO::FETCH_ASSOC);
    }

    /**
     * Get all visited URLs
     */
    public function getVisitedUrls(): array
    {
        return array_keys($this->visited);
    }

    /**
     * Save crawl job to database
     *
     * Best effort: failures are logged and do not interrupt the crawl.
     */
    private function saveCrawlJob(): void
    {
        if (!$this->database) {
            return;
        }

        try {
            $stmt = $this->database->prepare("
                INSERT INTO crawler_jobs (id, type, start_url, search_term, config, status, started_at, created_at)
                VALUES (?, 'deep', ?, ?, ?, ?, NOW(), NOW())
                ON DUPLICATE KEY UPDATE status = VALUES(status), started_at = VALUES(started_at)
            ");

            $stmt->execute([
                $this->crawlId,
                // Persist the URL the crawl really started from, not a URL
                // rebuilt from the domain (which loses scheme and path).
                $this->startUrl,
                $this->searchTerm,
                json_encode([
                    'maxPages' => $this->maxPages,
                    'maxDepth' => $this->maxDepth,
                    'maxParallel' => $this->maxParallel,
                    'sameDomainOnly' => $this->sameDomainOnly,
                ]),
                $this->status,
            ]);
        } catch (\Exception $e) {
            error_log("CrawlManager: Failed to save crawl job: " . $e->getMessage());
        }
    }

    /**
     * Update crawl job in database
     *
     * Writes the final status and counters. Best effort: failures are logged.
     */
    private function updateCrawlJob(): void
    {
        if (!$this->database) {
            return;
        }

        try {
            $stmt = $this->database->prepare("
                UPDATE crawler_jobs
                SET status = ?,
                    visited_count = ?,
                    found_count = ?,
                    processed_count = ?,
                    error_count = ?,
                    completed_at = NOW()
                WHERE id = ?
            ");

            $stmt->execute([
                $this->status,
                $this->visitedCount,
                $this->foundCount,
                $this->processedCount,
                $this->errorCount,
                $this->crawlId,
            ]);
        } catch (\Exception $e) {
            error_log("CrawlManager: Failed to update crawl job: " . $e->getMessage());
        }
    }

    /**
     * Save URL to database
     *
     * found_term is stored as 1 only when $status is 'found'. Best effort:
     * failures are logged.
     */
    private function saveCrawlUrl(string $url, int $depth, string $status, int $httpCode, float $responseTime, ?string $error = null): void
    {
        if (!$this->database) {
            return;
        }

        try {
            $stmt = $this->database->prepare("
                INSERT INTO crawler_urls (job_id, url, depth, status, found_term, http_code, response_time, error_message, visited_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, NOW())
            ");

            $stmt->execute([
                $this->crawlId,
                $url,
                $depth,
                $status,
                $status === 'found' ? 1 : 0,
                $httpCode,
                $responseTime,
                $error,
            ]);
        } catch (\Exception $e) {
            error_log("CrawlManager: Failed to save URL: " . $e->getMessage());
        }
    }

    /**
     * Get crawl ID
     */
    public function getCrawlId(): string
    {
        return $this->crawlId;
    }
}
