<?php
/**
 * ===========================================
 * FLOWBOT DCI - ROBOTS.TXT VALIDATOR v1.0
 * ===========================================
 * Validates URLs against robots.txt rules
 *
 * Features:
 * - Parse robots.txt files
 * - Check if URLs are allowed for crawling
 * - Respect Crawl-delay directives
 * - Cache robots.txt per domain
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

/**
 * Decides whether URLs may be crawled according to each site's robots.txt.
 *
 * robots.txt is fetched once per origin (scheme + host + non-default port),
 * parsed into per-user-agent rule groups and kept in an in-memory cache for
 * the lifetime of the instance (or until clearCache() is called).
 *
 * Parsed rules have the shape:
 *   array<string, array{disallow: list<string>, allow: list<string>, crawl-delay: int|null}>
 * keyed by the lower-cased user-agent token ("*" for the wildcard group).
 */
class RobotsValidator
{
    public const VERSION = '1.0';

    private string $userAgent = 'FlowbotDCI';
    private int $timeout = 10;
    private array $cache = []; // [origin => parsed rules]
    private int $defaultCrawlDelay = 1;

    /**
     * @param array $config Optional settings: 'userAgent' (string),
     *                      'timeout' (seconds, int) and 'defaultCrawlDelay' (seconds, int).
     */
    public function __construct(array $config = [])
    {
        // Explicit casts: with strict_types a numeric string coming from a
        // config file would otherwise raise a TypeError on assignment.
        $this->userAgent = (string) ($config['userAgent'] ?? 'FlowbotDCI');
        $this->timeout = (int) ($config['timeout'] ?? 10);
        $this->defaultCrawlDelay = (int) ($config['defaultCrawlDelay'] ?? 1);
    }

    /**
     * Check if a URL is allowed to be crawled
     *
     * @param string $url URL to check
     * @param string|null $userAgent User agent to check for (null = use default)
     * @return bool True if allowed, false if disallowed
     */
    public function isAllowed(string $url, ?string $userAgent = null): bool
    {
        $userAgent = $userAgent ?? $this->userAgent;

        $parsed = parse_url($url);
        if (!is_array($parsed) || !isset($parsed['host'])) {
            return true; // Invalid URL, allow by default
        }

        // Rules are matched against path + query, so that patterns such as
        // "/*?session=" can apply.
        $path = $parsed['path'] ?? '/';
        if ($path === '') {
            $path = '/';
        }
        if (isset($parsed['query'])) {
            $path .= '?' . $parsed['query'];
        }

        $rules = $this->getRules($this->buildOrigin($parsed));

        return $this->checkRules($rules, $path, $userAgent);
    }

    /**
     * Get crawl delay for a domain
     *
     * The delay of the group matching the configured user agent is preferred,
     * then the wildcard group's, then the configured default.
     *
     * @param string $domain Domain or URL (a missing scheme is treated as https)
     * @return int Crawl delay in seconds
     */
    public function getCrawlDelay(string $domain): int
    {
        // Normalize domain (scheme check is case-insensitive: "HTTP://x" is valid)
        if (!preg_match('#^https?://#i', $domain)) {
            $domain = 'https://' . $domain;
        }

        $parsed = parse_url($domain);
        if (!is_array($parsed) || !isset($parsed['host'])) {
            return $this->defaultCrawlDelay;
        }

        $rules = $this->getRules($this->buildOrigin($parsed));

        // Check for crawl-delay for our user agent
        $ua = strtolower($this->userAgent);
        if (isset($rules[$ua]['crawl-delay'])) {
            return (int) $rules[$ua]['crawl-delay'];
        }

        // Check wildcard
        if (isset($rules['*']['crawl-delay'])) {
            return (int) $rules['*']['crawl-delay'];
        }

        return $this->defaultCrawlDelay;
    }

    /**
     * Build the "scheme://host[:port]" origin that owns a robots.txt file.
     *
     * Scheme and host are lower-cased; the port is kept only when it is not
     * the default for the scheme, so equivalent URLs share one cache entry.
     *
     * @param array $parts Result of parse_url(); must contain 'host'
     */
    private function buildOrigin(array $parts): string
    {
        // Scheme-relative URLs ("//host/path") have no scheme; assume https,
        // matching what getCrawlDelay() does for bare domains.
        $scheme = strtolower($parts['scheme'] ?? 'https');
        $origin = $scheme . '://' . strtolower($parts['host']);

        if (isset($parts['port'])) {
            $port = (int) $parts['port'];
            $isDefaultPort = ($scheme === 'http' && $port === 80)
                || ($scheme === 'https' && $port === 443);
            if (!$isDefaultPort) {
                $origin .= ':' . $port;
            }
        }

        return $origin;
    }

    /**
     * Get robots.txt rules for a domain
     *
     * Results (including the empty rule set produced by a failed fetch) are
     * cached per origin.
     *
     * @param string $domain Origin in "scheme://host[:port]" form
     * @return array Parsed rules, see class description for the shape
     */
    private function getRules(string $domain): array
    {
        // Check cache
        if (isset($this->cache[$domain])) {
            return $this->cache[$domain];
        }

        $robotsUrl = rtrim($domain, '/') . '/robots.txt';
        $rules = $this->parseRobotsTxt($this->fetchRobotsTxt($robotsUrl));

        $this->cache[$domain] = $rules;

        return $rules;
    }

    /**
     * Fetch robots.txt content
     *
     * @return string Response body, or '' when the file is missing, empty or
     *                could not be retrieved (which results in "allow everything")
     */
    private function fetchRobotsTxt(string $url): string
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 3,
            CURLOPT_USERAGENT => $this->userAgent,
            // Never let the request or a redirect leave HTTP(S).
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        // No curl_close(): it has been a no-op since PHP 8.0; the handle is
        // released when $ch goes out of scope.

        // If not found or error, allow everything
        if ($httpCode !== 200 || !is_string($response) || $response === '') {
            return '';
        }

        return $response;
    }

    /**
     * Parse robots.txt content into rules
     *
     * Consecutive "User-agent" lines form one group and all of them receive
     * the rules that follow. Empty Allow/Disallow values are ignored, unknown
     * directives (e.g. Sitemap) are skipped, and a fractional Crawl-delay is
     * rounded up to whole seconds.
     *
     * @return array Parsed rules, see class description for the shape
     */
    private function parseRobotsTxt(string $content): array
    {
        $rules = [];
        $groupAgents = [];        // agents the current rule lines apply to
        $collectingAgents = false; // true while reading a run of User-agent lines

        // Drop a UTF-8 BOM so the first directive is still recognised.
        if (str_starts_with($content, "\xEF\xBB\xBF")) {
            $content = substr($content, 3);
        }

        $lines = preg_split('/\r\n|\r|\n/', $content) ?: [];

        foreach ($lines as $line) {
            // Remove comments
            $hashPos = strpos($line, '#');
            if ($hashPos !== false) {
                $line = substr($line, 0, $hashPos);
            }
            $line = trim($line);

            if ($line === '' || !str_contains($line, ':')) {
                continue;
            }

            [$field, $value] = explode(':', $line, 2);
            $field = strtolower(trim($field));
            $value = trim($value);

            if ($field === 'user-agent') {
                if (!$collectingAgents) {
                    // First User-agent line after some rules: new group.
                    $groupAgents = [];
                    $collectingAgents = true;
                }

                $agent = strtolower($value);
                $groupAgents[] = $agent;
                $rules[$agent] ??= [
                    'disallow' => [],
                    'allow' => [],
                    'crawl-delay' => null,
                ];
                continue;
            }

            if (!in_array($field, ['disallow', 'allow', 'crawl-delay'], true)) {
                continue; // unknown directive: must not affect grouping
            }

            // A rule line ends the run of User-agent lines.
            $collectingAgents = false;

            foreach ($groupAgents as $agent) {
                if ($field === 'crawl-delay') {
                    if (is_numeric($value)) {
                        $rules[$agent]['crawl-delay'] = max(0, (int) ceil((float) $value));
                    }
                } elseif ($value !== '') {
                    $rules[$agent][$field][] = $value;
                }
            }
        }

        return $rules;
    }

    /**
     * Check if path is allowed by rules
     *
     * The group for the exact (case-insensitive) user agent is used, falling
     * back to the "*" group. Within the group the longest matching pattern
     * decides; when an Allow and a Disallow pattern of equal length both
     * match, Allow wins.
     */
    private function checkRules(array $rules, string $path, string $userAgent): bool
    {
        $group = $rules[strtolower($userAgent)] ?? $rules['*'] ?? null;

        if ($group === null) {
            return true; // No rules for this user agent
        }

        $disallowLength = $this->longestMatchLength($group['disallow'] ?? [], $path);
        if ($disallowLength === 0) {
            return true; // Nothing forbids this path
        }

        return $this->longestMatchLength($group['allow'] ?? [], $path) >= $disallowLength;
    }

    /**
     * Length in bytes of the longest pattern matching $path, or 0 if none match.
     *
     * @param array $patterns List of robots.txt path patterns
     */
    private function longestMatchLength(array $patterns, string $path): int
    {
        $longest = 0;

        foreach ($patterns as $pattern) {
            $length = strlen($pattern);
            if ($length > $longest && $this->matchPath($path, $pattern)) {
                $longest = $length;
            }
        }

        return $longest;
    }

    /**
     * Match path against robots.txt pattern
     *
     * Patterns are prefix matches. "*" matches any run of characters and a
     * trailing "$" anchors the pattern to the end of the path; every other
     * character is matched literally.
     */
    private function matchPath(string $path, string $pattern): bool
    {
        if ($pattern === '') {
            return false;
        }

        $anchored = str_ends_with($pattern, '$');

        // Plain prefix match (also covers the exact-match case)
        if (!$anchored && !str_contains($pattern, '*')) {
            return str_starts_with($path, $pattern);
        }

        if ($anchored) {
            $pattern = substr($pattern, 0, -1);
        }

        // Quote everything, then re-enable only the "*" wildcard.
        $regex = '#^'
            . str_replace('\*', '.*', preg_quote($pattern, '#'))
            . ($anchored ? '$' : '')
            . '#sD';

        return preg_match($regex, $path) === 1;
    }

    /**
     * Clear cache
     *
     * Forces robots.txt to be fetched again on the next lookup for every origin.
     */
    public function clearCache(): void
    {
        $this->cache = [];
    }

    /**
     * Set user agent
     *
     * The value is used both as the HTTP User-Agent when fetching robots.txt
     * and as the default agent whose rule group is consulted.
     */
    public function setUserAgent(string $userAgent): self
    {
        $this->userAgent = $userAgent;
        return $this;
    }
}
