<?php
/**
 * ===========================================
 * FLOWBOT DCI - URL PRE-PROCESSOR v2.2
 * ===========================================
 * Validates and optimizes URLs before fetching
 * Reduces errors by filtering invalid URLs and
 * distributing load across domains
 *
 * v2.2 Changes:
 * - Social media domains (Twitter, Facebook, Instagram, TikTok, Telegram) now allowed
 * - Only truly inaccessible domains blocked (Snapchat, WhatsApp, Discord, LinkedIn)
 * - Added social media detection for special handling
 */

declare(strict_types=1);

namespace FlowbotDCI\Utils;

/**
 * Validates a batch of URLs before they are fetched and reorders the valid
 * ones so that consecutive requests hit different domains.
 *
 * Domain comparisons are done on whole host labels: a configured domain
 * matches the host itself and any of its subdomains, never a host that merely
 * contains the domain as a substring (e.g. "x.com" does not match
 * "dropbox.com").
 */
class UrlPreProcessor
{
    /** Maximum accepted URL length in bytes; longer URLs are rejected. */
    private const MAX_URL_LENGTH = 2048;

    // v2.2: Only block domains that truly don't have public accessible content
    // Social media with public posts (Twitter, Facebook, Instagram, TikTok, Telegram) are now ALLOWED
    private array $blockedDomains = [
        'snapchat.com',     // Ephemeral stories, no public access
        'whatsapp.com',     // Private messaging only
        'discord.com',      // Mostly private servers
        'linkedin.com',     // Requires login for most content
    ];

    // v2.2: Social media domains with public content (NOT blocked, but flagged for special handling).
    // Single source of truth: domain => platform name. The key order is the order
    // returned by getSocialMediaDomains().
    private array $socialMediaPlatforms = [
        'facebook.com' => 'facebook',
        'fb.com' => 'facebook',
        'fb.watch' => 'facebook',
        'instagram.com' => 'instagram',
        'twitter.com' => 'twitter',
        'x.com' => 'twitter',
        'tiktok.com' => 'tiktok',
        'telegram.org' => 'telegram',
        't.me' => 'telegram',
    ];

    // File extensions that do not point to HTML documents
    private array $nonHtmlExtensions = [
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.rar', '.7z', '.tar', '.gz',
        '.mp3', '.mp4', '.avi', '.mkv', '.mov', '.wmv',
        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
        '.exe', '.msi', '.dmg', '.apk',
        '.css', '.js', '.json', '.xml', '.csv',
    ];

    /**
     * Pre-process URLs: validate, filter, and optimize order.
     *
     * Each string entry is trimmed and validated with validateUrl(). Entries
     * that are not strings are reported as invalid instead of aborting the
     * whole batch.
     *
     * @param array $urls Raw URLs to check.
     * @return array{
     *     valid: list<string>,
     *     invalid: list<array{url: mixed, reason: string}>,
     *     optimized: list<string>,
     *     stats: array{total: int, valid: int, invalid: int, domains: int}
     * }
     */
    public function process(array $urls): array
    {
        $valid = [];
        $invalid = [];

        foreach ($urls as $url) {
            if (!is_string($url)) {
                // One malformed entry must not abort the rest of the batch.
                $invalid[] = [
                    'url' => $url,
                    'reason' => 'URL is not a string',
                ];
                continue;
            }

            $url = trim($url);
            $validation = $this->validateUrl($url);

            if ($validation['valid']) {
                $valid[] = $url;
            } else {
                $invalid[] = [
                    'url' => $url,
                    'reason' => $validation['reason'],
                ];
            }
        }

        // Optimize order: spread URLs across domains to avoid overloading one host
        $optimized = $this->optimizeOrder($valid);

        return [
            'valid' => $valid,
            'invalid' => $invalid,
            'optimized' => $optimized,
            'stats' => [
                'total' => count($urls),
                'valid' => count($valid),
                'invalid' => count($invalid),
                'domains' => $this->countDomains($valid),
            ],
        ];
    }

    /**
     * Validate a single URL.
     *
     * The URL is checked as given (it is not trimmed here). Checks run in
     * order and the first failure determines the reported reason.
     *
     * @param string $url URL to validate.
     * @return array{valid: bool, reason: string|null} `reason` is null when valid.
     */
    public function validateUrl(string $url): array
    {
        // 1. Basic format check
        if ($url === '') {
            return ['valid' => false, 'reason' => 'Empty URL'];
        }

        // 2. Must start with http:// or https://
        if (!preg_match('/^https?:\/\//i', $url)) {
            return ['valid' => false, 'reason' => 'Missing protocol'];
        }

        // 3. Validate the URL format
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return ['valid' => false, 'reason' => 'Invalid URL format'];
        }

        // 4. Extract host (parse_url may return null or false, never assume a string)
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return ['valid' => false, 'reason' => 'No host found'];
        }

        // 5. Check blocked domains (the domain itself or any subdomain of it)
        foreach ($this->blockedDomains as $blocked) {
            if ($this->hostMatchesDomain($host, $blocked)) {
                return ['valid' => false, 'reason' => "Blocked domain: $blocked"];
            }
        }

        // 6. Check the file extension of the path
        $path = parse_url($url, PHP_URL_PATH);
        $path = is_string($path) ? strtolower($path) : '';
        foreach ($this->nonHtmlExtensions as $ext) {
            if (str_ends_with($path, $ext)) {
                return ['valid' => false, 'reason' => "Non-HTML extension: $ext"];
            }
        }

        // 7. Check URL length (very long URLs cause problems)
        if (strlen($url) > self::MAX_URL_LENGTH) {
            return ['valid' => false, 'reason' => 'URL too long'];
        }

        return ['valid' => true, 'reason' => null];
    }

    /**
     * Optimize URL order to distribute load across domains.
     *
     * URLs are grouped by base domain (in order of first appearance) and then
     * interleaved round-robin: first URL of every domain, then the second of
     * every domain, and so on. The relative order inside a domain is kept.
     *
     * @param array $urls URLs to reorder; each entry is expected to be a string.
     * @return array Reindexed list containing the same URLs in interleaved order.
     */
    public function optimizeOrder(array $urls): array
    {
        if (count($urls) <= 1) {
            return array_values($urls);
        }

        // Group by base domain
        $byDomain = [];
        foreach ($urls as $url) {
            $byDomain[$this->baseDomainOfUrl($url, 'unknown')][] = $url;
        }

        // Interleave URLs of different domains (round-robin)
        $optimized = [];
        $maxCount = max(array_map('count', $byDomain));

        for ($i = 0; $i < $maxCount; $i++) {
            foreach ($byDomain as $domainUrls) {
                if (isset($domainUrls[$i])) {
                    $optimized[] = $domainUrls[$i];
                }
            }
        }

        return $optimized;
    }

    /**
     * Get base domain (without subdomains).
     *
     * The host is lower-cased and stripped of surrounding dots first, so
     * "WWW.Example.com." and "example.com" share one base domain. IP addresses
     * are returned whole. Otherwise the last two labels are used; multi-label
     * public suffixes such as "co.uk" are NOT recognised, so "a.co.uk" and
     * "b.co.uk" both map to "co.uk".
     */
    private function getBaseDomain(string $host): string
    {
        $host = $this->normalizeHost($host);

        // Splitting an IPv4 address on dots would yield its last two octets.
        if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
            return $host;
        }

        $parts = explode('.', $host);
        $count = count($parts);

        if ($count >= 2) {
            return $parts[$count - 2] . '.' . $parts[$count - 1];
        }

        return $host;
    }

    /**
     * Count unique base domains among the given URLs.
     */
    private function countDomains(array $urls): int
    {
        $domains = [];
        foreach ($urls as $url) {
            $domains[$this->baseDomainOfUrl($url, '')] = true;
        }

        return count($domains);
    }

    /**
     * Add blocked domain.
     *
     * The domain is stored lower-cased without surrounding whitespace or dots.
     * Empty values and domains that are already blocked are ignored.
     */
    public function addBlockedDomain(string $domain): void
    {
        $domain = $this->normalizeHost($domain);

        if ($domain === '' || in_array($domain, $this->blockedDomains, true)) {
            return;
        }

        $this->blockedDomains[] = $domain;
    }

    /**
     * Remove blocked domain (compared case-insensitively).
     */
    public function removeBlockedDomain(string $domain): void
    {
        $target = $this->normalizeHost($domain);

        // array_values keeps the property a gap-free list after filtering.
        $this->blockedDomains = array_values(array_filter(
            $this->blockedDomains,
            fn (string $d): bool => $this->normalizeHost($d) !== $target
        ));
    }

    /**
     * v2.2: Check if URL is from a social media platform
     */
    public function isSocialMedia(string $url): bool
    {
        return $this->getSocialMediaPlatform($url) !== null;
    }

    /**
     * v2.2: Get social media platform name from URL
     * @return string|null Platform name or null if not social media
     */
    public function getSocialMediaPlatform(string $url): ?string
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return null;
        }

        foreach ($this->socialMediaPlatforms as $domain => $platform) {
            if ($this->hostMatchesDomain($host, $domain)) {
                return $platform;
            }
        }

        return null;
    }

    /**
     * v2.2: Get list of social media domains
     */
    public function getSocialMediaDomains(): array
    {
        return array_keys($this->socialMediaPlatforms);
    }

    /**
     * v2.2: Get list of blocked domains
     */
    public function getBlockedDomains(): array
    {
        return $this->blockedDomains;
    }

    /**
     * Base domain of a URL's host, or of $fallback when the URL has no
     * parseable host.
     */
    private function baseDomainOfUrl(string $url, string $fallback): string
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            $host = $fallback;
        }

        return $this->getBaseDomain($host);
    }

    /**
     * True when $host is $domain itself or one of its subdomains.
     * Comparison is case-insensitive; an empty host or domain never matches.
     */
    private function hostMatchesDomain(string $host, string $domain): bool
    {
        $host = $this->normalizeHost($host);
        $domain = $this->normalizeHost($domain);

        if ($host === '' || $domain === '') {
            return false;
        }

        // The leading dot anchors the match on a label boundary.
        return $host === $domain || str_ends_with($host, '.' . $domain);
    }

    /**
     * Lower-case a host name and strip surrounding whitespace and dots.
     */
    private function normalizeHost(string $host): string
    {
        return strtolower(trim($host, " \t\n\r\0\x0B."));
    }
}
