<?php
/**
 * ===========================================
 * FLOWBOT DCI - METADATA EXTRACTOR SERVICE v2.2
 * ===========================================
 * Extracts metadata from HTML content
 * With integrated social media extraction support
 *
 * v2.2 Changes:
 * - Integrated SocialMediaExtractor for Twitter, Facebook, Instagram, TikTok, Telegram
 * - Priority: Social media extraction first, then standard extraction
 * - Merged metadata from both sources for best coverage
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

use DOMDocument;
use DOMXPath;

/**
 * Extracts display metadata (title, description, thumbnail, favicon, host)
 * from an HTML document, optionally enriched by a platform-specific social
 * media extractor.
 *
 * The instance keeps per-call parsing state (DOM, XPath, parsed URL) that is
 * populated by extract() and released again before extract() returns. The
 * public get*() helpers read that state; when called outside extract() they
 * have no document to inspect and return their fallback values.
 */
class MetadataExtractor
{
    /** DOM of the document currently being extracted (null between calls). */
    private ?DOMDocument $dom = null;

    /** XPath evaluator bound to $dom (null between calls). */
    private ?DOMXPath $xpath = null;

    /** Origin ("scheme://host[:port]") of the page being extracted. */
    private string $baseUrl = '';

    /** Directory part of the page path, always starting and ending with "/". */
    private string $basePath = '/';

    /** Fallback values, keyed by 'thumbnail' and 'favicon'. */
    private array $defaults;

    // Cached parse_url() result for the current page, to avoid repeated parsing.
    private ?array $parsedUrl = null;

    // v2.2: Social media extractor for specialized extraction
    private ?SocialMediaExtractor $socialExtractor = null;

    // v2.2: Registrable domains treated as social media (subdomains included).
    private array $socialMediaDomains = [
        'twitter.com', 'x.com',
        'facebook.com', 'fb.com', 'fb.watch',
        'instagram.com',
        'tiktok.com',
        'telegram.org', 't.me',
    ];

    /**
     * @param array $defaults Overrides for the fallback 'thumbnail' and
     *                        'favicon' values.
     */
    public function __construct(array $defaults = [])
    {
        $this->defaults = array_merge([
            'thumbnail' => 'fallback_image.jpg',
            'favicon'   => 'default_favicon.ico',
        ], $defaults);

        // v2.2: Initialize social media extractor
        $this->socialExtractor = new SocialMediaExtractor();
    }

    /**
     * Extract all metadata from an HTML document.
     *
     * Social media URLs are first handed to the specialized extractor; the
     * standard extraction always runs as well so that every field has a
     * fallback value. Parsing state is released even if extraction throws.
     *
     * @param string $html Raw HTML of the page (may be empty).
     * @param string $url  URL the HTML was fetched from.
     *
     * @return array Always contains 'title', 'description', 'thumbnail',
     *               'favicon', 'host' and 'domain'; social media pages may
     *               add platform-specific keys and 'is_social_media'.
     */
    public function extract(string $html, string $url): array
    {
        // parse_url() returns false for seriously malformed URLs; normalize
        // to an empty array so the typed property and the lookups stay valid.
        $parts = parse_url($url);
        $this->parsedUrl = is_array($parts) ? $parts : [];
        $host = $this->parsedUrl['host'] ?? '';

        try {
            // v2.2: Try specialized extraction first for social media URLs.
            $socialData = [];
            if ($this->socialExtractor !== null && $this->isSocialMedia($url)) {
                $extracted = $this->socialExtractor->extract($url, $html);
                // NOTE(review): the extractor's return type is not visible
                // here; anything other than an array is ignored.
                if (is_array($extracted)) {
                    $socialData = $extracted;
                }
            }

            // Standard extraction (always done, provides fallback values).
            $this->loadHtml($html);
            $this->baseUrl = $this->getBaseUrl($url);
            $this->basePath = $this->getBasePath();

            $standardResult = [
                'title'       => $this->getTitle(),
                'description' => $this->getDescription(),
                'thumbnail'   => $this->getThumbnail(),
                'favicon'     => $this->getFavicon(),
                'host'        => $host,
                'domain'      => $host,
            ];

            // v2.2: Social media data takes priority for fields it provides.
            return $this->mergeSocialMediaData($standardResult, $socialData);
        } finally {
            // Explicit cleanup so the DOM tree can be freed between pages.
            $this->dom = null;
            $this->xpath = null;
            $this->parsedUrl = null;
        }
    }

    /**
     * v2.2: Check whether a URL belongs to a known social media platform.
     *
     * The host must equal a listed domain or be a subdomain of it. A plain
     * substring test would also match unrelated hosts such as "dropbox.com"
     * (contains "x.com") or "sunset.media" (contains "t.me").
     */
    private function isSocialMedia(string $url): bool
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        // Hosts are case-insensitive and may carry a trailing root dot.
        $host = rtrim(strtolower($host), '.');

        foreach ($this->socialMediaDomains as $domain) {
            if ($host === $domain) {
                return true;
            }
            $suffix = '.' . $domain;
            if (substr($host, -strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;
    }

    /**
     * v2.2: Merge social media data into the standard extraction result.
     *
     * Non-empty social values override 'title', 'description' and
     * 'thumbnail' (placeholder strings are ignored); platform-specific
     * fields are copied when present.
     */
    private function mergeSocialMediaData(array $standard, array $social): array
    {
        if (empty($social)) {
            return $standard;
        }

        $result = $standard;
        $placeholders = ['No title', 'No description'];

        // Override with social media data if available and meaningful.
        foreach (['title', 'description', 'thumbnail'] as $field) {
            if (!empty($social[$field]) && !in_array($social[$field], $placeholders, true)) {
                $result[$field] = $social[$field];
            }
        }

        // Add extra social media specific fields.
        $extraFields = [
            'platform', 'type', 'author', 'embed_url', 'video_url',
            'shortcode', 'tweet_id', 'video_id', 'channel', 'message_id',
        ];
        foreach ($extraFields as $field) {
            if (!empty($social[$field])) {
                $result[$field] = $social[$field];
            }
        }

        // Flag that this is a social media extraction.
        if (!empty($social['platform'])) {
            $result['is_social_media'] = true;
        }

        return $result;
    }

    /**
     * Load HTML into a fresh DOM and bind an XPath evaluator to it.
     *
     * Empty input yields an empty document instead of calling loadHTML(),
     * which rejects an empty string. The global libxml error-handling flag
     * is restored afterwards so other code is not affected.
     */
    private function loadHtml(string $html): void
    {
        $this->dom = new DOMDocument();

        if (trim($html) !== '') {
            $previous = libxml_use_internal_errors(true);
            try {
                $this->dom->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR);
            } finally {
                libxml_clear_errors();
                libxml_use_internal_errors($previous);
            }
        }

        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Build the origin ("scheme://host[:port]") of a URL.
     *
     * Uses the cached parse result when available and falls back to parsing
     * $url. An explicit port is preserved so assets on non-default ports
     * resolve correctly. The scheme defaults to "https".
     */
    private function getBaseUrl(string $url): string
    {
        $parts = $this->parsedUrl ?? parse_url($url);
        if (!is_array($parts)) {
            $parts = [];
        }

        $scheme = $parts['scheme'] ?? 'https';
        $host = $parts['host'] ?? '';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';

        return "{$scheme}://{$host}{$port}";
    }

    /**
     * Directory portion of the current page path, e.g. "/blog/" for
     * "/blog/post.html". Returns "/" when there is no usable path.
     */
    private function getBasePath(): string
    {
        $path = $this->parsedUrl['path'] ?? '/';
        if (!is_string($path) || $path === '' || $path[0] !== '/') {
            return '/';
        }

        // $path starts with "/", so strrpos() always finds a slash.
        return substr($path, 0, (int) strrpos($path, '/') + 1);
    }

    /**
     * Extract the page title.
     *
     * Tries og:title, twitter:title, <title> and <h1> in that order and
     * returns the first non-empty value, or 'No title'.
     */
    public function getTitle(): string
    {
        $queries = [
            '//meta[@property="og:title"]/@content',
            '//meta[@name="twitter:title"]/@content',
            '//title',
            '//h1',
        ];

        foreach ($queries as $query) {
            $value = $this->firstNonEmptyValue($query);
            if ($value !== null) {
                return $this->cleanText($value);
            }
        }

        return 'No title';
    }

    /**
     * Extract the page description.
     *
     * Tries og:description, twitter:description, the description meta tag
     * and <p> elements in that order; the first non-empty value is cut to
     * 500 characters. Returns 'No description' when nothing is found.
     */
    public function getDescription(): string
    {
        $queries = [
            '//meta[@property="og:description"]/@content',
            '//meta[@name="twitter:description"]/@content',
            '//meta[@name="description"]/@content',
            '//p',
        ];

        foreach ($queries as $query) {
            $value = $this->firstNonEmptyValue($query);
            if ($value !== null) {
                return $this->cleanText(mb_substr($value, 0, 500));
            }
        }

        return 'No description';
    }

    /**
     * Extract a thumbnail image URL.
     *
     * Candidates are checked in priority order: Open Graph / Twitter meta
     * tags, Apple touch icons, images inside content elements, and finally
     * inline background images. Falls back to the configured default.
     */
    public function getThumbnail(): string
    {
        $queries = [
            // 1 - Open Graph and Twitter meta tags (most common, highest priority)
            '//meta[@property="og:image" or @property="og:image:secure_url" or @property="og:image:url" or @name="twitter:image" or @name="twitter:image:src"]/@content',
            // 2 - Apple touch icons
            '//link[@rel="apple-touch-icon" or @rel="apple-touch-icon-precomposed"]/@href',
            // 3 - Images in the content (fallback)
            '//article//img/@src | //main//img/@src | //figure/img/@src | //video/@poster',
        ];

        foreach ($queries as $query) {
            $found = $this->firstValidUrl($this->queryValues($query));
            if ($found !== null) {
                return $found;
            }
        }

        // Last resort: url(...) inside inline styles mentioning background-image.
        $backgrounds = [];
        foreach ($this->queryValues('//div[contains(@style, "background-image")]/@style') as $style) {
            if (preg_match('/url\(["\']?(.*?)["\']?\)/i', $style, $matches) === 1) {
                $backgrounds[] = $matches[1];
            }
        }

        return $this->firstValidUrl($backgrounds) ?? $this->defaults['thumbnail'];
    }

    /**
     * Extract the favicon URL.
     *
     * Uses the first valid <link rel="...icon..."> href; otherwise the
     * conventional "/favicon.ico" on the page origin, or the configured
     * default when the origin is unknown or invalid.
     */
    public function getFavicon(): string
    {
        $queries = [
            '//link[@rel="icon"]/@href',
            '//link[@rel="shortcut icon"]/@href',
            '//link[contains(@rel, "icon")]/@href',
        ];

        foreach ($queries as $query) {
            $found = $this->firstValidUrl($this->queryValues($query));
            if ($found !== null) {
                return $found;
            }
        }

        // Try the default favicon location on the page origin.
        $defaultFavicon = $this->baseUrl . '/favicon.ico';
        if ($this->isValidUrl($defaultFavicon)) {
            return $defaultFavicon;
        }

        return $this->defaults['favicon'];
    }

    /**
     * Run an XPath query and return the string values of all matched nodes.
     *
     * Returns an empty list when no document is loaded or the query fails.
     *
     * @return string[]
     */
    private function queryValues(string $expression): array
    {
        if ($this->xpath === null) {
            return [];
        }

        $nodes = $this->xpath->query($expression);
        if ($nodes === false) {
            return [];
        }

        $values = [];
        foreach ($nodes as $node) {
            $values[] = (string) $node->nodeValue;
        }

        return $values;
    }

    /**
     * First trimmed, non-empty node value matched by an XPath query,
     * or null when there is none.
     */
    private function firstNonEmptyValue(string $expression): ?string
    {
        foreach ($this->queryValues($expression) as $raw) {
            $value = trim($raw);
            // Strict comparison so that a literal "0" is kept.
            if ($value !== '') {
                return $value;
            }
        }

        return null;
    }

    /**
     * Resolve each candidate against the page and return the first one
     * that is a valid URL, or null when none qualifies.
     *
     * @param string[] $candidates
     */
    private function firstValidUrl(array $candidates): ?string
    {
        foreach ($candidates as $candidate) {
            $resolved = $this->resolveUrl(trim($candidate));
            if ($this->isValidUrl($resolved)) {
                return $resolved;
            }
        }

        return null;
    }

    /**
     * Resolve a possibly relative URL against the current page.
     *
     * Root-relative references are resolved against the page origin and
     * document-relative references against the page's directory. Dot
     * segments ("../") are left in place, not normalized.
     */
    private function resolveUrl(string $url): string
    {
        if ($url === '') {
            return '';
        }

        // Protocol-relative: always upgraded to https.
        if (strpos($url, '//') === 0) {
            return 'https:' . $url;
        }

        // Already absolute. parse_url() yields false/null when there is no
        // parsable scheme, in which case the value is treated as relative.
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (is_string($scheme) && $scheme !== '') {
            return $url;
        }

        $origin = rtrim($this->baseUrl, '/');

        // Root-relative path
        if ($url[0] === '/') {
            return $origin . $url;
        }

        // Document-relative path: relative to the page's directory.
        return $origin . $this->basePath . $url;
    }

    /**
     * Validate a URL: it must be syntactically valid and use http or https.
     *
     * The scheme restriction keeps values such as "javascript://..." taken
     * from untrusted markup out of the result.
     */
    private function isValidUrl(string $url): bool
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }

        $scheme = parse_url($url, PHP_URL_SCHEME);

        return is_string($scheme) && in_array(strtolower($scheme), ['http', 'https'], true);
    }

    /**
     * Collapse runs of whitespace into single spaces and trim the result.
     */
    private function cleanText(string $text): string
    {
        // preg_replace() returns null on PCRE failure; keep the input then.
        $collapsed = preg_replace('/\s+/', ' ', $text);

        return trim($collapsed ?? $text);
    }
}
