<?php
/**
 * ============================================
 * FLOWBOT DCI - CONTENT EXTRACTOR v1.0
 * ============================================
 * Advanced metadata and content extraction from HTML.
 *
 * Features:
 * - OpenGraph metadata extraction
 * - Twitter Cards extraction
 * - Schema.org JSON-LD parsing (application/ld+json script blocks)
 * - Auto-embed generation (YouTube, Vimeo, TikTok, etc.)
 * - Image/video detection with dimensions
 * - Favicon extraction with fallback chain
 * - Canonical URL resolution
 * - Author/date extraction
 * ============================================
 */

declare(strict_types=1);

namespace FlowbotDCI\Services\Crawler;

use DOMDocument;
use DOMXPath;

/**
 * Extracts page metadata (title, description, images, videos, OpenGraph,
 * Twitter Cards, JSON-LD, ...) from an HTML document.
 *
 * Every extractor is a pure function of the parsed document and the page URL;
 * the only instance state is the pair of configurable fallback URLs.
 */
class ContentExtractor
{
    public const VERSION = '1.0';

    /**
     * Value returned by extractThumbnail() when the page offers no usable image.
     */
    private string $defaultThumbnail = 'fallback_image.jpg';

    /**
     * Value returned by extractFavicon() when the page declares no icon and no
     * origin can be derived from the page URL to guess "/favicon.ico".
     */
    private string $defaultFavicon = 'default_favicon.ico';

    /**
     * Embed patterns for video platforms.
     *
     * Each platform lists regexes whose first capture group is the media id,
     * plus an HTML template in which "{id}" is replaced by that id. The
     * "(?<![\w-])" prefix pins the match to a host-name boundary so that e.g.
     * "fedex.com" is not mistaken for "x.com".
     *
     * NOTE(review): the vm.tiktok.com pattern captures the short-link code, which
     * is presumably not a valid id for the /embed/ URL - confirm against TikTok.
     *
     * @var array<string, array{patterns: list<string>, template: string}>
     */
    private array $embedPatterns = [
        'youtube' => [
            'patterns' => [
                '/(?<![\w-])(?:youtube\.com\/(?:watch\?(?:[^#\s]*&)?v=|embed\/|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})/',
            ],
            'template' => '<iframe src="https://www.youtube.com/embed/{id}" width="560" height="315" frameborder="0" allowfullscreen></iframe>',
        ],
        'vimeo' => [
            'patterns' => [
                '/(?<![\w-])vimeo\.com\/(?:video\/)?(\d+)/',
            ],
            'template' => '<iframe src="https://player.vimeo.com/video/{id}" width="560" height="315" frameborder="0" allowfullscreen></iframe>',
        ],
        'tiktok' => [
            'patterns' => [
                '/(?<![\w-])tiktok\.com\/@[^\/]+\/video\/(\d+)/',
                '/(?<![\w-])vm\.tiktok\.com\/([a-zA-Z0-9]+)/',
            ],
            'template' => '<iframe src="https://www.tiktok.com/embed/{id}" width="325" height="575" frameborder="0" allowfullscreen></iframe>',
        ],
        'instagram' => [
            'patterns' => [
                '/(?<![\w-])instagram\.com\/(?:p|reel)\/([a-zA-Z0-9_-]+)/',
            ],
            'template' => '<iframe src="https://www.instagram.com/p/{id}/embed" width="400" height="500" frameborder="0" scrolling="no" allowfullscreen></iframe>',
        ],
        'twitter' => [
            'patterns' => [
                '/(?<![\w-])(?:twitter|x)\.com\/[^\/]+\/status\/(\d+)/',
            ],
            'template' => '<blockquote class="twitter-tweet"><a href="https://twitter.com/i/status/{id}"></a></blockquote><script async src="https://platform.twitter.com/widgets.js"></script>',
        ],
        'dailymotion' => [
            'patterns' => [
                '/(?<![\w-])dailymotion\.com\/video\/([a-zA-Z0-9]+)/',
            ],
            'template' => '<iframe src="https://www.dailymotion.com/embed/video/{id}" width="560" height="315" frameborder="0" allowfullscreen></iframe>',
        ],
    ];

    /**
     * Set the thumbnail URL returned when a page has no usable image.
     */
    public function setDefaultThumbnail(string $url): self
    {
        $this->defaultThumbnail = $url;
        return $this;
    }

    /**
     * Set the favicon URL returned when no icon and no origin can be determined.
     */
    public function setDefaultFavicon(string $url): self
    {
        $this->defaultFavicon = $url;
        return $this;
    }

    /**
     * Extract all metadata from an HTML document.
     *
     * @param string $html Raw markup; it is parsed leniently and treated as UTF-8.
     * @param string $url  URL the markup was fetched from; relative references
     *                     found in the page are resolved against it.
     *
     * @return array<string, mixed> Keys: url, title, description, thumbnail, favicon,
     *                              canonical_url, author, published_date, modified_date,
     *                              embed_code, site_name, locale, type, keywords, images,
     *                              videos, social_profiles, opengraph, twitter_cards,
     *                              schema_org, tags.
     */
    public function extract(string $html, string $url): array
    {
        $dom = new DOMDocument();

        // Collect parser diagnostics internally instead of emitting warnings, and
        // put the process-wide libxml setting back afterwards so other code that
        // relies on it is unaffected.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml decode the markup as UTF-8.
            $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOERROR | LIBXML_NOWARNING);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $xpath = new DOMXPath($dom);

        // The full page URL is used as the base so that relative references
        // resolve against the page's directory rather than the site root.
        $baseUrl = $url;

        $data = [
            'url' => $url,
            'title' => $this->extractTitle($xpath),
            'description' => $this->extractDescription($xpath),
            'thumbnail' => $this->extractThumbnail($xpath, $baseUrl),
            'favicon' => $this->extractFavicon($xpath, $baseUrl),
            'canonical_url' => $this->extractCanonicalUrl($xpath, $url),
            'author' => $this->extractAuthor($xpath),
            'published_date' => $this->extractPublishedDate($xpath),
            'modified_date' => $this->extractModifiedDate($xpath),
            'embed_code' => $this->generateEmbed($url),
            'site_name' => $this->extractSiteName($xpath),
            'locale' => $this->extractLocale($xpath),
            'type' => $this->extractType($xpath),
            'keywords' => $this->extractKeywords($xpath),
            'images' => $this->extractImages($xpath, $baseUrl),
            'videos' => $this->extractVideos($xpath, $baseUrl),
            'social_profiles' => $this->extractSocialProfiles($xpath),
            'opengraph' => $this->extractOpenGraph($xpath),
            'twitter_cards' => $this->extractTwitterCards($xpath),
            'schema_org' => $this->extractSchemaOrg($dom),
        ];

        // Tags are derived from whatever title was finally chosen.
        $data['tags'] = $this->generateTags($data['title']);

        return $data;
    }

    /**
     * Extract the title: og:title, twitter:title, <title>, then <h1>.
     *
     * @return string The cleaned title, or "No title" when none is found.
     */
    public function extractTitle(DOMXPath $xpath): string
    {
        $queries = [
            '//meta[@property="og:title"]/@content',
            '//meta[@name="twitter:title"]/@content',
            '//title',
            '//h1',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->cleanText($value)
        ) ?? 'No title';
    }

    /**
     * Extract the description: og/twitter/meta description, then the first
     * paragraph whose leading text node is longer than 50 characters.
     *
     * @return string The cleaned description, or "No description" when none is found.
     */
    public function extractDescription(DOMXPath $xpath): string
    {
        $queries = [
            '//meta[@property="og:description"]/@content',
            '//meta[@name="twitter:description"]/@content',
            '//meta[@name="description"]/@content',
            '//p[string-length(text()) > 50]',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->cleanText($value)
        ) ?? 'No description';
    }

    /**
     * Extract a thumbnail URL using an ordered chain of increasingly generic queries.
     *
     * @param string $baseUrl Page URL (or site origin) used to resolve relative references.
     *
     * @return string The first candidate that resolves to a valid http(s) image URL,
     *                otherwise the configured default thumbnail.
     */
    public function extractThumbnail(DOMXPath $xpath, string $baseUrl): string
    {
        $queries = [
            '//meta[@property="og:image:secure_url"]/@content',
            '//meta[@property="og:image:url"]/@content',
            '//meta[@property="og:image"]/@content',
            '//meta[@name="twitter:image:src"]/@content',
            '//meta[@name="twitter:image"]/@content',
            '//link[@rel="image_src"]/@href',
            '//link[@rel="apple-touch-icon"]/@href',
            '//link[@rel="apple-touch-icon-precomposed"]/@href',
            '//img[contains(@class, "post-thumbnail")]/@src',
            '//figure/img/@src',
            '//div[@class="post-thumbnail"]/img/@src',
            '//img[contains(@class, "wp-post-image")]/@src',
            '//article//img/@src',
            '//video/@poster',
            '//div[contains(@class, "video-thumbnail")]/img/@src',
            '//img[contains(@class, "featured")]/@src',
            '//img[contains(@class, "hero")]/@src',
            '//img[@id="main-image"]/@src',
            '//main//img/@src',
            '//img/@src',
        ];

        $thumbnail = $this->firstValue(
            $xpath,
            $queries,
            function (string $value) use ($baseUrl): ?string {
                $candidate = $this->resolveUrl($value, $baseUrl);
                return $this->isValidImageUrl($candidate) ? $candidate : null;
            }
        );

        return $thumbnail ?? $this->defaultThumbnail;
    }

    /**
     * Extract the favicon URL.
     *
     * @param string $baseUrl Page URL (or site origin) used to resolve relative references.
     *
     * @return string The first declared icon; otherwise "<origin>/favicon.ico"; otherwise
     *                the configured default favicon when no origin can be derived.
     */
    public function extractFavicon(DOMXPath $xpath, string $baseUrl): string
    {
        $queries = [
            '//link[@rel="icon"]/@href',
            '//link[@rel="shortcut icon"]/@href',
            '//link[@rel="apple-touch-icon"]/@href',
            '//link[@rel="apple-touch-icon-precomposed"]/@href',
            '//link[contains(@rel, "icon")]/@href',
        ];

        $declared = $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->resolveUrl($value, $baseUrl)
        );
        if ($declared !== null) {
            return $declared;
        }

        // Last resort: the conventional location at the site root.
        $origin = $this->getBaseUrl($baseUrl);
        return $origin === '' ? $this->defaultFavicon : $origin . '/favicon.ico';
    }

    /**
     * Extract the canonical URL from <link rel="canonical"> or og:url.
     *
     * Relative values are resolved against the page URL before validation.
     *
     * @return string The first valid candidate, or $originalUrl when none is valid.
     */
    public function extractCanonicalUrl(DOMXPath $xpath, string $originalUrl): string
    {
        $queries = [
            '//link[@rel="canonical"]/@href',
            '//meta[@property="og:url"]/@content',
        ];

        $canonical = $this->firstValue(
            $xpath,
            $queries,
            function (string $value) use ($originalUrl): ?string {
                $candidate = $this->resolveUrl($value, $originalUrl);
                return filter_var($candidate, FILTER_VALIDATE_URL) !== false ? $candidate : null;
            }
        );

        return $canonical ?? $originalUrl;
    }

    /**
     * Extract the author name from meta tags or common byline markup.
     *
     * @return string|null The cleaned author, or null when nothing plausible is found.
     */
    public function extractAuthor(DOMXPath $xpath): ?string
    {
        $queries = [
            '//meta[@name="author"]/@content',
            '//meta[@property="article:author"]/@content',
            '//meta[@name="twitter:creator"]/@content',
            '//a[@rel="author"]',
            '//*[@class="author"]',
            '//*[@class="byline"]',
            '//*[contains(@class, "author-name")]',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            // Values of 100 bytes or more are rejected: a byline container that long
            // is holding more than just a name.
            fn (string $value): ?string => strlen($value) < 100 ? $this->cleanText($value) : null
        );
    }

    /**
     * Extract the publication date.
     *
     * @return string|null "Y-m-d H:i:s" in the PHP default timezone, or null when no
     *                     candidate can be parsed by strtotime().
     */
    public function extractPublishedDate(DOMXPath $xpath): ?string
    {
        $queries = [
            '//meta[@property="article:published_time"]/@content',
            '//meta[@name="date"]/@content',
            '//meta[@name="pubdate"]/@content',
            // The explicit pubdate marker is tried before the generic <time> lookup,
            // which would otherwise always shadow it.
            '//time[@pubdate]/@datetime',
            '//time[@datetime]/@datetime',
            '//*[@class="published"]/@datetime',
            '//*[contains(@class, "post-date")]',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->normaliseDate($value)
        );
    }

    /**
     * Extract the last-modified date.
     *
     * @return string|null "Y-m-d H:i:s" in the PHP default timezone, or null when no
     *                     candidate can be parsed by strtotime().
     */
    public function extractModifiedDate(DOMXPath $xpath): ?string
    {
        $queries = [
            '//meta[@property="article:modified_time"]/@content',
            '//meta[@name="last-modified"]/@content',
            '//time[@itemprop="dateModified"]/@datetime',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->normaliseDate($value)
        );
    }

    /**
     * Generate embed markup for a URL on a known video/social platform.
     *
     * @return string|null The platform template with the media id filled in, or null
     *                     when the URL matches no known pattern.
     */
    public function generateEmbed(string $url): ?string
    {
        foreach ($this->embedPatterns as $config) {
            foreach ($config['patterns'] as $pattern) {
                if (preg_match($pattern, $url, $matches) !== 1) {
                    continue;
                }

                $id = $matches[1] ?? '';
                if ($id !== '') {
                    return str_replace('{id}', $id, $config['template']);
                }
            }
        }

        return null;
    }

    /**
     * Extract the site name from og:site_name or application-name.
     */
    public function extractSiteName(DOMXPath $xpath): ?string
    {
        $queries = [
            '//meta[@property="og:site_name"]/@content',
            '//meta[@name="application-name"]/@content',
        ];

        return $this->firstValue(
            $xpath,
            $queries,
            fn (string $value): ?string => $this->cleanText($value)
        );
    }

    /**
     * Extract the locale from og:locale, <html lang>, or the content-language meta tag.
     */
    public function extractLocale(DOMXPath $xpath): ?string
    {
        $queries = [
            '//meta[@property="og:locale"]/@content',
            '//html/@lang',
            '//meta[@http-equiv="content-language"]/@content',
        ];

        return $this->firstValue($xpath, $queries);
    }

    /**
     * Extract the OpenGraph content type.
     *
     * @return string The og:type value, or "website" when it is missing or empty.
     */
    public function extractType(DOMXPath $xpath): string
    {
        return $this->firstValue($xpath, ['//meta[@property="og:type"]/@content']) ?? 'website';
    }

    /**
     * Extract the comma-separated meta keywords.
     *
     * @return list<string> Trimmed keywords with empty entries removed.
     */
    public function extractKeywords(DOMXPath $xpath): array
    {
        $raw = $this->firstValue($xpath, ['//meta[@name="keywords"]/@content']);
        if ($raw === null) {
            return [];
        }

        $keywords = array_map('trim', explode(',', $raw));

        return array_values(array_filter(
            $keywords,
            static fn (string $keyword): bool => $keyword !== ''
        ));
    }

    /**
     * Extract images from the page, in document order.
     *
     * @param string $baseUrl Page URL (or site origin) used to resolve relative references.
     * @param int    $limit   Maximum number of images to return.
     *
     * @return list<array{url: string, alt: ?string, width: ?string, height: ?string}>
     */
    public function extractImages(DOMXPath $xpath, string $baseUrl, int $limit = 10): array
    {
        $images = [];

        foreach ($this->nodes($xpath, '//img[@src]') as $node) {
            if (count($images) >= $limit) {
                break;
            }

            $src = trim($node->getAttribute('src'));
            if ($src === '') {
                continue;
            }

            $src = $this->resolveUrl($src, $baseUrl);
            if (!$this->isValidImageUrl($src)) {
                continue;
            }

            $images[] = [
                'url' => $src,
                'alt' => $node->getAttribute('alt') ?: null,
                'width' => $node->getAttribute('width') ?: null,
                'height' => $node->getAttribute('height') ?: null,
            ];
        }

        return $images;
    }

    /**
     * Extract videos: <video src>, <video><source src>, and iframes that point
     * at a known video host.
     *
     * @param string $baseUrl Page URL (or site origin) used to resolve relative references.
     *
     * @return list<array<string, ?string>> Entries carry "type" ("native" or "embed") and
     *                                      "url", plus "poster", "mime", or "width"/"height"
     *                                      depending on where they were found.
     */
    public function extractVideos(DOMXPath $xpath, string $baseUrl): array
    {
        $videos = [];

        // <video src="...">
        foreach ($this->nodes($xpath, '//video[@src]') as $node) {
            $src = trim($node->getAttribute('src'));
            if ($src === '') {
                continue;
            }

            $poster = trim($node->getAttribute('poster'));
            $videos[] = [
                'type' => 'native',
                'url' => $this->resolveUrl($src, $baseUrl),
                'poster' => $poster !== '' ? $this->resolveUrl($poster, $baseUrl) : null,
            ];
        }

        // <video><source src="..."></video>
        foreach ($this->nodes($xpath, '//video/source[@src]') as $node) {
            $src = trim($node->getAttribute('src'));
            if ($src === '') {
                continue;
            }

            $videos[] = [
                'type' => 'native',
                'url' => $this->resolveUrl($src, $baseUrl),
                'mime' => $node->getAttribute('type') ?: null,
            ];
        }

        // <iframe src="..."> pointing at a known video host
        foreach ($this->nodes($xpath, '//iframe[@src]') as $node) {
            $src = trim($node->getAttribute('src'));
            if ($src === '') {
                continue;
            }

            $src = $this->resolveUrl($src, $baseUrl);
            if ($this->isVideoEmbed($src)) {
                $videos[] = [
                    'type' => 'embed',
                    'url' => $src,
                    'width' => $node->getAttribute('width') ?: null,
                    'height' => $node->getAttribute('height') ?: null,
                ];
            }
        }

        return $videos;
    }

    /**
     * Extract social media profile links from anchors.
     *
     * @return array<string, string> Platform name => matched "host/path" fragment. When
     *                               several anchors match one platform, the last one in
     *                               document order is kept.
     */
    public function extractSocialProfiles(DOMXPath $xpath): array
    {
        // "(?<![\w-])" pins each host to a name boundary so that, for example,
        // "dropbox.com" is not reported as an "x.com" profile.
        $patterns = [
            'facebook' => '/(?<![\w-])facebook\.com\/[^\/\s"\']+/',
            'twitter' => '/(?<![\w-])(?:twitter|x)\.com\/[^\/\s"\']+/',
            'instagram' => '/(?<![\w-])instagram\.com\/[^\/\s"\']+/',
            'linkedin' => '/(?<![\w-])linkedin\.com\/(?:in|company)\/[^\/\s"\']+/',
            'youtube' => '/(?<![\w-])youtube\.com\/(?:channel|user|c)\/[^\/\s"\']+/',
            'tiktok' => '/(?<![\w-])tiktok\.com\/@[^\/\s"\']+/',
            'pinterest' => '/(?<![\w-])pinterest\.com\/[^\/\s"\']+/',
            'github' => '/(?<![\w-])github\.com\/[^\/\s"\']+/',
        ];

        $profiles = [];
        foreach ($this->nodes($xpath, '//a[@href]') as $node) {
            $href = $node->getAttribute('href');
            foreach ($patterns as $platform => $pattern) {
                if (preg_match($pattern, $href, $matches) === 1) {
                    $profiles[$platform] = $matches[0];
                }
            }
        }

        return $profiles;
    }

    /**
     * Extract OpenGraph metadata.
     *
     * @return array<string, string> Property name without the "og:" prefix => content.
     *                               Repeated properties keep the last value.
     */
    public function extractOpenGraph(DOMXPath $xpath): array
    {
        return $this->collectPrefixedMeta($xpath, 'property', 'og:');
    }

    /**
     * Extract Twitter Cards metadata.
     *
     * @return array<string, string> Name without the "twitter:" prefix => content.
     *                               Repeated names keep the last value.
     */
    public function extractTwitterCards(DOMXPath $xpath): array
    {
        return $this->collectPrefixedMeta($xpath, 'name', 'twitter:');
    }

    /**
     * Extract Schema.org JSON-LD blocks.
     *
     * @return list<array<mixed>> One decoded array per <script type="application/ld+json">
     *                            block; blocks that are not valid JSON, or that decode to
     *                            something other than a non-empty array, are skipped.
     */
    public function extractSchemaOrg(DOMDocument $dom): array
    {
        $schemas = [];
        $xpath = new DOMXPath($dom);

        foreach ($this->nodes($xpath, '//script[@type="application/ld+json"]') as $node) {
            $json = trim((string) $node->textContent);
            if ($json === '') {
                continue;
            }

            $decoded = json_decode($json, true);
            if (is_array($decoded) && $decoded !== []) {
                $schemas[] = $decoded;
            }
        }

        return $schemas;
    }

    /**
     * Generate up to ten tags from a title.
     *
     * The title is lower-cased, punctuation is replaced by spaces, and words that
     * are stop words or at most two bytes long are discarded.
     *
     * @return list<string> Unique tags in order of first appearance.
     */
    public function generateTags(string $title): array
    {
        // Common words to exclude
        $stopWords = [
            'a', 'an', 'the', 'and', 'or', 'but', 'in', 'at', 'on', 'with', 'to',
            'for', 'is', 'of', 'that', 'it', 'by', 'from', 'as', 'are', 'was',
            'be', 'has', 'have', 'will', 'this', 'which', 'its', 'about', 'up',
            'more', 'who', 'also', 'they', 'out', 'he', 'she', 'you', 'their',
            'we', 'her', 'his', 'them', 'been', 'these', 'would', 'some', 'can',
            'like', 'there', 'if', 'all', 'my', 'what', 'so', 'then', 'into',
            'just', 'over', 'do', 'than', 'when', 'other', 'how', 'our', 'any',
            'new', 'me', 'after', 'most', 'made', 'only', 'time', 'where', 'year',
            '-', '–', '|', '•', ':', ';', ',', '.', '!', '?',
        ];

        // Prefer multibyte case folding when mbstring is loaded so that non-ASCII
        // letters are lower-cased as well.
        $lowered = function_exists('mb_strtolower')
            ? mb_strtolower($title, 'UTF-8')
            : strtolower($title);

        // With the "u" modifier preg_replace() returns null for invalid UTF-8;
        // fall back to a byte-wise replacement instead of failing.
        $normalised = preg_replace('/[^\w\s-]/u', ' ', $lowered);
        if ($normalised === null) {
            $normalised = (string) preg_replace('/[^\w\s-]/', ' ', $lowered);
        }

        $words = preg_split('/\s+/', $normalised, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $tags = [];
        foreach ($words as $word) {
            // Drop dangling hyphens so separators such as "---" never become tags.
            $word = trim($word, '-');
            if (strlen($word) > 2 && !in_array($word, $stopWords, true)) {
                $tags[] = $word;
            }
        }

        // De-duplicate before truncating so duplicates do not use up the ten slots.
        return array_slice(array_values(array_unique($tags)), 0, 10);
    }

    /**
     * Run an XPath query and return something that is always safe to iterate.
     *
     * @return iterable<\DOMNode> The matching nodes, or an empty array when the
     *                            query could not be evaluated.
     */
    private function nodes(DOMXPath $xpath, string $query): iterable
    {
        $result = $xpath->query($query);
        return $result === false ? [] : $result;
    }

    /**
     * Return the first usable value produced by an ordered list of XPath queries.
     *
     * Queries are tried in order and, within a query, nodes in document order.
     * Each node value is trimmed; empty values are skipped. When $map is given
     * it receives the trimmed value and returns the final value, or null / ''
     * to reject the candidate and move on to the next one.
     *
     * @param list<string>                      $queries
     * @param (callable(string): ?string)|null $map
     */
    private function firstValue(DOMXPath $xpath, array $queries, ?callable $map = null): ?string
    {
        foreach ($queries as $query) {
            foreach ($this->nodes($xpath, $query) as $node) {
                $value = trim((string) $node->nodeValue);
                if ($value === '') {
                    continue;
                }

                $result = $map === null ? $value : $map($value);
                if ($result !== null && $result !== '') {
                    return $result;
                }
            }
        }

        return null;
    }

    /**
     * Collect <meta> tags whose $attribute starts with $prefix.
     *
     * @return array<string, string> Attribute value without the prefix => content.
     */
    private function collectPrefixedMeta(DOMXPath $xpath, string $attribute, string $prefix): array
    {
        $collected = [];
        $query = sprintf('//meta[starts-with(@%s, "%s")]', $attribute, $prefix);

        foreach ($this->nodes($xpath, $query) as $node) {
            // Strip only the leading prefix, never later occurrences inside the name.
            $key = (string) substr($node->getAttribute($attribute), strlen($prefix));
            if ($key === '') {
                continue;
            }
            $collected[$key] = $node->getAttribute('content');
        }

        return $collected;
    }

    /**
     * Convert a free-form date string to "Y-m-d H:i:s", or null when strtotime()
     * cannot parse it. The result is expressed in the PHP default timezone.
     */
    private function normaliseDate(string $value): ?string
    {
        $timestamp = strtotime($value);
        return $timestamp === false ? null : date('Y-m-d H:i:s', $timestamp);
    }

    /**
     * Get the origin ("scheme://host[:port]") of a URL.
     *
     * @return string The origin without a trailing slash, or '' when the URL has no
     *                host. The scheme defaults to "https" when absent.
     */
    private function getBaseUrl(string $url): string
    {
        $parts = parse_url($url);
        if (!is_array($parts) || ($parts['host'] ?? '') === '') {
            return '';
        }

        $origin = ($parts['scheme'] ?? 'https') . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        return $origin;
    }

    /**
     * Resolve a reference found in the page against the page URL.
     *
     * $baseUrl may be a full page URL or a bare origin. Handles references that
     * already carry a scheme (returned unchanged), protocol-relative references,
     * root-relative paths, query/fragment-only references, and relative paths
     * (resolved against the base path's directory, with "." and ".." removed).
     * When no origin can be derived from $baseUrl the reference is returned as is.
     */
    private function resolveUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        // Anything with its own scheme (http:, https:, data:, mailto:, ...) is
        // already absolute and must not be glued onto the base.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return $url;
        }

        $base = parse_url($baseUrl);
        $base = is_array($base) ? $base : [];

        // Protocol-relative: inherit the page's scheme.
        if (strncmp($url, '//', 2) === 0) {
            return ($base['scheme'] ?? 'https') . ':' . $url;
        }

        $origin = $this->getBaseUrl($baseUrl);
        if ($origin === '') {
            return $url;
        }

        if ($url === '') {
            return $origin . '/';
        }

        // Root-relative path.
        if ($url[0] === '/') {
            return $origin . $url;
        }

        $basePath = $base['path'] ?? '/';

        // Query-only or fragment-only reference: keep the page's own path.
        if ($url[0] === '?' || $url[0] === '#') {
            $prefix = $origin . $basePath;
            if ($url[0] === '#' && isset($base['query'])) {
                $prefix .= '?' . $base['query'];
            }
            return $prefix . $url;
        }

        // Relative path: resolve against the directory of the base path.
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        // Only the path part is normalised; a query or fragment is appended untouched.
        $pathLength = strcspn($url, '?#');
        $path = $this->removeDotSegments($directory . substr($url, 0, $pathLength));

        return $origin . $path . substr($url, $pathLength);
    }

    /**
     * Remove "." and ".." segments from an absolute path.
     *
     * ".." never climbs above the root, and a trailing dot segment leaves a
     * trailing slash (e.g. "/a/b/.." becomes "/a/").
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $kept = [];

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Index 0 of $kept is the empty segment in front of the root slash;
                // it is never popped.
                if ($segment === '..' && count($kept) > 1) {
                    array_pop($kept);
                }
                if ($index === $lastIndex) {
                    $kept[] = '';
                }
                continue;
            }

            $kept[] = $segment;
        }

        return implode('/', $kept);
    }

    /**
     * Check whether a URL looks like a fetchable image.
     *
     * The URL must validate, use http or https, and have either a known image
     * extension or no extension at all (extension-less CDN URLs are common).
     */
    private function isValidImageUrl(string $url): bool
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }

        if (preg_match('/^https?:\/\//i', $url) !== 1) {
            return false;
        }

        $imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'avif', 'svg', 'bmp', 'ico'];

        // parse_url() may return null (no path) or false (unparseable).
        $path = parse_url($url, PHP_URL_PATH);
        $extension = strtolower(pathinfo(is_string($path) ? $path : '', PATHINFO_EXTENSION));

        return $extension === '' || in_array($extension, $imageExtensions, true);
    }

    /**
     * Check whether a URL points at a known video host or one of its subdomains.
     */
    private function isVideoEmbed(string $url): bool
    {
        $videoDomains = [
            'youtube.com', 'youtube-nocookie.com', 'youtu.be', 'vimeo.com', 'dailymotion.com',
            'tiktok.com', 'instagram.com', 'facebook.com', 'twitter.com',
            'twitch.tv', 'streamable.com', 'wistia.com',
        ];

        // parse_url() returns false for malformed URLs and null when there is no host.
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }
        $host = strtolower($host);

        foreach ($videoDomains as $domain) {
            $suffix = '.' . $domain;
            // Exact host or a true subdomain; a plain substring test would also
            // accept look-alikes such as "notyoutube.com".
            if ($host === $domain || substr($host, -strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;
    }

    /**
     * Clean text content: decode HTML entities, collapse whitespace runs, trim.
     *
     * The DOM already decodes entities once; the extra decode here copes with
     * pages whose meta values were double-encoded.
     */
    private function cleanText(string $text): string
    {
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // preg_replace() returns null on a PCRE failure; keep the text in that case.
        $collapsed = preg_replace('/\s+/', ' ', $text);

        return trim($collapsed ?? $text);
    }

    /**
     * Static factory method for quick extraction with default settings.
     *
     * @return array<string, mixed> Same structure as extract().
     */
    public static function fromHtml(string $html, string $url): array
    {
        $extractor = new self();
        return $extractor->extract($html, $url);
    }
}
