<?php
/**
 * ===========================================
 * FLOWBOT DCI - SOCIAL MEDIA EXTRACTOR v2.2
 * ===========================================
 * Extracts metadata from social media posts
 * Supports: Twitter/X, Facebook, Instagram, TikTok, Telegram
 *
 * This extractor uses OpenGraph and Twitter Cards tags
 * to extract title, description, thumbnail, and embed URLs
 * from public social media posts without requiring API authentication.
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

/**
 * Extracts display metadata (title, description, thumbnail, embed URL, ...)
 * for public social media posts from the post URL and the page HTML.
 *
 * The class performs no network access: callers pass in the HTML they fetched.
 * Metadata is read from OpenGraph / Twitter Cards <meta> tags, and identifiers
 * (author, post id, shortcode, ...) are derived from the URL itself.
 */
class SocialMediaExtractor
{
    /**
     * Domains mapped to the platform identifier they belong to.
     * A host matches when it equals the domain or is a subdomain of it.
     */
    private const PLATFORM_DOMAINS = [
        'twitter.com' => 'twitter',
        'x.com' => 'twitter',
        'facebook.com' => 'facebook',
        'fb.com' => 'facebook',
        'fb.watch' => 'facebook',
        'instagram.com' => 'instagram',
        'tiktok.com' => 'tiktok',
        'telegram.org' => 'telegram',
        't.me' => 'telegram',
    ];

    /**
     * OpenGraph tag name => result key. Order matters: for result keys fed by
     * several tags, the first tag present in the page wins.
     */
    private const OPEN_GRAPH_TAGS = [
        'og:title' => 'title',
        'og:description' => 'description',
        'og:image' => 'thumbnail',
        'og:image:url' => 'thumbnail',
        'og:image:secure_url' => 'thumbnail',
        'og:video' => 'video_url',
        'og:video:url' => 'video_url',
        'og:url' => 'canonical_url',
        'og:site_name' => 'site_name',
        'og:type' => 'og_type',
    ];

    /**
     * Twitter Cards tag name => result key (same ordering rule as above).
     */
    private const TWITTER_CARD_TAGS = [
        'twitter:title' => 'title',
        'twitter:description' => 'description',
        'twitter:image' => 'thumbnail',
        'twitter:image:src' => 'thumbnail',
        'twitter:player' => 'player_url',
        'twitter:creator' => 'author',
        'twitter:site' => 'site_account',
    ];

    /**
     * First path segments on instagram.com that are routes, not usernames.
     */
    private const INSTAGRAM_RESERVED_PATHS = ['p', 'reel', 'reels', 'tv', 'stories', 'explore', 'accounts'];

    /**
     * Extract metadata from social media URL
     *
     * @param string $url The URL to extract from
     * @param string $html The HTML content of the page
     * @return array<string, string> Extracted metadata; empty when the URL does
     *                               not belong to a supported platform
     */
    public function extract(string $url, string $html): array
    {
        return match ($this->detectPlatform($url)) {
            'twitter' => $this->extractTwitter($url, $html),
            'facebook' => $this->extractFacebook($url, $html),
            'instagram' => $this->extractInstagram($url, $html),
            'tiktok' => $this->extractTikTok($url, $html),
            'telegram' => $this->extractTelegram($url, $html),
            default => [],
        };
    }

    /**
     * Detect social media platform from URL
     *
     * The host must be exactly a known domain or one of its subdomains; a
     * substring test would misclassify hosts such as "dropbox.com" (contains
     * "x.com") or "about.me" (contains "t.me").
     *
     * @param string $url URL including a scheme (a URL without a host yields null)
     * @return string|null Platform identifier, or null when unsupported/unparseable
     */
    public function detectPlatform(string $url): ?string
    {
        // parse_url() yields null when the host is absent and false on
        // seriously malformed URLs; neither is a usable host.
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return null;
        }

        // Normalise case and a trailing root dot ("twitter.com.").
        $host = rtrim(strtolower($host), '.');

        foreach (self::PLATFORM_DOMAINS as $domain => $platform) {
            if ($host === $domain || str_ends_with($host, '.' . $domain)) {
                return $platform;
            }
        }

        return null;
    }

    /**
     * Check if URL is from a supported social media platform
     */
    public function isSupportedPlatform(string $url): bool
    {
        return $this->detectPlatform($url) !== null;
    }

    /**
     * Extract metadata from Twitter/X posts
     *
     * @return array<string, string> Always has platform/type/url; adds tag
     *                               metadata plus author, tweet_id and
     *                               embed_url when the URL is a status link
     */
    private function extractTwitter(string $url, string $html): array
    {
        $data = [
            'platform' => 'twitter',
            'type' => 'tweet',
            'url' => $url,
        ];

        // 1. OpenGraph is the preferred source.
        $data = array_merge($data, $this->extractOpenGraph($html));

        // 2. Twitter Cards are only a fallback: array union keeps every key
        //    that is already set and adds the ones OpenGraph did not provide.
        $data += $this->extractTwitterCards($html);

        // 3. A status URL is authoritative for author, tweet ID and embed URL.
        if (preg_match('#(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)#i', $url, $matches)) {
            $data['author'] = '@' . $matches[1];
            $data['tweet_id'] = $matches[2];
            $data['embed_url'] = "https://platform.twitter.com/embed/Tweet.html?id={$matches[2]}";
        }

        // 4. Otherwise fall back to the first path segment of the URL
        //    (query string and fragment excluded).
        if (empty($data['author'])) {
            if (preg_match('#(?:twitter\.com|x\.com)/([^/?\#\s]+)#i', $url, $matches)) {
                $data['author'] = '@' . $matches[1];
            }
        }

        return $data;
    }

    /**
     * Extract metadata from Facebook posts
     *
     * @return array<string, string> platform/url/type, OpenGraph metadata
     *                               (including video_url from og:video), and
     *                               post_id when the URL carries a numeric one
     */
    private function extractFacebook(string $url, string $html): array
    {
        $data = [
            'platform' => 'facebook',
            'url' => $url,
        ];

        // og:video / og:video:url are already mapped to video_url here.
        $data = array_merge($data, $this->extractOpenGraph($html));

        // fb.watch short links are video links but carry no "/watch" path
        // segment, so they have to be recognised by host.
        $host = parse_url($url, PHP_URL_HOST);
        $isWatchShortLink = is_string($host)
            && preg_match('/(?:^|\.)fb\.watch\.?$/i', $host) === 1;

        // Detect content type from URL; the first matching rule wins.
        $data['type'] = match (true) {
            $isWatchShortLink,
            str_contains($url, '/watch'),
            str_contains($url, '/videos/') => 'video',
            str_contains($url, '/photo') => 'photo', // also covers "/photos/"
            str_contains($url, '/reel') => 'reel',
            str_contains($url, '/events/') => 'event',
            default => 'post',
        };

        // Extract post ID for embedding
        if (preg_match('#facebook\.com/.+/(?:posts|videos)/(\d+)#', $url, $matches)) {
            $data['post_id'] = $matches[1];
        }

        return $data;
    }

    /**
     * Extract metadata from Instagram posts
     *
     * @return array<string, string> platform/url/type, OpenGraph metadata,
     *                               shortcode + embed_url for post/reel/tv
     *                               links, and author for profile-style URLs
     */
    private function extractInstagram(string $url, string $html): array
    {
        $data = [
            'platform' => 'instagram',
            'url' => $url,
        ];

        $data = array_merge($data, $this->extractOpenGraph($html));

        // Detect content type from URL; anything unrecognised is a profile.
        $data['type'] = match (true) {
            str_contains($url, '/reel/') => 'reel',
            str_contains($url, '/p/') => 'post',
            str_contains($url, '/stories/') => 'story',
            str_contains($url, '/tv/') => 'igtv',
            default => 'profile',
        };

        // Extract shortcode for embedding
        if (preg_match('#instagram\.com/(?:p|reel|tv)/([A-Za-z0-9_-]+)#i', $url, $matches)) {
            $data['shortcode'] = $matches[1];
            $data['embed_url'] = "https://www.instagram.com/p/{$matches[1]}/embed/";
        }

        // The first path segment is the username unless it is a known route.
        if (preg_match('#instagram\.com/([^/?\#]+)#i', $url, $matches)) {
            $username = $matches[1];
            if (!in_array($username, self::INSTAGRAM_RESERVED_PATHS, true)) {
                $data['author'] = '@' . $username;
            }
        }

        return $data;
    }

    /**
     * Extract metadata from TikTok videos
     *
     * @return array<string, string> platform/type/url, OpenGraph metadata,
     *                               author/video_id/embed_url for full video
     *                               URLs, short_code for vm.tiktok.com links
     */
    private function extractTikTok(string $url, string $html): array
    {
        $data = [
            'platform' => 'tiktok',
            'type' => 'video',
            'url' => $url,
        ];

        $data = array_merge($data, $this->extractOpenGraph($html));

        // Extract video ID and username from URL
        if (preg_match('#tiktok\.com/@([^/]+)/video/(\d+)#i', $url, $matches)) {
            $data['author'] = '@' . $matches[1];
            $data['video_id'] = $matches[2];
            $data['embed_url'] = "https://www.tiktok.com/embed/v2/{$matches[2]}";
        }

        // Handle short URLs (vm.tiktok.com)
        if (preg_match('#vm\.tiktok\.com/([A-Za-z0-9]+)#i', $url, $matches)) {
            $data['short_code'] = $matches[1];
        }

        return $data;
    }

    /**
     * Extract metadata from Telegram posts/channels
     *
     * @return array<string, string> platform/url, OpenGraph metadata, channel,
     *                               type ("message" or "channel") and
     *                               message_id when the URL names a message
     */
    private function extractTelegram(string $url, string $html): array
    {
        $data = [
            'platform' => 'telegram',
            'url' => $url,
        ];

        $data = array_merge($data, $this->extractOpenGraph($html));

        // Extract channel name and message ID from URL. The optional "s/"
        // prefix is the web-preview route (t.me/s/<channel>), not a channel,
        // and the channel capture stops at the query string / fragment.
        if (preg_match('#t\.me/(?:s/)?([^/?\#]+)(?:/(\d+))?#i', $url, $matches)) {
            $data['channel'] = $matches[1];
            if (!empty($matches[2])) {
                $data['message_id'] = $matches[2];
                $data['type'] = 'message';
            } else {
                $data['type'] = 'channel';
            }
        }

        // Also check telegram.org URLs
        if (preg_match('#telegram\.org/([^/?\#]+)#i', $url, $matches)) {
            $data['channel'] = $matches[1];
            $data['type'] = 'channel';
        }

        return $data;
    }

    /**
     * Extract OpenGraph metadata from HTML
     *
     * @return array<string, string> Subset of title, description, thumbnail,
     *                               video_url, canonical_url, site_name, og_type
     */
    private function extractOpenGraph(string $html): array
    {
        return $this->mapMetaTags($html, self::OPEN_GRAPH_TAGS);
    }

    /**
     * Extract Twitter Cards metadata from HTML
     *
     * @return array<string, string> Subset of title, description, thumbnail,
     *                               player_url, author, site_account
     */
    private function extractTwitterCards(string $html): array
    {
        return $this->mapMetaTags($html, self::TWITTER_CARD_TAGS);
    }

    /**
     * Build a result array from the page's <meta> tags using a tag => key map.
     *
     * @param array<string, string> $tagMap Lower-case meta tag name => result key,
     *                                      in priority order
     * @return array<string, string> Entity-decoded values keyed by result key
     */
    private function mapMetaTags(string $html, array $tagMap): array
    {
        $metaTags = $this->collectMetaTags($html);
        $data = [];

        foreach ($tagMap as $tagName => $key) {
            // An earlier (higher-priority) tag already supplied this key.
            if (!empty($data[$key]) || !isset($metaTags[$tagName])) {
                continue;
            }
            $data[$key] = html_entity_decode($metaTags[$tagName], ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }

        return $data;
    }

    /**
     * Index every <meta> tag that has non-empty content by its property/name.
     *
     * Attributes may appear in any order and alongside unrelated attributes,
     * and a value may contain the other quote character (content="Don't").
     * Only quoted attribute values are recognised. When a tag name occurs
     * several times, the first occurrence is kept.
     *
     * @return array<string, string> Lower-cased property/name => raw
     *                               (still entity-encoded) content value
     */
    private function collectMetaTags(string $html): array
    {
        // Possessive quantifiers keep the scan linear and let a quoted value
        // contain ">" without ending the tag early.
        $tagPattern = '/<meta\s+((?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+)>/i';
        if (!preg_match_all($tagPattern, $html, $tagMatches)) {
            return [];
        }

        // Branch reset (?|...) puts the value in group 2 for either quote style.
        $attributePattern = '/([^\s"\'=<>\/]+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\')/';
        $tags = [];

        foreach ($tagMatches[1] as $attributeText) {
            if (!preg_match_all($attributePattern, $attributeText, $pairs, PREG_SET_ORDER)) {
                continue;
            }

            $attributes = [];
            foreach ($pairs as [, $attributeName, $attributeValue]) {
                $attributes[strtolower($attributeName)] ??= $attributeValue;
            }

            $content = $attributes['content'] ?? '';
            if ($content === '') {
                continue;
            }

            // A tag may be identified by property=, name=, or both.
            foreach (['property', 'name'] as $identifier) {
                $tagName = strtolower(trim($attributes[$identifier] ?? ''));
                if ($tagName !== '') {
                    $tags[$tagName] ??= $content;
                }
            }
        }

        return $tags;
    }

    /**
     * Get fallback title from HTML <title> tag
     *
     * @return string|null Trimmed, entity-decoded title text, or null when the
     *                     page has no <title> element with text content
     */
    public function extractHtmlTitle(string $html): ?string
    {
        if (preg_match('/<title[^>]*>([^<]+)<\/title>/i', $html, $matches) !== 1) {
            return null;
        }

        return html_entity_decode(trim($matches[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Get fallback description from meta description
     *
     * @return string|null Trimmed, entity-decoded description, or null when
     *                     the page has no description meta tag with content
     */
    public function extractMetaDescription(string $html): ?string
    {
        $description = $this->collectMetaTags($html)['description'] ?? null;
        if ($description === null) {
            return null;
        }

        return html_entity_decode(trim($description), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Check if extraction was successful (has at least title or thumbnail)
     *
     * @param array<string, mixed> $data Result of extract()
     */
    public function isSuccessful(array $data): bool
    {
        return !empty($data['title']) || !empty($data['thumbnail']);
    }
}
