<?php
/**
 * ===========================================
 * FLOWBOT DCI - LINK EXTRACTOR v1.0
 * ===========================================
 * Extracts and normalizes links from HTML content
 * Part of the Deep Crawler system
 *
 * Features:
 * - Extract all <a href=""> links from HTML
 * - Normalize relative URLs to absolute
 * - Filter by domain (same-domain or cross-domain)
 * - Filter by URL patterns (include/exclude regex)
 * - Deduplicate against visited URLs
 */

declare(strict_types=1);

namespace FlowbotDCI\Services;

/**
 * Pulls anchor links out of an HTML document and turns them into absolute,
 * fragment-free http(s) URLs, with helpers for filtering and de-duplicating
 * the resulting lists.
 */
class LinkExtractor
{
    public const VERSION = '1.0';

    /**
     * Extract all links from HTML content.
     *
     * @param string $html HTML content to parse
     * @param string $baseUrl Absolute URL of the page, used to resolve relative links
     * @return array List of unique, normalized absolute URLs in document order;
     *               empty when the HTML is empty or the base URL has no scheme/host
     */
    public function extractLinks(string $html, string $baseUrl): array
    {
        $urls = array_column($this->collectAnchors($html, $baseUrl), 'url');

        return array_values(array_unique($urls));
    }

    /**
     * Normalize a URL (relative to absolute).
     *
     * Handles absolute http(s) URLs, protocol-relative URLs (which inherit the
     * scheme of $baseOrigin), root-relative URLs and directory-relative URLs.
     * Any other scheme (mailto:, javascript:, ftp:, ...) yields null.
     *
     * @param string $url URL to normalize
     * @param string $baseOrigin Base origin (scheme://host[:port])
     * @param string $baseDir Base directory path used for relative references
     * @return string|null Normalized absolute URL, or null when the input is
     *                     blank or uses a non-http(s) scheme
     */
    public function normalizeUrl(string $url, string $baseOrigin, string $baseDir = '/'): ?string
    {
        $url = trim($url);

        // Compare against '' explicitly: empty() would also reject the valid relative link "0".
        if ($url === '') {
            return null;
        }

        // Already absolute URL
        if (preg_match('#^https?://#i', $url) === 1) {
            return $this->cleanUrl($url);
        }

        // Protocol-relative URL (//example.com/path) inherits the scheme of the base
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($baseOrigin, PHP_URL_SCHEME);
            if (!is_string($scheme) || $scheme === '') {
                $scheme = 'https';
            }

            return $this->cleanUrl($scheme . ':' . $url);
        }

        // Any remaining "scheme:" prefix is not crawlable over HTTP
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return null;
        }

        // Root-relative URL (/path/to/page)
        if (str_starts_with($url, '/')) {
            return $this->cleanUrl($baseOrigin . $url);
        }

        // Relative URL (path/to/page or ../path): only the path portion takes part
        // in dot-segment resolution, so the query/fragment suffix is set aside first.
        $pathLength = strcspn($url, '?#');
        $relativePath = substr($url, 0, $pathLength);
        $suffix = substr($url, $pathLength);

        $resolvedPath = $this->resolvePath($baseDir, $relativePath);
        if ($relativePath === '') {
            // Query-/fragment-only reference: keep pointing at the directory itself
            $resolvedPath = rtrim($resolvedPath, '/') . '/';
        }

        return $this->cleanUrl($baseOrigin . $resolvedPath . $suffix);
    }

    /**
     * Resolve relative path with ../ and ./ handling.
     *
     * @param string $basePath Base directory path
     * @param string $relativePath Relative path to resolve (no query or fragment)
     * @return string Resolved absolute path, always starting with "/"
     */
    private function resolvePath(string $basePath, string $relativePath): string
    {
        // Start from base directory and treat backslashes as path separators
        $path = str_replace('\\', '/', rtrim($basePath, '/') . '/' . $relativePath);

        $segments = [];
        foreach (explode('/', $path) as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }

            if ($segment === '..') {
                // Going above the root is silently clamped to the root
                array_pop($segments);
                continue;
            }

            $segments[] = $segment;
        }

        return '/' . implode('/', $segments);
    }

    /**
     * Clean URL by removing the fragment and normalizing slashes.
     *
     * Slash normalization is applied to the part before the query string only,
     * so query values are passed through untouched.
     *
     * @param string $url URL to clean
     * @return string Cleaned URL
     */
    private function cleanUrl(string $url): string
    {
        // Remove fragment (#anchor)
        $hashPos = strpos($url, '#');
        if ($hashPos !== false) {
            $url = substr($url, 0, $hashPos);
        }

        $url = trim($url);

        // Separate the query string so it is not rewritten below
        $queryPos = strpos($url, '?');
        $head = $queryPos === false ? $url : substr($url, 0, $queryPos);
        $query = $queryPos === false ? '' : substr($url, $queryPos);

        // Ensure no double slashes (except after protocol); keep the input if PCRE fails
        $head = preg_replace('#([^:])//+#', '$1/', $head) ?? $head;

        // Remove trailing slash for consistency (except for the bare root "scheme://host/")
        $isRoot = preg_match('#^https?://[^/]+/$#i', $head) === 1;
        if ($query === '' && strlen($head) > 1 && str_ends_with($head, '/') && !$isRoot) {
            $head = rtrim($head, '/');
        }

        return $head . $query;
    }

    /**
     * Filter links by domain.
     *
     * A URL counts as "same domain" when its host equals $domain or is a
     * subdomain of it (case-insensitive). URLs without a host are dropped.
     *
     * @param array $links Array of URLs
     * @param string $domain Domain to filter by (e.g., "example.com")
     * @param bool $sameDomainOnly If true keep same-domain links, otherwise keep only the others
     * @return array Filtered URLs, re-indexed
     */
    public function filterByDomain(array $links, string $domain, bool $sameDomainOnly = true): array
    {
        $domain = strtolower(trim($domain));

        $matches = static function ($url) use ($domain, $sameDomainOnly): bool {
            $host = parse_url($url, PHP_URL_HOST);
            if (!is_string($host) || $host === '') {
                return false;
            }

            $host = strtolower($host);
            $isSameDomain = $host === $domain || str_ends_with($host, '.' . $domain);

            return $sameDomainOnly ? $isSameDomain : !$isSameDomain;
        };

        return array_values(array_filter($links, $matches));
    }

    /**
     * Filter links by URL patterns (regex).
     *
     * Exclusions win over inclusions. With no include patterns, every URL that
     * is not excluded is kept.
     *
     * @param array $links Array of URLs
     * @param array $includePatterns PCRE patterns; a URL must match at least one (when non-empty)
     * @param array $excludePatterns PCRE patterns; a URL matching any of them is dropped
     * @return array Filtered URLs, re-indexed
     */
    public function filterByPattern(array $links, array $includePatterns = [], array $excludePatterns = []): array
    {
        $matchesAny = static function (array $patterns, $url): bool {
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $url) === 1) {
                    return true;
                }
            }

            return false;
        };

        $keep = static function ($url) use ($includePatterns, $excludePatterns, $matchesAny): bool {
            if ($matchesAny($excludePatterns, $url)) {
                return false;
            }

            return $includePatterns === [] || $matchesAny($includePatterns, $url);
        };

        return array_values(array_filter($links, $keep));
    }

    /**
     * Filter out common non-content URLs (static assets, auth, cart, admin, API paths).
     *
     * @param array $links Array of URLs
     * @return array Filtered URLs
     */
    public function filterNonContent(array $links): array
    {
        $excludePatterns = [
            '#\.(jpg|jpeg|png|gif|webp|svg|ico|bmp|tiff)(\?|$)#i',  // Images
            '#\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar|7z)(\?|$)#i',  // Documents
            '#\.(mp3|mp4|wav|ogg|avi|mov|wmv|flv|webm)(\?|$)#i',  // Media
            '#\.(css|js|json|xml|rss|atom)(\?|$)#i',  // Assets
            '#/(login|logout|signin|signout|register|signup|auth)#i',  // Auth pages
            '#/(cart|checkout|basket|wishlist)#i',  // E-commerce
            '#/(api|ajax|feed|rss)/#i',  // API endpoints
            '#\?.*replytocom=#i',  // WordPress comment replies
            '#/wp-admin/#i',  // WordPress admin
            '#/admin/#i',  // Admin panels
        ];

        return $this->filterByPattern($links, [], $excludePatterns);
    }

    /**
     * Deduplicate links against already visited URLs.
     *
     * Both sides are compared in cleaned form (no fragment, normalized slashes);
     * the returned entries are the original, unmodified link strings.
     *
     * @param array $links Array of URLs to check
     * @param array $visited Array of already visited URLs
     * @return array New URLs not in visited, re-indexed
     */
    public function deduplicate(array $links, array $visited): array
    {
        // Flip into a lookup table so each membership test is a hash lookup
        $visitedLookup = array_flip(array_map(
            fn ($url): string => $this->cleanUrl((string) $url),
            $visited
        ));

        return array_values(array_filter(
            $links,
            fn ($url): bool => !isset($visitedLookup[$this->cleanUrl($url)])
        ));
    }

    /**
     * Extract domain from URL.
     *
     * @param string $url URL to extract domain from
     * @return string|null Host component, or null if the URL has none or cannot be parsed
     */
    public function extractDomain(string $url): ?string
    {
        $host = parse_url($url, PHP_URL_HOST);

        return is_string($host) ? $host : null;
    }

    /**
     * Check if URL is valid and crawlable.
     *
     * @param string $url URL to validate
     * @return bool True if the URL passes FILTER_VALIDATE_URL, is http or https, and has a host
     */
    public function isValidUrl(string $url): bool
    {
        // Must be valid URL
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }

        // Must be HTTP or HTTPS
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        // Must have a host
        $host = parse_url($url, PHP_URL_HOST);

        return is_string($host) && $host !== '';
    }

    /**
     * Extract links with additional metadata.
     *
     * Unlike extractLinks(), duplicates are kept: one entry is returned per anchor.
     *
     * @param string $html HTML content
     * @param string $baseUrl Base URL
     * @return array Array of ['url' => string, 'text' => string, 'rel' => ?string, 'title' => ?string]
     */
    public function extractLinksWithMetadata(string $html, string $baseUrl): array
    {
        $links = [];

        foreach ($this->collectAnchors($html, $baseUrl) as ['url' => $url, 'node' => $anchor]) {
            $links[] = [
                'url' => $url,
                'text' => trim($anchor->textContent),
                'rel' => $this->attributeOrNull($anchor, 'rel'),
                'title' => $this->attributeOrNull($anchor, 'title'),
            ];
        }

        return $links;
    }

    /**
     * Parse the HTML and return every anchor whose href resolves to a valid URL.
     *
     * @param string $html HTML content
     * @param string $baseUrl Absolute URL of the page
     * @return array List of ['url' => string, 'node' => \DOMElement] in document order
     */
    private function collectAnchors(string $html, string $baseUrl): array
    {
        if ($html === '' || $baseUrl === '') {
            return [];
        }

        $base = parse_url($baseUrl);
        if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
            return [];
        }

        // The port is part of the origin; dropping it would point links at the wrong server
        $baseOrigin = "{$base['scheme']}://{$base['host']}";
        if (isset($base['port'])) {
            $baseOrigin .= ':' . $base['port'];
        }

        $basePath = $base['path'] ?? '/';
        $baseDir = $this->baseDirectory($basePath);

        // Collect libxml errors internally while parsing malformed HTML, then put
        // the process-wide setting back the way the caller had it.
        $previousSetting = libxml_use_internal_errors(true);
        try {
            $dom = new \DOMDocument();
            $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOERROR | LIBXML_NOWARNING);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        $found = [];

        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = trim($anchor->getAttribute('href'));

            // Skip missing hrefs and same-page #anchors
            if ($href === '' || $href[0] === '#') {
                continue;
            }

            // A query-only reference applies to the current document, not its directory
            $url = $href[0] === '?'
                ? $this->cleanUrl($baseOrigin . $basePath . $href)
                : $this->normalizeUrl($href, $baseOrigin, $baseDir);

            if ($url !== null && filter_var($url, FILTER_VALIDATE_URL) !== false) {
                $found[] = ['url' => $url, 'node' => $anchor];
            }
        }

        return $found;
    }

    /**
     * Directory part of a URL path: everything up to and including the last "/".
     *
     * Unlike dirname(), a path that already ends in "/" is its own directory
     * ("/blog/" stays "/blog/" instead of becoming "/").
     *
     * @param string $path URL path component
     * @return string Directory path, "/" when the path contains no slash
     */
    private function baseDirectory(string $path): string
    {
        $lastSlash = strrpos($path, '/');

        return $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);
    }

    /**
     * Read an attribute, mapping "absent or empty" to null.
     *
     * @param \DOMElement $element Element to read from
     * @param string $name Attribute name
     * @return string|null Attribute value, or null when it is missing or empty
     */
    private function attributeOrNull(\DOMElement $element, string $name): ?string
    {
        $value = $element->getAttribute($name);

        return $value === '' ? null : $value;
    }
}
