<?php
/**
 * ===========================================
 * FLOWBOT DCI - TAG EXTRACTOR
 * ===========================================
 * Extracts keywords/tags from text
 */

declare(strict_types=1);

namespace FlowbotDCI\Utils;

/**
 * Extracts keyword tags and hashtags from free-form text.
 *
 * Keywords are lowercased tokens that survive three filters: they are not in
 * the stop-word list, they are at least the configured minimum length, and
 * they are not numeric.
 */
class TagExtractor
{
    /**
     * ASCII punctuation replaced by a space before tokenizing.
     *
     * The class contains only single-byte characters, so it is safe to apply
     * byte-wise (without the `u` modifier) to UTF-8 text.
     */
    private const PUNCTUATION_PATTERN = '/[.,\/#!$%\^&\*;:{}=\-_`~()"\']/';

    /** Characters that `ucwords()` treats as word delimiters by default. */
    private const WORD_START_PATTERN = '/(?<![^ \t\r\n\f\x0B])[^ \t\r\n\f\x0B]/u';

    /** @var string[] Lowercase stop words that are never returned as tags. */
    private array $commonWords = [
        // Articles and pronouns
        'a', 'an', 'the', 'this', 'that', 'these', 'those', 'it', 'its',
        'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'him',
        'her', 'his', 'they', 'them', 'their',

        // Prepositions
        'in', 'at', 'on', 'with', 'to', 'for', 'of', 'by', 'from', 'as',
        'into', 'about', 'up', 'over', 'after', 'out', 'than', 'through',

        // Conjunctions
        'and', 'or', 'but', 'if', 'then', 'so', 'because', 'while', 'when',
        'where', 'which', 'who', 'what', 'how', 'why',

        // Common verbs
        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
        'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
        'can', 'may', 'might', 'must', 'shall', 'get', 'got', 'make', 'made',

        // Adverbs
        'also', 'just', 'only', 'more', 'most', 'very', 'well', 'now',
        'here', 'there', 'all', 'any', 'some', 'other', 'new', 'good',

        // Time words
        'time', 'year', 'years', 'day', 'days', 'week', 'month', 'today',
        'yesterday', 'tomorrow',

        // Misc
        '–', '-', '|', ':', ';', ',', '.', '!', '?',
    ];

    /** Minimum number of characters (not bytes) a keyword must have. */
    private int $minWordLength = 3;

    /** Default cap on the number of tags returned. */
    private int $maxTags = 10;

    /**
     * Extract unique tags from text, in order of first appearance.
     *
     * @param string   $text    Text to analyse.
     * @param int|null $maxTags Maximum number of tags; null uses the configured
     *                          default. Negative values are treated as 0.
     *
     * @return string[] List of lowercase tags.
     */
    public function extract(string $text, ?int $maxTags = null): array
    {
        $limit = $this->resolveLimit($maxTags);
        $unique = array_unique($this->keywordCandidates($text));

        // array_unique() keeps the original keys, so reindex before slicing.
        return array_slice(array_values($unique), 0, $limit);
    }

    /**
     * Extract tags together with how often each one occurs.
     *
     * @param string   $text    Text to analyse.
     * @param int|null $maxTags Maximum number of tags; null uses the configured
     *                          default. Negative values are treated as 0.
     *
     * @return array<string, int> Tag => occurrence count, most frequent first.
     */
    public function extractWeighted(string $text, ?int $maxTags = null): array
    {
        $limit = $this->resolveLimit($maxTags);

        // Counts are keyed by word in order of first appearance.
        $frequency = array_count_values($this->keywordCandidates($text));

        arsort($frequency);

        return array_slice($frequency, 0, $limit, true);
    }

    /**
     * Set minimum word length
     *
     * @param int $length Minimum number of characters a tag must have.
     */
    public function setMinWordLength(int $length): self
    {
        $this->minWordLength = $length;

        return $this;
    }

    /**
     * Set maximum tags to return
     *
     * @param int $count Default cap used when a call does not pass its own.
     */
    public function setMaxTags(int $count): self
    {
        $this->maxTags = $count;

        return $this;
    }

    /**
     * Add custom common words to filter
     *
     * @param string[] $words Extra stop words; they are stored lowercased
     *                        because the text is lowercased before matching.
     */
    public function addCommonWords(array $words): self
    {
        $this->commonWords = array_merge($this->commonWords, array_map('mb_strtolower', $words));

        return $this;
    }

    /**
     * Extract hashtags from text
     *
     * @param string $text Text to scan.
     *
     * @return string[] Hashtag bodies without the leading '#', in order of
     *                  appearance (duplicates are kept).
     */
    public function extractHashtags(string $text): array
    {
        // Unicode mode makes \w match non-ASCII letters, so "#café" is not
        // cut off at "caf". It fails on invalid UTF-8, in which case we fall
        // back to byte-wise matching rather than returning nothing.
        if (preg_match_all('/#(\w+)/u', $text, $matches) === false) {
            preg_match_all('/#(\w+)/', $text, $matches);
        }

        return $matches[1] ?? [];
    }

    /**
     * Convert tags to hashtag format
     *
     * Each word of a tag is capitalised and spaces are removed, e.g.
     * "hello world" becomes "#HelloWorld".
     *
     * @param string[] $tags Tags to convert; array keys are preserved.
     *
     * @return string[]
     */
    public function toHashtags(array $tags): array
    {
        return array_map(
            fn (string $tag): string => '#' . str_replace(' ', '', $this->capitalizeWords($tag)),
            $tags
        );
    }

    /**
     * Tokenize text and drop stop words, short words and numeric tokens.
     *
     * @return string[] Lowercase keywords in text order, duplicates included.
     */
    private function keywordCandidates(string $text): array
    {
        // preg_replace() returns null on a PCRE error; keep the raw text then.
        $stripped = preg_replace(self::PUNCTUATION_PATTERN, ' ', $text) ?? $text;

        $words = preg_split('/\s+/', mb_strtolower($stripped), -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return [];
        }

        // Flipping once gives an exact, O(1) membership test per word.
        $stopWords = array_flip($this->commonWords);
        $minLength = $this->minWordLength;

        return array_values(array_filter(
            $words,
            static function (string $word) use ($stopWords, $minLength): bool {
                return !isset($stopWords[$word])
                    && mb_strlen($word) >= $minLength
                    && !is_numeric($word);
            }
        ));
    }

    /**
     * Pick the effective tag limit, never below zero.
     *
     * A negative length would make array_slice() drop items from the end
     * instead of limiting the result.
     */
    private function resolveLimit(?int $maxTags): int
    {
        return max(0, $maxTags ?? $this->maxTags);
    }

    /**
     * Multibyte-aware equivalent of ucwords(): uppercase the first character
     * of every word and leave the remaining characters untouched.
     */
    private function capitalizeWords(string $tag): string
    {
        $capitalized = preg_replace_callback(
            self::WORD_START_PATTERN,
            static fn (array $match): string => mb_strtoupper($match[0]),
            $tag
        );

        // Null means the tag is not valid UTF-8; use the byte-wise variant.
        return $capitalized ?? ucwords($tag);
    }
}
