-- ============================================
-- FLOWBOT DCI - UNIFIED CRAWLER SCHEMA v7.0
-- ============================================
-- Professional crawler system database schema
-- Run this migration to enable unified crawler features
-- ============================================

-- ============================================
-- 1. ENHANCE EXISTING CRAWLER_JOBS TABLE
-- ============================================

-- Base definition of crawler_jobs. IF NOT EXISTS turns this into a no-op when
-- the table is already present; the ALTER statements further down then add
-- the remaining columns either way.
CREATE TABLE IF NOT EXISTS crawler_jobs (
    id            VARCHAR(50)  NOT NULL,
    type          VARCHAR(50)  NOT NULL DEFAULT 'deep',
    seed_url      TEXT,
    search_term   VARCHAR(500),
    status        ENUM('pending', 'running', 'paused', 'completed', 'failed', 'cancelled') DEFAULT 'pending',

    -- Progress counters
    pages_crawled INT UNSIGNED DEFAULT 0,
    pages_found   INT UNSIGNED DEFAULT 0,
    errors        INT UNSIGNED DEFAULT 0,

    -- Row bookkeeping; updated_at is refreshed by the server on every UPDATE
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at    DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    completed_at  DATETIME NULL,

    PRIMARY KEY (id),
    KEY idx_status (status),
    KEY idx_created (created_at)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;

-- Extend crawler_jobs with the v7.0 columns and indexes.
-- Every clause carries IF NOT EXISTS, so columns and indexes that are already
-- in place are skipped on a re-run. Most AFTER clauses reference a column
-- added earlier in this block, so the statement order must be kept.
-- NOTE(review): ADD COLUMN / ADD INDEX ... IF NOT EXISTS is MariaDB syntax —
-- confirm the target server before running this against stock MySQL.

-- Owning user
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS user_id INT UNSIGNED NULL
    AFTER id;
ALTER TABLE crawler_jobs
    ADD INDEX IF NOT EXISTS idx_user_id (user_id);

-- Crawl mode: deep, search, sitemap, infinite or hybrid
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS mode ENUM('deep', 'search', 'sitemap', 'infinite', 'hybrid') DEFAULT 'deep'
    AFTER type;

-- Queue ordering priority (1 = highest, 10 = lowest)
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS priority TINYINT UNSIGNED DEFAULT 5
    AFTER mode;

-- Limits applied to a crawl
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS max_pages INT UNSIGNED DEFAULT 100
    AFTER priority;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS max_depth INT UNSIGNED DEFAULT 3
    AFTER max_pages;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS parallel_count INT UNSIGNED DEFAULT 5
    AFTER max_depth;

-- Domain and URL filters
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS same_domain_only BOOLEAN DEFAULT TRUE
    AFTER parallel_count;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS include_patterns JSON NULL COMMENT 'URL patterns to include'
    AFTER same_domain_only;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS exclude_patterns JSON NULL COMMENT 'URL patterns to exclude'
    AFTER include_patterns;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS forced_domains JSON NULL COMMENT 'Domains to always accept regardless of relevance'
    AFTER exclude_patterns;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS blocked_domains JSON NULL COMMENT 'Domains to never crawl'
    AFTER forced_domains;

-- How robots.txt is treated
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS robots_policy ENUM('respect', 'ignore', 'cautious') DEFAULT 'respect'
    AFTER blocked_domains;

-- Relevance cut-off
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS relevance_threshold FLOAT DEFAULT 2.0 COMMENT 'Minimum relevance score to accept'
    AFTER robots_policy;

-- Session identifier used for checkpoint/resume
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS session_id VARCHAR(64) NULL COMMENT 'Session ID for checkpoint/resume'
    AFTER relevance_threshold;
ALTER TABLE crawler_jobs
    ADD INDEX IF NOT EXISTS idx_session (session_id);

-- Link from a sub-crawl to its parent job
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS parent_job_id VARCHAR(50) NULL COMMENT 'Parent job for sub-crawls'
    AFTER session_id;

-- Retry bookkeeping
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS retry_count INT UNSIGNED DEFAULT 0
    AFTER parent_job_id;
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS last_error TEXT NULL
    AFTER retry_count;

-- Free-form job metadata
ALTER TABLE crawler_jobs
    ADD COLUMN IF NOT EXISTS metadata JSON NULL COMMENT 'Additional job metadata'
    AFTER last_error;

-- Secondary indexes on the new columns
ALTER TABLE crawler_jobs
    ADD INDEX IF NOT EXISTS idx_priority_status (priority, status);
ALTER TABLE crawler_jobs
    ADD INDEX IF NOT EXISTS idx_mode (mode);


-- ============================================
-- 2. DOMAIN STATISTICS TABLE
-- ============================================

-- One row per domain (enforced by idx_domain_unique) holding visit counters,
-- response metrics, the cached robots.txt and block-list state.
CREATE TABLE IF NOT EXISTS crawler_domain_stats (
    id                     INT UNSIGNED NOT NULL AUTO_INCREMENT,
    domain                 VARCHAR(255) NOT NULL,

    -- Counters per visit outcome
    total_visits           INT UNSIGNED DEFAULT 0,
    successful_visits      INT UNSIGNED DEFAULT 0,
    failed_visits          INT UNSIGNED DEFAULT 0,
    timeout_count          INT UNSIGNED DEFAULT 0,
    rate_limit_hits        INT UNSIGNED DEFAULT 0,

    -- Response timing and transfer volume
    avg_response_time      FLOAT DEFAULT 0 COMMENT 'Average response time in milliseconds',
    min_response_time      FLOAT DEFAULT 0,
    max_response_time      FLOAT DEFAULT 0,
    total_bytes_downloaded BIGINT UNSIGNED DEFAULT 0,

    -- Outcome of the most recent visit
    last_visit             DATETIME NULL,
    last_http_code         SMALLINT UNSIGNED NULL,
    last_error             VARCHAR(500) NULL,

    -- Cached robots.txt and the delay it requests
    robots_txt_cached      TEXT NULL COMMENT 'Cached robots.txt content',
    robots_txt_expires     DATETIME NULL COMMENT 'When to refresh robots.txt',
    crawl_delay            INT UNSIGNED DEFAULT 0 COMMENT 'Crawl-delay from robots.txt in seconds',

    -- Block-list state
    is_blocked             BOOLEAN DEFAULT FALSE,
    block_reason           VARCHAR(255) NULL,
    blocked_at             DATETIME NULL,

    -- Marks social media domains
    is_social_media        BOOLEAN DEFAULT FALSE,

    -- Row bookkeeping; updated_at is refreshed by the server on every UPDATE
    created_at             DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at             DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    UNIQUE KEY idx_domain_unique (domain),
    KEY idx_blocked (is_blocked),
    KEY idx_last_visit (last_visit),
    KEY idx_social (is_social_media)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 3. CONTENT HASH TABLE (Duplicate Detection)
-- ============================================

-- Fingerprints of crawled pages for duplicate detection: one row per URL hash
-- (unique), with non-unique lookups by content hash and title hash.
-- NOTE(review): the column comments say MD5, whose hex digest is 32
-- characters, while the columns are 64 wide — confirm which digest the
-- application actually stores.
CREATE TABLE IF NOT EXISTS crawler_content_hashes (
    id               INT UNSIGNED NOT NULL AUTO_INCREMENT,

    -- Which URL this row describes
    url_hash         VARCHAR(64) NOT NULL COMMENT 'MD5 hash of normalized URL',
    original_url     TEXT NOT NULL,

    -- Fingerprints of the page itself
    content_hash     VARCHAR(64) NOT NULL COMMENT 'MD5 hash of normalized text content',
    title_hash       VARCHAR(64) NULL COMMENT 'MD5 hash of title',

    -- Sighting history; last_seen is refreshed by the server on every UPDATE
    first_seen       DATETIME DEFAULT CURRENT_TIMESTAMP,
    last_seen        DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    occurrence_count INT UNSIGNED DEFAULT 1,

    -- Preferred URL for this content
    canonical_url    TEXT NULL COMMENT 'The preferred/canonical URL for this content',

    -- Originating job
    first_job_id     VARCHAR(50) NULL COMMENT 'Job that first discovered this content',

    PRIMARY KEY (id),
    UNIQUE KEY idx_url_hash (url_hash),
    KEY idx_content_hash (content_hash),
    KEY idx_title_hash (title_hash),
    KEY idx_first_seen (first_seen)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 4. JOB QUEUE TABLE (Background Processing)
-- ============================================

-- Background work queue: one row per queued unit of work, with scheduling,
-- retry, worker-lock and error columns.
CREATE TABLE IF NOT EXISTS crawler_queue (
    id                INT UNSIGNED NOT NULL AUTO_INCREMENT,

    -- What kind of work this is and which crawl job it belongs to
    job_type          VARCHAR(50) NOT NULL COMMENT 'Type: crawl, process, export, etc.',
    job_id            VARCHAR(50) NULL COMMENT 'Reference to crawler_jobs.id if applicable',

    -- Work parameters
    payload           JSON NOT NULL COMMENT 'Job parameters and data',

    -- Ordering and earliest execution time
    priority          TINYINT UNSIGNED DEFAULT 5 COMMENT '1=highest, 10=lowest',
    scheduled_at      DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'When to execute',

    -- Lifecycle state
    status            ENUM('pending', 'processing', 'completed', 'failed', 'cancelled', 'retry') DEFAULT 'pending',

    -- Retry bookkeeping
    attempts          INT UNSIGNED DEFAULT 0,
    max_attempts      INT UNSIGNED DEFAULT 3,
    next_retry_at     DATETIME NULL,

    -- Execution timing
    started_at        DATETIME NULL,
    completed_at      DATETIME NULL,
    execution_time_ms INT UNSIGNED NULL COMMENT 'Execution duration in milliseconds',

    -- Failure details
    error_message     TEXT NULL,
    error_trace       TEXT NULL,

    -- Worker lock
    worker_id         VARCHAR(50) NULL COMMENT 'ID of worker processing this job',
    locked_at         DATETIME NULL COMMENT 'When worker acquired lock',
    lock_timeout      DATETIME NULL COMMENT 'When lock expires',

    -- Row bookkeeping; updated_at is refreshed by the server on every UPDATE
    created_at        DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at        DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    KEY idx_status_priority (status, priority, scheduled_at),
    KEY idx_worker (worker_id, status),
    KEY idx_job_id (job_id),
    KEY idx_scheduled (scheduled_at),
    KEY idx_retry (status, next_retry_at)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 5. ANALYTICS METRICS TABLE
-- ============================================

-- Aggregated crawler metrics per date and hour.
-- NOTE(review): idx_date_hour includes the nullable metric_hour column. A
-- UNIQUE index permits any number of rows whose indexed column is NULL, so
-- the key does not by itself keep daily aggregates (metric_hour NULL) to one
-- row per date; writers have to handle that case explicitly.
CREATE TABLE IF NOT EXISTS crawler_metrics (
    id                    INT UNSIGNED NOT NULL AUTO_INCREMENT,

    -- Time bucket
    metric_date           DATE NOT NULL,
    metric_hour           TINYINT UNSIGNED NULL COMMENT 'Hour of day (0-23), NULL for daily aggregates',

    -- Job counts
    total_jobs            INT UNSIGNED DEFAULT 0,
    completed_jobs        INT UNSIGNED DEFAULT 0,
    failed_jobs           INT UNSIGNED DEFAULT 0,
    cancelled_jobs        INT UNSIGNED DEFAULT 0,

    -- URL counts
    total_urls_discovered INT UNSIGNED DEFAULT 0,
    total_urls_crawled    INT UNSIGNED DEFAULT 0,
    total_urls_processed  INT UNSIGNED DEFAULT 0,
    total_urls_imported   INT UNSIGNED DEFAULT 0,
    total_urls_skipped    INT UNSIGNED DEFAULT 0,
    total_urls_error      INT UNSIGNED DEFAULT 0,

    -- Averages
    avg_job_duration_ms   FLOAT DEFAULT 0,
    avg_urls_per_job      FLOAT DEFAULT 0,
    avg_response_time_ms  FLOAT DEFAULT 0,

    -- Resource usage
    unique_domains        INT UNSIGNED DEFAULT 0,
    data_volume_bytes     BIGINT UNSIGNED DEFAULT 0,

    -- Search-mode counters
    search_queries_count  INT UNSIGNED DEFAULT 0,
    search_results_count  INT UNSIGNED DEFAULT 0,

    -- Row bookkeeping; updated_at is refreshed by the server on every UPDATE
    created_at            DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at            DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    UNIQUE KEY idx_date_hour (metric_date, metric_hour),
    KEY idx_date (metric_date)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 6. CRAWLER CHECKPOINTS TABLE (Resume Support)
-- ============================================

-- Snapshots of crawler state keyed by job and session, plus the progress
-- counters recorded with each snapshot.
CREATE TABLE IF NOT EXISTS crawler_checkpoints (
    id              INT UNSIGNED NOT NULL AUTO_INCREMENT,

    -- Owning job and session
    job_id          VARCHAR(50) NOT NULL,
    session_id      VARCHAR(64) NOT NULL,

    -- Saved state
    checkpoint_data JSON NOT NULL COMMENT 'Serialized crawler state',

    -- Progress when the snapshot was taken
    urls_processed  INT UNSIGNED DEFAULT 0,
    current_depth   INT UNSIGNED DEFAULT 0,
    queue_size      INT UNSIGNED DEFAULT 0,

    -- Creation time of the snapshot
    created_at      DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    KEY idx_job (job_id),
    KEY idx_session (session_id),
    KEY idx_created (created_at)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 7. ACTIVITY LOG TABLE (Event Tracking)
-- ============================================

-- Event log: one row per crawler event, optionally tied to a job and a URL.
CREATE TABLE IF NOT EXISTS crawler_activity_log (
    id               INT UNSIGNED NOT NULL AUTO_INCREMENT,

    -- Which job raised the event, and what kind of event it is
    job_id           VARCHAR(50) NULL,
    event_type       VARCHAR(50) NOT NULL COMMENT 'started, completed, error, paused, resumed, etc.',
    event_level      ENUM('info', 'success', 'warning', 'error') DEFAULT 'info',

    -- Human-readable text plus structured extras
    message          TEXT NOT NULL,
    context          JSON NULL COMMENT 'Additional context data',

    -- Request details, filled in when the event concerns a URL
    url              TEXT NULL,
    http_code        SMALLINT UNSIGNED NULL,
    response_time_ms INT UNSIGNED NULL,

    -- Time the event was recorded
    created_at       DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    KEY idx_job_id (job_id),
    KEY idx_event_type (event_type),
    KEY idx_level (event_level),
    KEY idx_created (created_at)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 8. ENSURE CRAWLER_SEEN_LINKS EXISTS
-- ============================================

-- Links already encountered by a crawl process, with the depth recorded for
-- each one.
-- NOTE(review): idx_process_link indexes only the first 255 characters of
-- `link`, so two different links of one process that share that prefix count
-- as duplicates for the unique key.
CREATE TABLE IF NOT EXISTS crawler_seen_links (
    id         INT UNSIGNED NOT NULL AUTO_INCREMENT,
    process_id VARCHAR(100) NOT NULL,
    link       TEXT NOT NULL,
    depth      INT UNSIGNED DEFAULT 0,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    -- At most one row per (process, link prefix)
    UNIQUE KEY idx_process_link (process_id, link(255)),
    KEY idx_process (process_id)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;


-- ============================================
-- 9. STORED PROCEDURES FOR COMMON OPERATIONS
-- ============================================

-- Record one visit to a domain in crawler_domain_stats. The statement is an
-- upsert on the unique `domain` column: the first visit inserts the row,
-- later visits update it.
--
-- Parameters:
--   p_domain            domain whose row is inserted or updated
--   p_success           TRUE counts the visit as successful, FALSE as failed
--   p_response_time     response time of this visit; folded into avg/min/max
--   p_http_code         stored as last_http_code
--   p_bytes_downloaded  added to total_bytes_downloaded
--   p_error             stored as last_error on failure; on success last_error
--                       is reset to NULL
--
-- timeout_count and rate_limit_hits are not maintained by this procedure.
-- NOTE(review): a NULL p_response_time would turn avg/min/max into NULL —
-- this assumes callers always pass a value; confirm.
-- Because of IF NOT EXISTS, a procedure already installed under this name is
-- left as it is; drop it first when this body changes.
DELIMITER //
CREATE PROCEDURE IF NOT EXISTS update_domain_stats(
    IN p_domain VARCHAR(255),
    IN p_success BOOLEAN,
    IN p_response_time FLOAT,
    IN p_http_code SMALLINT,
    IN p_bytes_downloaded BIGINT,
    IN p_error VARCHAR(500)
)
BEGIN
    INSERT INTO crawler_domain_stats (
        domain,
        total_visits,
        successful_visits,
        failed_visits,
        avg_response_time,
        min_response_time,
        max_response_time,
        total_bytes_downloaded,
        last_visit,
        last_http_code,
        last_error
    ) VALUES (
        p_domain,
        1,
        IF(p_success, 1, 0),
        IF(p_success, 0, 1),
        p_response_time,
        p_response_time,
        p_response_time,
        p_bytes_downloaded,
        NOW(),
        p_http_code,
        IF(p_success, NULL, p_error)
    )
    ON DUPLICATE KEY UPDATE
        -- Assignments are applied left to right, so every expression below
        -- this line sees total_visits with the current visit already counted.
        total_visits = total_visits + 1,
        successful_visits = successful_visits + IF(p_success, 1, 0),
        failed_visits = failed_visits + IF(p_success, 0, 1),
        -- Running mean: previous mean weighted by the previous visit count.
        avg_response_time = (avg_response_time * (total_visits - 1) + p_response_time) / total_visits,
        -- A row that existed without any visit (for example a pre-seeded
        -- domain) still holds the column default 0, which LEAST() would keep
        -- forever. On the first recorded visit take the measurement as-is.
        min_response_time = IF(
            total_visits <= 1,
            p_response_time,
            LEAST(min_response_time, p_response_time)
        ),
        max_response_time = IF(
            total_visits <= 1,
            p_response_time,
            GREATEST(max_response_time, p_response_time)
        ),
        total_bytes_downloaded = total_bytes_downloaded + p_bytes_downloaded,
        last_visit = NOW(),
        last_http_code = p_http_code,
        last_error = IF(p_success, NULL, p_error),
        updated_at = NOW();
END //
DELIMITER ;


-- Add the given deltas to the crawler_metrics row for (p_date, p_hour),
-- creating the row when it does not exist yet.
--
-- Parameters:
--   p_date           metric_date of the bucket
--   p_hour           metric_hour of the bucket; NULL addresses the daily
--                    aggregate row of p_date
--   p_jobs           added to total_jobs
--   p_completed      added to completed_jobs
--   p_failed         added to failed_jobs
--   p_urls_crawled   added to total_urls_crawled
--   p_urls_imported  added to total_urls_imported
--   p_urls_error     added to total_urls_error
--   p_bytes          added to data_volume_bytes
--
-- The other crawler_metrics columns are not maintained by this procedure.
--
-- Why the NULL branch exists: the unique key (metric_date, metric_hour) never
-- reports a duplicate when metric_hour is NULL, so ON DUPLICATE KEY UPDATE
-- alone would insert a fresh daily row on every call instead of accumulating.
-- The daily row is therefore looked up and updated explicitly.
-- NOTE(review): two sessions creating the first daily row of a date at the
-- same moment can still both insert; the earliest row then receives all later
-- increments.
-- Because of IF NOT EXISTS, a procedure already installed under this name is
-- left as it is; drop it first when this body changes.
DELIMITER //
CREATE PROCEDURE IF NOT EXISTS increment_metrics(
    IN p_date DATE,
    IN p_hour TINYINT,
    IN p_jobs INT,
    IN p_completed INT,
    IN p_failed INT,
    IN p_urls_crawled INT,
    IN p_urls_imported INT,
    IN p_urls_error INT,
    IN p_bytes BIGINT
)
BEGIN
    -- id of the existing daily-aggregate row; stays NULL for hourly buckets
    -- and when no daily row exists yet.
    DECLARE v_daily_id INT UNSIGNED DEFAULT NULL;

    IF p_hour IS NULL THEN
        -- MIN() yields NULL when no row matches.
        SET v_daily_id = (
            SELECT MIN(id)
            FROM crawler_metrics
            WHERE metric_date = p_date
              AND metric_hour IS NULL
        );
    END IF;

    IF v_daily_id IS NOT NULL THEN
        UPDATE crawler_metrics
        SET
            total_jobs = total_jobs + p_jobs,
            completed_jobs = completed_jobs + p_completed,
            failed_jobs = failed_jobs + p_failed,
            total_urls_crawled = total_urls_crawled + p_urls_crawled,
            total_urls_imported = total_urls_imported + p_urls_imported,
            total_urls_error = total_urls_error + p_urls_error,
            data_volume_bytes = data_volume_bytes + p_bytes,
            updated_at = NOW()
        WHERE id = v_daily_id;
    ELSE
        -- Hourly bucket, or the first daily row of this date.
        INSERT INTO crawler_metrics (
            metric_date,
            metric_hour,
            total_jobs,
            completed_jobs,
            failed_jobs,
            total_urls_crawled,
            total_urls_imported,
            total_urls_error,
            data_volume_bytes
        ) VALUES (
            p_date,
            p_hour,
            p_jobs,
            p_completed,
            p_failed,
            p_urls_crawled,
            p_urls_imported,
            p_urls_error,
            p_bytes
        )
        ON DUPLICATE KEY UPDATE
            total_jobs = total_jobs + p_jobs,
            completed_jobs = completed_jobs + p_completed,
            failed_jobs = failed_jobs + p_failed,
            total_urls_crawled = total_urls_crawled + p_urls_crawled,
            total_urls_imported = total_urls_imported + p_urls_imported,
            total_urls_error = total_urls_error + p_urls_error,
            data_volume_bytes = data_volume_bytes + p_bytes,
            updated_at = NOW();
    END IF;
END //
DELIMITER ;


-- ============================================
-- 10. INITIAL DATA / DEFAULTS
-- ============================================

-- Seed well-known social media and video domains. Every row is flagged
-- is_social_media; the first seven are also blocked, while reddit.com and
-- youtube.com are left unblocked. `domain` is unique, so with IGNORE a domain
-- that is already present is skipped rather than raising an error.
INSERT IGNORE INTO crawler_domain_stats
    (domain, is_blocked, block_reason, is_social_media)
VALUES
    ('facebook.com',  TRUE,  'Social media - requires authentication', TRUE),
    ('twitter.com',   TRUE,  'Social media - requires authentication', TRUE),
    ('x.com',         TRUE,  'Social media - requires authentication', TRUE),
    ('instagram.com', TRUE,  'Social media - requires authentication', TRUE),
    ('linkedin.com',  TRUE,  'Social media - requires authentication', TRUE),
    ('tiktok.com',    TRUE,  'Social media - requires authentication', TRUE),
    ('pinterest.com', TRUE,  'Social media - requires authentication', TRUE),
    ('reddit.com',    FALSE, 'Social media - public access',           TRUE),
    ('youtube.com',   FALSE, 'Video platform - public access',         TRUE);


-- ============================================
-- MIGRATION COMPLETE
-- ============================================
-- Version: 7.0.0
-- Date: 2026-01-21
-- Description: Unified crawler schema with full feature support
-- ============================================
