add search engine - ringle

This commit is contained in:
Face
2025-08-27 20:23:05 +03:00
parent 1cf81bbfee
commit 347b40ed71
47 changed files with 7214 additions and 493 deletions

4407
search-engine/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

31
search-engine/Cargo.toml Normal file
View File

@@ -0,0 +1,31 @@
[package]
name = "gurted-search-engine"
version = "0.1.0"
edition = "2021"
[dependencies]
tokio = { version = "1.38.0", features = ["full"] }
futures = "0.3.30"
tantivy = "0.22"
sha2 = "0.10"
gurt = { path = "../protocol/library" }
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "chrono", "uuid"] }
scraper = "0.20"
lol_html = "1.2"
url = "2.5"
toml = "0.8.13"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
anyhow = "1.0.86"
thiserror = "1.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
uuid = { version = "1.0", features = ["v4"] }
regex = "1.10.4"
mime = "0.3"
base64 = "0.22"
glob = "0.3"
clap = { version = "4.5.4", features = ["derive"] }
urlencoding = "2.1"
reqwest = "0.11"

5
search-engine/README.md Normal file
View File

@@ -0,0 +1,5 @@
The official Gurted search engine, Ringle.
Copy `config.template.toml` to `config.toml` and edit it as needed.
Run it with `cargo run`.

View File

@@ -0,0 +1,51 @@
[database]
url = "postgres://..."
max_connections = 5
[server]
address = "127.0.0.1"
port = 4879
cert_path = "certs/t.crt"
key_path = "certs/t.key"
[search]
index_path = "./search_indexes"
crawl_interval_hours = 2
max_pages_per_domain = 1000
crawler_timeout_seconds = 30
crawler_user_agent = "GurtedSearchBot/1.0"
max_concurrent_crawls = 5
content_size_limit_mb = 10
index_rebuild_interval_hours = 48
search_results_per_page = 20
max_search_results = 1000
allowed_extensions = [
"html", "htm", "txt", "md", "json", "xml", "rss", "atom"
]
blocked_extensions = [
"exe", "zip", "rar", "tar", "gz", "7z", "iso", "dmg",
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
"jpg", "jpeg", "png", "gif", "bmp", "svg", "webp",
"mp3", "mp4", "avi", "mov", "wmv", "flv", "webm",
"css", "js", "woff", "woff2", "ttf", "eot"
]
[crawler]
clanker_txt = true
crawl_delay_ms = 1000
max_redirects = 5
follow_external_links = false
max_depth = 10
request_headers = [
["Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"],
["Accept-Language", "en-US,en;q=0.5"],
["Accept-Encoding", "gzip, deflate"],
["DNT", "1"],
]
[logging]
level = "info"
format = "compact"

View File

@@ -0,0 +1,96 @@
<head>
<title>Gurted Search Engine</title>
<meta name="description" content="Search across all registered GURT domains">
<meta name="theme-color" content="#000000">
<icon src="https://cdn-icons-png.flaticon.com/512/295/295128.png">
<style>
body {
font-sans bg-[#000000] text-[#ffffff] min-h-screen flex flex-col items-center p-8
}
.search-container {
max-w-2xl w-full flex flex-col gap-8
}
.logo {
text-center text-5xl font-bold text-[#dc2626] mb-8
}
.search-box {
flex flex-col gap-4
}
.search-input {
w-[600px] p-3 text-base bg-[#1a1a1a] border-2 border-[#333333] rounded-lg text-[#ffffff] outline-none active:border-[#dc2626]
}
.search-btn {
bg-[#1a1a1a] text-[#ffffff] px-4 py-2 rounded text-sm border border-[#333333] cursor-pointer hover:border-[#dc2626] hover:shadow-sm
}
.results {
w-full flex flex-col gap-4 mt-8
}
.result-item {
p-4 cursor-pointer hover:bg-[#0a0a0a] rounded
}
.result-header {
inline-flex gap-2 mb-2
}
.result-icon {
w-6 h-6 min-w-6 min-h-6 rounded border border-[#333333] bg-[#1a1a1a] object-cover mt-1
}
.result-content {
flex-1 min-w-0
}
.result-title {
text-xl font-normal text-[#dc2626] mb-1 hover:underline
}
.result-url {
text-[#4ade80] text-sm mb-2
}
.result-preview {
text-[#cccccc] text-sm leading-relaxed
}
.stats {
text-left text-[#999999] text-sm mt-4 ml-1
}
.loading {
text-center text-[#999999] mt-8
}
</style>
<script src="search.lua" />
</head>
<body>
<div style="search-container">
<h1 style="logo">Ringle</h1>
<div style="w-full flex flex-col gap-4 items-center justify-center content-center">
<input type="text" id="searchQuery" style="search-input" placeholder="Search across the Gurted..." />
<div style="inline-flex gap-2 items-center justify-center">
<button id="searchButton" style="search-btn">Search</button>
<button id="luckyButton" style="search-btn">I'm feeling lucky</button>
</div>
</div>
<div id="loading" style="loading hidden">
Searching...
</div>
<div id="results" style="results"></div>
<div id="stats" style="stats"></div>
</div>
</body>

View File

@@ -0,0 +1,150 @@
-- Cache references to the page elements the search UI manipulates.
local searchBtn = gurt.select('#searchButton')
local luckyBtn = gurt.select('#luckyButton')
local searchQuery = gurt.select('#searchQuery')
local loading = gurt.select('#loading')   -- "Searching..." indicator
local results = gurt.select('#results')   -- container for result items
local stats = gurt.select('#stats')       -- "Found N results" line
-- Wipe any previous results/stats and reveal the loading indicator.
local function showLoading()
    results.text = ''
    stats.text = ''
    loading.classList:remove('hidden')
end
-- Render a search API response into the results pane.
-- `data.results` is an array of { url, title, icon, preview, description };
-- `data.total_results` (optional) is the server-reported hit count.
-- Shows a placeholder item and message when there are no hits.
local function displayResults(data)
    loading.classList:add('hidden')
    results.text = ''
    if not data.results or #data.results == 0 then
        local noResultsItem = gurt.create('div', {
            text = 'No results found for your query.',
            style = 'result-item'
        })
        results:append(noResultsItem)
        stats.text = 'No results found'
        return
    end
    for i, result in ipairs(data.results) do
        -- The whole row is clickable and navigates to the result URL.
        local resultItem = gurt.create('div', { style = 'result-item' })
        resultItem:on('click', function()
            gurt.location.goto(result.url)
        end)
        local headerDiv = gurt.create('div', { style = 'result-header' })
        -- Site icon is optional; only render it when the crawler found one.
        if result.icon and result.icon ~= '' then
            local iconImg = gurt.create('img', {
                src = result.icon,
                style = 'result-icon',
                alt = 'Site icon'
            })
            headerDiv:append(iconImg)
        end
        -- Fall back to the URL when a page has no title.
        local titleDiv = gurt.create('p', {
            text = result.title or result.url,
            style = 'result-title'
        })
        headerDiv:append(titleDiv)
        local urlDiv = gurt.create('p', {
            text = result.url,
            style = 'result-url'
        })
        -- Truncate long previews to 150 chars, ellipsis included.
        local previewText = result.preview or result.description or ''
        if #previewText > 150 then
            previewText = previewText:sub(1, 147) .. '...'
        end
        local previewDiv = gurt.create('p', {
            text = previewText,
            style = 'result-preview'
        })
        resultItem:append(headerDiv)
        resultItem:append(urlDiv)
        resultItem:append(previewDiv)
        results:append(resultItem)
    end
    local resultCount = #data.results
    -- Prefer the server-reported total when present.
    local totalResults = data.total_results or resultCount
    stats.text = 'Found ' .. totalResults .. ' result' .. (totalResults == 1 and '' or 's')
end
-- Query the search API for `query` and hand the decoded JSON to
-- displayResults. Blank or absent queries are ignored.
local function performSearch(query)
    if not query or query == '' then
        return
    end
    showLoading()
    local endpoint = '/api/search?q=' .. urlEncode(query) .. '&per_page=20'
    local response = fetch(endpoint, { method = 'GET' })
    if not response:ok() then
        -- Surface the HTTP failure in the stats line.
        loading.classList:add('hidden')
        results.text = ''
        stats.text = 'Search failed: ' .. response.status .. ' ' .. response.statusText
        return
    end
    displayResults(response:json())
end
-- Search for a random common term and jump straight to a random result.
-- Both failure paths (HTTP error, empty result set) report via the stats line.
local function performLuckySearch()
    local function showError(message)
        loading.classList:add('hidden')
        results.text = ''
        stats.text = message
    end
    showLoading()
    local luckyTerms = {'test', 'demo', 'api', 'web', 'site', 'page', 'home', 'index'}
    local randomTerm = luckyTerms[math.random(#luckyTerms)]
    local endpoint = '/api/search?q=' .. urlEncode(randomTerm) .. '&per_page=50'
    local response = fetch(endpoint, { method = 'GET' })
    if not response:ok() then
        showError('Lucky search failed')
        return
    end
    local data = response:json()
    if data.results and #data.results > 0 then
        local randomResult = data.results[math.random(#data.results)]
        gurt.location.goto(randomResult.url)
    else
        showError('No sites available for lucky search')
    end
end
-- Wire up the UI: search button, lucky button, Enter-to-search, and focus
-- the query input on load. Queries are trimmed before submission.
searchBtn:on('click', function()
    local query = searchQuery.value
    if query and query ~= '' then
        performSearch(query:trim())
    end
end)
luckyBtn:on('click', function()
    performLuckySearch()
end)
searchQuery:on('keydown', function(e)
    if e.key == 'Enter' then
        local query = searchQuery.value
        if query and query ~= '' then
            performSearch(query:trim())
        end
    end
end)
searchQuery:focus()

121
search-engine/src/config.rs Normal file
View File

@@ -0,0 +1,121 @@
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
/// Top-level application configuration, deserialized from `config.toml`
/// (see `config.template.toml` for the expected layout).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Config {
    pub database: DatabaseConfig,
    pub server: ServerConfig,
    pub search: SearchConfig,
    pub crawler: CrawlerConfig,
    pub logging: LoggingConfig,
}
/// Postgres connection settings (`[database]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DatabaseConfig {
    /// Full connection string, e.g. `postgres://user:pass@host/db`.
    pub url: String,
    pub max_connections: u32,
}
/// Listener settings for the GURT server (`[server]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServerConfig {
    pub address: String,
    pub port: u16,
    /// TLS certificate and key used by the GURT listener.
    pub cert_path: PathBuf,
    pub key_path: PathBuf,
}
/// Indexing and crawl-scheduling settings (`[search]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SearchConfig {
    /// Directory holding the tantivy index files.
    pub index_path: PathBuf,
    pub crawl_interval_hours: u64,
    /// Hard cap on pages visited per domain in a single crawl run.
    pub max_pages_per_domain: usize,
    pub crawler_timeout_seconds: u64,
    pub crawler_user_agent: String,
    pub max_concurrent_crawls: usize,
    /// Pages larger than this (in MB) are skipped, not indexed.
    pub content_size_limit_mb: usize,
    pub index_rebuild_interval_hours: u64,
    pub search_results_per_page: usize,
    pub max_search_results: usize,
    /// Extension allowlist; empty means "allow anything not blocked".
    pub allowed_extensions: Vec<String>,
    /// Extension blocklist; takes precedence over the allowlist.
    pub blocked_extensions: Vec<String>,
}
/// Crawler politeness and traversal settings (`[crawler]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CrawlerConfig {
    /// Whether to honor each domain's clanker.txt (robots.txt analogue).
    pub clanker_txt: bool,
    /// Delay inserted between successive requests to the same domain.
    pub crawl_delay_ms: u64,
    pub max_redirects: usize,
    pub follow_external_links: bool,
    /// Maximum link depth from the domain root.
    pub max_depth: usize,
    /// Extra headers sent with every crawl request, as (name, value) pairs.
    pub request_headers: Vec<(String, String)>,
}
/// Tracing/logging settings (`[logging]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct LoggingConfig {
    /// Log level filter, e.g. "info".
    pub level: String,
    /// Output format name, e.g. "compact".
    pub format: String,
}
impl Config {
    /// Read and deserialize a TOML config file; errors name the offending path.
    pub fn load_from_file(path: &str) -> anyhow::Result<Self> {
        let content = std::fs::read_to_string(path)
            .map_err(|e| anyhow::anyhow!("Failed to read config file {}: {}", path, e))?;
        let config: Config = toml::from_str(&content)
            .map_err(|e| anyhow::anyhow!("Failed to parse config file {}: {}", path, e))?;
        Ok(config)
    }

    /// Postgres connection string.
    pub fn database_url(&self) -> &str {
        &self.database.url
    }

    /// `host:port` string for binding the server socket.
    pub fn server_bind_address(&self) -> String {
        format!("{}:{}", self.server.address, self.server.port)
    }

    /// Public `gurt://` URL of this server.
    pub fn gurt_protocol_url(&self) -> String {
        format!("gurt://{}:{}", self.server.address, self.server.port)
    }

    /// True when `extension` may be crawled: never when blocked, always when
    /// the allowlist is empty, otherwise only when allowlisted. Comparisons
    /// are ASCII case-insensitive.
    pub fn is_allowed_extension(&self, extension: &str) -> bool {
        if self.is_blocked_extension(extension) {
            return false;
        }
        if self.search.allowed_extensions.is_empty() {
            return true;
        }
        self.search.allowed_extensions.iter()
            .any(|ext| ext.eq_ignore_ascii_case(extension))
    }

    /// True when `extension` appears in the blocklist (case-insensitive).
    pub fn is_blocked_extension(&self, extension: &str) -> bool {
        self.search.blocked_extensions.iter()
            .any(|ext| ext.eq_ignore_ascii_case(extension))
    }

    /// Configured page-size limit converted from MB to bytes, saturating
    /// rather than overflowing on absurd configured values.
    pub fn content_size_limit_bytes(&self) -> usize {
        self.search
            .content_size_limit_mb
            .saturating_mul(1024)
            .saturating_mul(1024)
    }

    /// Per-request crawler timeout.
    pub fn crawler_timeout(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.crawler_timeout_seconds)
    }

    /// Delay inserted between successive crawl requests.
    pub fn crawl_delay(&self) -> std::time::Duration {
        std::time::Duration::from_millis(self.crawler.crawl_delay_ms)
    }

    /// How often a domain becomes due for a re-crawl. Saturating multiply
    /// keeps this consistent with `content_size_limit_bytes` and avoids a
    /// debug-mode overflow panic on extreme configured hour values.
    pub fn crawl_interval(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.crawl_interval_hours.saturating_mul(3600))
    }

    /// How often the search index is rebuilt from scratch.
    pub fn index_rebuild_interval(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.index_rebuild_interval_hours.saturating_mul(3600))
    }
}

View File

@@ -0,0 +1,706 @@
use anyhow::{Result, Context};
use chrono::Utc;
use gurt::{GurtClient, GurtClientConfig};
use scraper::{Html, Selector};
use std::collections::{HashSet, VecDeque};
use std::sync::Arc;
use tracing::{info, debug, warn, error};
use url::Url;
use crate::config::Config;
use crate::models::{Domain, DomainRepository, CrawledPage};
/// A crawled page paired with its raw HTML, so link extraction can run on
/// the original markup after the indexable text has been cleaned.
#[derive(Debug, Clone)]
struct CrawledPageWithHtml {
    crawled_page: CrawledPage,
    original_html: String,
}
use crate::indexer::SearchEngine;
/// Crawls registered GURT domains over the GURT protocol and feeds the
/// discovered pages into the search index.
#[derive(Clone)]
pub struct DomainCrawler {
    config: Config,
    gurt_client: GurtClient,
    domain_repo: DomainRepository,
    search_engine: Arc<SearchEngine>,
}
impl DomainCrawler {
    /// Build a crawler: fetch the Gurted root CA via the HTTP bootstrap
    /// endpoint, then configure a GURT client with the timeout, user agent,
    /// and redirect limit from `config`.
    pub async fn new(config: Config, domain_repo: DomainRepository, search_engine: Arc<SearchEngine>) -> Result<Self> {
        // Fetch the Gurted CA certificate from the DNS server
        let ca_cert = Self::fetch_ca_certificate().await
            .context("Failed to fetch Gurted CA certificate")?;
        let gurt_config = GurtClientConfig {
            request_timeout: config.crawler_timeout(),
            user_agent: config.search.crawler_user_agent.clone(),
            max_redirects: config.crawler.max_redirects,
            custom_ca_certificates: vec![ca_cert],
            ..Default::default()
        };
        let gurt_client = GurtClient::with_config(gurt_config);
        Ok(Self {
            config,
            gurt_client,
            domain_repo,
            search_engine,
        })
    }

    /// Fetch the Gurted CA certificate (PEM text) from the DNS server's
    /// plain-HTTP bootstrap endpoint.
    async fn fetch_ca_certificate() -> Result<String> {
        // Use GurtClient's DNS server configuration to build the HTTP URL
        let dns_ip = GurtClientConfig::default().dns_server_ip;
        // The HTTP bootstrap server runs on port 8876 (hardcoded in DNS server)
        let http_url = format!("http://{}:8876/ca/root", dns_ip);
        let response = reqwest::get(&http_url).await
            .context("Failed to fetch CA certificate from HTTP bootstrap server")?;
        if !response.status().is_success() {
            return Err(anyhow::anyhow!("Failed to fetch CA certificate: HTTP {}", response.status()));
        }
        let ca_cert = response.text().await
            .context("Failed to read CA certificate response")?;
        if ca_cert.trim().is_empty() {
            return Err(anyhow::anyhow!("Received empty CA certificate"));
        }
        Ok(ca_cert)
    }

    /// Crawl a single domain end-to-end, recording status transitions
    /// (`crawling` -> `completed`/`failed`) in the domain repository and
    /// returning per-run statistics.
    pub async fn crawl_domain(&self, domain: &Domain) -> Result<CrawlStats> {
        info!("Starting crawl for domain: {}", domain.full_domain());
        let start_time = std::time::Instant::now();
        let mut stats = CrawlStats::new();
        self.domain_repo
            .update_crawl_status(domain.id, "crawling", None, None, None)
            .await
            .context("Failed to update crawl status to crawling")?;
        let result = self.crawl_domain_internal(domain, &mut stats).await;
        let duration = start_time.elapsed();
        stats.duration_seconds = duration.as_secs();
        match result {
            Ok(()) => {
                info!(
                    "Successfully crawled domain {} - {} pages found, {} indexed in {:.2}s",
                    domain.full_domain(),
                    stats.pages_found,
                    stats.pages_indexed,
                    duration.as_secs_f64()
                );
                self.domain_repo
                    .update_crawl_status(
                        domain.id,
                        "completed",
                        None,
                        Some(stats.pages_found as i32),
                        Some(self.config.search.crawl_interval_hours as i64),
                    )
                    .await
                    .context("Failed to update crawl status to completed")?;
            }
            Err(e) => {
                error!(
                    "Failed to crawl domain {}: {}",
                    domain.full_domain(),
                    e
                );
                // Failed crawls are retried on a fixed 24h backoff.
                self.domain_repo
                    .update_crawl_status(
                        domain.id,
                        "failed",
                        Some(&e.to_string()),
                        Some(stats.pages_found as i32),
                        Some(24),
                    )
                    .await
                    .context("Failed to update crawl status to failed")?;
                return Err(e);
            }
        }
        Ok(stats)
    }

    /// Fetch and parse `<base_url>/clanker.txt`. Returns the explicitly
    /// allowed URLs (may be empty); errors if crawling is disallowed.
    /// A missing or unfetchable clanker.txt means crawling is permitted.
    async fn check_clanker_txt(&self, base_url: &str) -> Result<Vec<String>> {
        let clanker_url = format!("{}/clanker.txt", base_url);
        debug!("Checking clanker.txt at: {}", clanker_url);
        match self.gurt_client.get(&clanker_url).await {
            Ok(response) => {
                if response.status_code == 200 {
                    let content = String::from_utf8_lossy(&response.body);
                    let urls = self.parse_clanker_txt(&content, base_url)?;
                    debug!("Found {} allowed URLs in clanker.txt", urls.len());
                    return Ok(urls);
                }
                // If clanker.txt doesn't exist (404), crawling is allowed
                Ok(vec![])
            }
            Err(_) => {
                // If we can't fetch clanker.txt, assume crawling is allowed
                Ok(vec![])
            }
        }
    }

    /// Case-insensitively strip `directive` (e.g. "allow:") from the start of
    /// `line`, returning the remainder *from the original line* so the value
    /// keeps its case. Returns None when the line doesn't start with the
    /// directive; uses checked slicing so multi-byte input can't panic.
    fn strip_directive<'a>(line: &'a str, directive: &str) -> Option<&'a str> {
        let head = line.get(..directive.len())?;
        if head.eq_ignore_ascii_case(directive) {
            line.get(directive.len()..)
        } else {
            None
        }
    }

    /// Parse clanker.txt (robots.txt analogue). Directive names are matched
    /// case-insensitively, but path values are sliced from the original line:
    /// the previous implementation read them from `line.to_lowercase()`,
    /// which lowercased Allow paths (URL paths are case-sensitive) and could
    /// mis-slice non-ASCII lines whose lowercased form differs in length.
    /// `Disallow: /` for a matching user-agent aborts the whole crawl.
    fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
        let user_agent = &self.config.search.crawler_user_agent;
        let mut disallow_all = false;
        let mut user_agent_matches = false;
        let mut allowed_urls = Vec::new();
        for line in content.lines() {
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }
            if let Some(value) = Self::strip_directive(line, "user-agent:") {
                let current_user_agent = value.trim();
                user_agent_matches = current_user_agent == "*" || current_user_agent.eq_ignore_ascii_case(user_agent);
                continue;
            }
            if user_agent_matches {
                if let Some(value) = Self::strip_directive(line, "disallow:") {
                    if value.trim() == "/" {
                        disallow_all = true;
                        break;
                    }
                } else if let Some(value) = Self::strip_directive(line, "allow:") {
                    let path = value.trim();
                    if !path.is_empty() {
                        let full_url = format!("{}{}", base_url, path);
                        debug!("Added allowed URL from clanker.txt: {}", full_url);
                        allowed_urls.push(full_url);
                    }
                }
            }
        }
        if disallow_all {
            Err(anyhow::anyhow!("Crawling disallowed by clanker.txt"))
        } else {
            Ok(allowed_urls)
        }
    }

    /// Breadth-first crawl of one domain: seeds the queue with the root URL
    /// and any clanker.txt Allow URLs, respects depth/page limits and the
    /// configured crawl delay, and indexes pages in batches of 50.
    async fn crawl_domain_internal(&self, domain: &Domain, stats: &mut CrawlStats) -> Result<()> {
        let base_url = domain.gurt_url();
        let mut visited_urls = HashSet::new();
        let mut queue = VecDeque::new();
        let mut pages_to_index = Vec::new();
        // Check clanker.txt if enabled and get allowed URLs
        let mut clanker_urls = Vec::new();
        if self.config.crawler.clanker_txt {
            match self.check_clanker_txt(&base_url).await {
                Ok(urls) => {
                    clanker_urls = urls;
                    info!("Found {} URLs in clanker.txt for {}", clanker_urls.len(), domain.full_domain());
                },
                Err(e) => {
                    warn!("Clanker.txt check failed for {}: {}", domain.full_domain(), e);
                    return Err(anyhow::anyhow!("Crawling disabled by clanker.txt: {}", e));
                }
            }
        }
        // Start with the root URL
        queue.push_back(CrawlItem {
            url: base_url.clone(),
            depth: 0,
        });
        // Add all URLs from clanker.txt to the queue
        for url in clanker_urls {
            if !visited_urls.contains(&url) {
                queue.push_back(CrawlItem {
                    url: url.clone(),
                    depth: 0, // Treat clanker.txt URLs as root level
                });
                debug!("Added clanker.txt URL to queue: {}", url);
            }
        }
        while let Some(item) = queue.pop_front() {
            if visited_urls.contains(&item.url) {
                continue;
            }
            if item.depth > self.config.crawler.max_depth {
                debug!("Skipping URL due to depth limit: {}", item.url);
                continue;
            }
            if stats.pages_found >= self.config.search.max_pages_per_domain {
                info!("Reached page limit for domain: {}", domain.full_domain());
                break;
            }
            visited_urls.insert(item.url.clone());
            stats.pages_found += 1;
            // Add crawl delay between requests
            if stats.pages_found > 1 {
                tokio::time::sleep(self.config.crawl_delay()).await;
            }
            match self.crawl_page(&item.url, domain).await {
                Ok(Some(page_with_html)) => {
                    // Extract links if not at max depth
                    if item.depth < self.config.crawler.max_depth {
                        if let Ok(links) = self.extract_links(&page_with_html.original_html, &base_url).await {
                            debug!("Found {} links on {}", links.len(), item.url);
                            for link in links {
                                if self.should_crawl_url(&link, domain) {
                                    debug!("Adding link to crawl queue: {}", link);
                                    queue.push_back(CrawlItem {
                                        url: link,
                                        depth: item.depth + 1,
                                    });
                                }
                            }
                        }
                    }
                    pages_to_index.push(page_with_html.crawled_page);
                    stats.pages_indexed += 1;
                    // Index in batches
                    if pages_to_index.len() >= 50 {
                        let batch = std::mem::take(&mut pages_to_index);
                        self.search_engine.index_pages(batch).await?;
                    }
                }
                Ok(None) => {
                    debug!("Skipped page: {}", item.url);
                    stats.pages_skipped += 1;
                }
                Err(e) => {
                    warn!("Failed to crawl page {}: {}", item.url, e);
                    stats.errors += 1;
                }
            }
        }
        // Index remaining pages
        if !pages_to_index.is_empty() {
            self.search_engine.index_pages(pages_to_index).await?;
        }
        Ok(())
    }

    /// Fetch one page. Returns Ok(None) when the page is deliberately
    /// skipped (unsupported content type or over the size limit) and Err on
    /// network/HTTP failures; otherwise returns the cleaned page plus raw HTML.
    async fn crawl_page(&self, url: &str, domain: &Domain) -> Result<Option<CrawledPageWithHtml>> {
        debug!("Crawling page: {}", url);
        let response = match self.gurt_client.get(url).await {
            Ok(response) => response,
            Err(e) => {
                return Err(anyhow::anyhow!("Failed to fetch URL: {} - {}", url, e));
            }
        };
        let status_code = response.status_code;
        let content_type = response
            .headers
            .get("content-type")
            .map(|s| s.to_string());
        // Check if we should process this content type
        if let Some(ref ct) = content_type {
            if !self.is_allowed_content_type(ct) {
                debug!("Skipping URL with unsupported content type: {} ({})", url, ct);
                return Ok(None);
            }
        }
        if status_code != 200 {
            return Err(anyhow::anyhow!(
                "HTTP error {}: {}",
                status_code,
                response.status_message
            ));
        }
        let content_bytes = response.body;
        // Check content size limit
        if content_bytes.len() > self.config.content_size_limit_bytes() {
            warn!("Skipping URL due to size limit: {} ({} bytes)", url, content_bytes.len());
            return Ok(None);
        }
        // Convert bytes to string (lossy: invalid UTF-8 becomes U+FFFD)
        let content = String::from_utf8_lossy(&content_bytes);
        // Extract metadata from HTML
        let title = self.extract_title(&content);
        let icon = self.extract_icon(&content, url);
        let description = self.extract_meta_description(&content);
        let cleaned_content = self.clean_content(&content);
        let page = CrawledPageWithHtml {
            crawled_page: CrawledPage {
                url: url.to_string(),
                domain: domain.full_domain(),
                title,
                content: cleaned_content.clone(),
                content_hash: Self::calculate_content_hash(&cleaned_content),
                indexed_at: Utc::now(),
                icon,
                description,
            },
            original_html: content.to_string(),
        };
        Ok(Some(page))
    }

    /// Collect same-host `gurt://` links from the page's anchor tags,
    /// resolving relative hrefs against `base_url`. Fragments, mailto/tel/
    /// javascript links are skipped; the result is sorted and deduplicated.
    async fn extract_links(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
        debug!("Extracting links from content length: {} chars", content.len());
        let document = Html::parse_document(content);
        let link_selector = Selector::parse("a[href]").unwrap();
        let base = Url::parse(base_url)?;
        let mut links = Vec::new();
        let all_links = document.select(&link_selector).collect::<Vec<_>>();
        debug!("Found {} anchor tags in HTML", all_links.len());
        for element in all_links {
            if let Some(href) = element.value().attr("href") {
                // Skip empty links and fragments
                if href.is_empty() || href.starts_with('#') {
                    continue;
                }
                // Skip mailto, tel, javascript links
                if href.starts_with("mailto:") || href.starts_with("tel:") || href.starts_with("javascript:") {
                    continue;
                }
                // Resolve relative URLs
                match base.join(href) {
                    Ok(absolute_url) => {
                        let url_str = absolute_url.to_string();
                        // Only include GURT protocol URLs for the same domain
                        if url_str.starts_with("gurt://") {
                            if let Ok(parsed) = Url::parse(&url_str) {
                                if let Some(host) = parsed.host_str() {
                                    if let Ok(base_parsed) = Url::parse(base_url) {
                                        if let Some(base_host) = base_parsed.host_str() {
                                            if host == base_host {
                                                links.push(url_str);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Err(e) => {
                        debug!("Failed to resolve URL {}: {}", href, e);
                    }
                }
            }
        }
        // Remove duplicates
        links.sort();
        links.dedup();
        Ok(links)
    }

    /// Page title: the `<title>` tag text, falling back to the first `<h1>`.
    fn extract_title(&self, content: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Try <title> tag first
        if let Ok(title_selector) = Selector::parse("title") {
            if let Some(title_element) = document.select(&title_selector).next() {
                let title_text = title_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
                if !title_text.is_empty() {
                    return Some(title_text);
                }
            }
        }
        // Fallback to first <h1>
        if let Ok(h1_selector) = Selector::parse("h1") {
            if let Some(h1_element) = document.select(&h1_selector).next() {
                let h1_text = h1_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
                if !h1_text.is_empty() {
                    return Some(h1_text);
                }
            }
        }
        None
    }

    /// Site icon URL: GURT's custom `<icon src=...>` tag first, then standard
    /// `<link rel="icon">`; relative hrefs are resolved against `base_url`.
    fn extract_icon(&self, content: &str, base_url: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Try to find icon tag first (custom GURT standard)
        if let Ok(icon_selector) = Selector::parse("icon") {
            if let Some(icon_element) = document.select(&icon_selector).next() {
                if let Some(src) = icon_element.value().attr("src") {
                    return Some(src.to_string());
                }
            }
        }
        // Fallback to standard link rel="icon" or link rel="shortcut icon"
        if let Ok(link_selector) = Selector::parse("link[rel~=\"icon\"], link[rel=\"shortcut icon\"]") {
            if let Some(link_element) = document.select(&link_selector).next() {
                if let Some(href) = link_element.value().attr("href") {
                    // Convert relative URLs to absolute
                    if href.starts_with("http") || href.starts_with("gurt") {
                        return Some(href.to_string());
                    } else if let Ok(base) = Url::parse(base_url) {
                        if let Ok(absolute_url) = base.join(href) {
                            return Some(absolute_url.to_string());
                        }
                    }
                }
            }
        }
        None
    }

    /// `<meta name="description">` content, if present and non-empty.
    fn extract_meta_description(&self, content: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Look for meta name="description"
        if let Ok(meta_selector) = Selector::parse("meta[name=\"description\"]") {
            if let Some(meta_element) = document.select(&meta_selector).next() {
                if let Some(content_attr) = meta_element.value().attr("content") {
                    let description = content_attr.trim();
                    if !description.is_empty() {
                        return Some(description.to_string());
                    }
                }
            }
        }
        None
    }

    /// Reduce HTML to whitespace-normalized indexable text: strip script/
    /// style/noscript, then extract the remaining text nodes.
    fn clean_content(&self, content: &str) -> String {
        use lol_html::{element, rewrite_str, RewriteStrSettings};
        // First pass: remove script, style, noscript elements
        let settings = RewriteStrSettings {
            element_content_handlers: vec![
                element!("script", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("style", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("noscript", |el| {
                    el.remove();
                    Ok(())
                }),
            ],
            ..RewriteStrSettings::default()
        };
        let cleaned_html = match rewrite_str(content, settings) {
            Ok(html) => html,
            Err(_) => content.to_string(),
        };
        // Second pass: extract text using scraper (already imported)
        let document = Html::parse_document(&cleaned_html);
        let text_content = document.root_element()
            .text()
            .collect::<Vec<_>>()
            .join(" ");
        // Clean up whitespace
        text_content
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// True when `url` is a same-host `gurt://` URL whose file extension (if
    /// any) passes the configured block/allow lists.
    fn should_crawl_url(&self, url: &str, domain: &Domain) -> bool {
        // Parse the URL
        let parsed_url = match Url::parse(url) {
            Ok(u) => u,
            Err(_) => return false,
        };
        // Must be GURT protocol
        if parsed_url.scheme() != "gurt" {
            return false;
        }
        // Must be same domain
        if let Some(host) = parsed_url.host_str() {
            if host != domain.full_domain() {
                return false;
            }
        } else {
            return false;
        }
        // Only apply extension filters when the last path segment actually
        // contains a dot (i.e. looks like a file name).
        if let Some(path) = parsed_url.path().split('/').last() {
            if let Some(extension) = path.split('.').last() {
                if path.contains('.') && extension != path {
                    if self.config.is_blocked_extension(extension) {
                        return false;
                    }
                    if !self.config.search.allowed_extensions.is_empty()
                        && !self.config.is_allowed_extension(extension) {
                        return false;
                    }
                }
            }
        }
        true
    }

    /// Content types worth indexing: HTML/XHTML, plain text, markdown, JSON.
    fn is_allowed_content_type(&self, content_type: &str) -> bool {
        let ct_lower = content_type.to_lowercase();
        if ct_lower.contains("text/html") || ct_lower.contains("application/xhtml") {
            return true;
        }
        if ct_lower.contains("text/plain") {
            return true;
        }
        if ct_lower.contains("text/markdown") || ct_lower.contains("application/json") {
            return true;
        }
        false
    }

    /// Lowercase hex SHA-256 of `content`, used for change detection.
    fn calculate_content_hash(content: &str) -> String {
        use sha2::{Sha256, Digest};
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }
}
/// Crawl every registered domain that is due, limiting concurrency with a
/// semaphore sized by `max_concurrent_crawls`, and log aggregate statistics.
/// Individual crawl failures are counted, not propagated.
pub async fn run_crawl_all(config: Config) -> Result<()> {
    info!("Starting crawl of all registered domains");
    let pool = sqlx::PgPool::connect(config.database_url()).await
        .context("Failed to connect to database")?;
    let domain_repo = DomainRepository::new(pool);
    let search_engine = Arc::new(SearchEngine::new(config.clone())?);
    let crawler = DomainCrawler::new(config.clone(), domain_repo.clone(), search_engine).await?;
    let domains = domain_repo.get_domains_for_crawling(None).await
        .context("Failed to fetch domains for crawling")?;
    if domains.is_empty() {
        info!("No domains found that need crawling");
        return Ok(());
    }
    info!("Found {} domains to crawl", domains.len());
    let mut total_stats = CrawlStats::new();
    let max_concurrent = config.search.max_concurrent_crawls;
    let semaphore = Arc::new(tokio::sync::Semaphore::new(max_concurrent));
    let mut tasks = Vec::new();
    for domain in domains {
        // DomainCrawler is Clone; a plain clone per task suffices (the
        // previous Arc wrapper added an allocation for nothing).
        let crawler = crawler.clone();
        // Acquire before spawning so at most `max_concurrent` crawls run.
        let permit = semaphore.clone().acquire_owned().await
            .context("Failed to acquire semaphore permit")?;
        let task = tokio::spawn(async move {
            let _permit = permit; // Keep permit alive for the task's duration
            crawler.crawl_domain(&domain).await
        });
        tasks.push(task);
    }
    for task in tasks {
        match task.await {
            Ok(Ok(stats)) => {
                total_stats.pages_found += stats.pages_found;
                total_stats.pages_indexed += stats.pages_indexed;
                total_stats.pages_skipped += stats.pages_skipped;
                total_stats.errors += stats.errors;
            }
            Ok(Err(e)) => {
                error!("Crawl task failed: {}", e);
                total_stats.errors += 1;
            }
            Err(e) => {
                error!("Task join error: {}", e);
                total_stats.errors += 1;
            }
        }
    }
    info!(
        "Crawl completed - {} pages found, {} indexed, {} skipped, {} errors",
        total_stats.pages_found,
        total_stats.pages_indexed,
        total_stats.pages_skipped,
        total_stats.errors
    );
    Ok(())
}
/// A queued URL together with its link distance from the crawl root.
#[derive(Debug, Clone)]
struct CrawlItem {
    url: String,
    depth: usize,
}
/// Counters accumulated over one crawl run (single domain or aggregate).
#[derive(Debug, Clone)]
pub struct CrawlStats {
    /// URLs dequeued and fetched (bounded by max_pages_per_domain).
    pub pages_found: usize,
    /// Pages queued for the search index.
    pub pages_indexed: usize,
    /// Pages deliberately skipped (content type / size limit).
    pub pages_skipped: usize,
    /// Fetch or crawl failures.
    pub errors: usize,
    pub duration_seconds: u64,
}
impl CrawlStats {
fn new() -> Self {
Self {
pages_found: 0,
pages_indexed: 0,
pages_skipped: 0,
errors: 0,
duration_seconds: 0,
}
}
}

View File

@@ -0,0 +1,405 @@
use anyhow::{Result, Context};
use chrono::{DateTime, Utc};
use std::collections::HashSet;
use std::path::Path;
use std::time::Instant;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{doc, Index, IndexReader, IndexWriter, ReloadPolicy, Term, TantivyDocument};
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::tokenizer::*;
use tantivy::schema::Value;
use tracing::{info, debug};
use crate::config::Config;
use crate::models::{SearchResult, SearchResponse, IndexStats, CrawledPage};
/// Tantivy-backed full-text index over crawled GURT pages.
pub struct SearchEngine {
    config: Config,
    index: Index,
    /// Long-lived reader; reloads on commit (with delay) per its ReloadPolicy.
    reader: IndexReader,
    schema: Schema,
}
impl SearchEngine {
pub fn new(config: Config) -> Result<Self> {
let index_path = &config.search.index_path;
std::fs::create_dir_all(index_path)
.with_context(|| format!("Failed to create index directory: {:?}", index_path))?;
let schema = build_schema();
let index = if index_path.join("meta.json").exists() {
info!("Loading existing search index from {:?}", index_path);
Index::open_in_dir(index_path)
.with_context(|| format!("Failed to open existing index at {:?}", index_path))?
} else {
info!("Creating new search index at {:?}", index_path);
Index::create_in_dir(index_path, schema.clone())
.with_context(|| format!("Failed to create new index at {:?}", index_path))?
};
// Configure tokenizers
let tokenizer_manager = index.tokenizers();
tokenizer_manager.register(
"gurted_text",
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(StopWordFilter::new(Language::English).unwrap())
.build(),
);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()
.context("Failed to create index reader")?;
Ok(Self {
config,
index,
reader,
schema,
})
}
pub async fn index_pages(&self, pages: Vec<CrawledPage>) -> Result<usize> {
if pages.is_empty() {
return Ok(0);
}
let start_time = Instant::now();
let mut writer = self.get_writer()?;
let mut indexed_count = 0;
let mut duplicate_count = 0;
let url_field = self.schema.get_field("url").unwrap();
let title_field = self.schema.get_field("title").unwrap();
let content_field = self.schema.get_field("content").unwrap();
let preview_field = self.schema.get_field("preview").unwrap();
let domain_field = self.schema.get_field("domain").unwrap();
let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
let content_hash_field = self.schema.get_field("content_hash").unwrap();
let icon_field = self.schema.get_field("icon").unwrap();
let description_field = self.schema.get_field("description").unwrap();
info!("Indexing {} pages...", pages.len());
for page in pages {
// Check for duplicates (always enabled)
if let Ok(existing_hash) = self.get_document_hash(&page.url).await {
if existing_hash == page.content_hash {
duplicate_count += 1;
continue;
}
}
// Remove existing document for this URL
let url_term = Term::from_field_text(url_field, &page.url);
writer.delete_term(url_term);
let preview = page.generate_preview(500);
let title = page.title.unwrap_or_else(|| extract_title_from_content(&page.content));
// Add new document
writer.add_document(doc!(
url_field => page.url.clone(),
title_field => title,
content_field => page.content.clone(),
preview_field => preview,
domain_field => page.domain.clone(),
indexed_at_field => page.indexed_at.timestamp(),
content_hash_field => page.content_hash.clone(),
icon_field => page.icon.unwrap_or_default(),
description_field => page.description.unwrap_or_default()
))?;
indexed_count += 1;
// Commit in batches
if indexed_count % 100 == 0 {
writer.commit()
.context("Failed to commit batch of documents")?;
writer = self.get_writer()?; // Get new writer after commit
let elapsed = start_time.elapsed().as_secs_f64();
let rate = indexed_count as f64 / elapsed;
info!("Indexed {} pages ({:.1} pages/sec)", indexed_count, rate);
}
}
// Final commit
writer.commit().context("Failed to commit final batch")?;
let total_time = start_time.elapsed();
info!(
"Indexing completed: {} pages indexed, {} duplicates skipped in {:.2}s",
indexed_count,
duplicate_count,
total_time.as_secs_f64()
);
Ok(indexed_count)
}
/// Runs a full-text query over title + content and returns up to `limit`
/// scored hits, newest reader snapshot first.
pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
    let started = Instant::now();
    let searcher = self.reader.searcher();

    // Resolve stored-field handles once; the schema is fixed at index
    // creation, so these lookups cannot fail.
    let url_field = self.schema.get_field("url").unwrap();
    let title_field = self.schema.get_field("title").unwrap();
    let content_field = self.schema.get_field("content").unwrap();
    let preview_field = self.schema.get_field("preview").unwrap();
    let domain_field = self.schema.get_field("domain").unwrap();
    let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
    let icon_field = self.schema.get_field("icon").unwrap();
    let description_field = self.schema.get_field("description").unwrap();

    // User queries match against title and content only.
    let query_parser = QueryParser::for_index(&self.index, vec![title_field, content_field]);
    let parsed_query = query_parser
        .parse_query(query)
        .with_context(|| format!("Failed to parse query: {}", query))?;

    let top_docs = searcher
        .search(&parsed_query, &TopDocs::with_limit(limit))
        .context("Search query execution failed")?;

    let mut results = Vec::with_capacity(top_docs.len());
    for (score, doc_address) in top_docs {
        let doc: TantivyDocument = searcher.doc(doc_address)?;

        // Stored text value with a fallback when the field is absent.
        let text_or = |field, fallback: &str| {
            doc.get_first(field)
                .and_then(|v| v.as_str())
                .unwrap_or(fallback)
                .to_string()
        };
        // Optional stored text value; empty strings count as missing.
        let optional_text = |field| {
            doc.get_first(field)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(str::to_string)
        };

        // Missing/invalid timestamps fall back to 0 (epoch); an
        // out-of-range epoch falls back to "now".
        let timestamp = doc.get_first(indexed_at_field)
            .and_then(|v| v.as_i64())
            .unwrap_or(0);
        let indexed_at = DateTime::from_timestamp(timestamp, 0).unwrap_or_else(Utc::now);

        results.push(SearchResult {
            url: text_or(url_field, ""),
            title: text_or(title_field, "Untitled"),
            preview: text_or(preview_field, ""),
            domain: text_or(domain_field, ""),
            score,
            indexed_at,
            icon: optional_text(icon_field),
            description: optional_text(description_field),
        });
    }

    debug!(
        "Search completed: {} results for '{}' in {:.2}ms",
        results.len(),
        query,
        started.elapsed().as_millis()
    );
    Ok(results)
}
/// Paginated search wrapper: runs `search` with enough headroom to cover
/// all pages up to `page`, then slices out the requested window.
pub async fn search_with_response(&self, query: &str, page: usize, per_page: usize) -> Result<SearchResponse> {
    // Pages are 1-based; page 0 is treated like page 1.
    let offset = page.saturating_sub(1) * per_page;
    let capped = std::cmp::min(per_page, self.config.search.max_search_results);

    let hits = self.search(query, offset + capped).await?;
    let window: Vec<_> = hits.into_iter().skip(offset).take(per_page).collect();

    Ok(SearchResponse {
        query: query.to_string(),
        results: window,
        // NOTE: this is the total document count of the index, not the
        // number of documents matching `query`.
        total_results: self.get_total_document_count().await?,
        page,
        per_page,
    })
}
/// Collects aggregate statistics about the current index snapshot.
pub async fn get_stats(&self) -> Result<IndexStats> {
    let searcher = self.reader.searcher();
    let total_documents = searcher.num_docs() as usize;

    // Domain counting is not implemented yet, so this is always empty.
    // TODO: count distinct domains from the index when needed.
    let domains: HashSet<String> = HashSet::new();
    let total_domains = domains.len();

    // On-disk footprint and an approximate last-modified timestamp.
    let index_size_mb = calculate_directory_size(&self.config.search.index_path)?;
    let last_updated = get_index_last_modified(&self.config.search.index_path)?;

    Ok(IndexStats {
        total_documents,
        total_domains,
        index_size_mb,
        last_updated,
    })
}
/// Number of live documents in the current reader snapshot.
pub async fn get_total_document_count(&self) -> Result<usize> {
    Ok(self.reader.searcher().num_docs() as usize)
}
/// Looks up the stored content hash for the document indexed under `url`.
///
/// Returns an error when no document exists for the URL (or it carries no
/// hash) — callers use this to decide whether a page is a duplicate.
async fn get_document_hash(&self, url: &str) -> Result<String> {
    let searcher = self.reader.searcher();
    let url_field = self.schema.get_field("url").unwrap();
    let content_hash_field = self.schema.get_field("content_hash").unwrap();

    // Exact-match lookup: quote the URL so it parses as a phrase query.
    let parser = QueryParser::for_index(&self.index, vec![url_field]);
    let query = parser.parse_query(&format!("\"{}\"", url))?;
    let hits = searcher.search(&query, &TopDocs::with_limit(1))?;

    let Some((_, doc_address)) = hits.first() else {
        return Err(anyhow::anyhow!("Document not found: {}", url));
    };
    let doc: TantivyDocument = searcher.doc(*doc_address)?;
    doc.get_first(content_hash_field)
        .and_then(|v| v.as_str())
        .map(str::to_string)
        .ok_or_else(|| anyhow::anyhow!("Document not found: {}", url))
}
/// Creates a fresh index writer: 4 indexing threads over a 256 MB heap.
fn get_writer(&self) -> Result<IndexWriter> {
    let writer = self
        .index
        .writer_with_num_threads(4, 256 * 1024 * 1024)
        .context("Failed to create index writer")?;
    Ok(writer)
}
}
/// Deletes the on-disk index and recreates it empty.
///
/// A follow-up crawl is required to repopulate the new index.
pub async fn rebuild_index(config: Config) -> Result<()> {
    info!("Starting index rebuild...");

    // Drop the old index directory if one exists.
    let index_path = &config.search.index_path;
    if index_path.exists() {
        std::fs::remove_dir_all(index_path)
            .context("Failed to remove existing index")?;
    }

    // Constructing the engine recreates an empty index on disk.
    let _engine = SearchEngine::new(config)?;
    info!("Index rebuild completed - new empty index created");
    info!("Run a crawl to populate the index with content");
    Ok(())
}
/// Builds the tantivy schema for crawled pages.
///
/// NOTE: field registration order is part of the schema identity — do not
/// reorder these calls.
fn build_schema() -> Schema {
    let mut builder = Schema::builder();
    builder.add_text_field("url", STRING | STORED | FAST);
    builder.add_text_field("title", TEXT | STORED);
    // Full content is tokenized for search but not stored; the stored
    // "preview" field serves display instead.
    builder.add_text_field("content", TEXT);
    builder.add_text_field("preview", STRING | STORED);
    builder.add_text_field("domain", STRING | STORED | FAST);
    builder.add_i64_field("indexed_at", INDEXED | STORED | FAST);
    builder.add_text_field("content_hash", STRING | STORED);
    builder.add_text_field("icon", STRING | STORED);
    builder.add_text_field("description", STRING | STORED);
    builder.build()
}
/// Derives a title for a page: the HTML <title>, else the first <h1>,
/// else the first non-blank line of the raw content, else "Untitled".
fn extract_title_from_content(content: &str) -> String {
    let document = scraper::Html::parse_document(content);

    // Try <title>, then <h1>; skip elements whose text is all whitespace.
    for css in ["title", "h1"] {
        let selector = scraper::Selector::parse(css).unwrap();
        if let Some(element) = document.select(&selector).next() {
            let joined = element.text().collect::<Vec<_>>().join(" ");
            let trimmed = joined.trim();
            if !trimmed.is_empty() {
                return trimmed.to_string();
            }
        }
    }

    // Last resort: first non-blank line of the raw content.
    content
        .lines()
        .find(|line| !line.trim().is_empty())
        .unwrap_or("Untitled")
        .trim()
        .to_string()
}
/// Returns the total size of all files under `path` (recursively), in MB.
///
/// Returns 0.0 when `path` is not a directory. I/O errors while walking the
/// tree are propagated.
fn calculate_directory_size(path: &Path) -> Result<f64> {
    // Sum in raw bytes and convert to MB exactly once at the end. The
    // previous version converted each subdirectory's MB value back to bytes
    // with `as u64`, truncating up to ~1 MB per nesting level.
    fn dir_size_bytes(path: &Path) -> std::io::Result<u64> {
        let mut total = 0u64;
        if path.is_dir() {
            for entry in std::fs::read_dir(path)? {
                let entry = entry?;
                let metadata = entry.metadata()?;
                if metadata.is_file() {
                    total += metadata.len();
                } else if metadata.is_dir() {
                    total += dir_size_bytes(&entry.path())?;
                }
            }
        }
        Ok(total)
    }

    Ok(dir_size_bytes(path)? as f64 / 1024.0 / 1024.0) // Convert to MB
}
/// Approximates the index's last update time from the mtime of its
/// `meta.json`; falls back to "now" when the file does not exist yet.
fn get_index_last_modified(path: &Path) -> Result<DateTime<Utc>> {
    let meta_path = path.join("meta.json");
    if !meta_path.exists() {
        return Ok(Utc::now());
    }
    let modified = std::fs::metadata(meta_path)?.modified()?;
    Ok(DateTime::<Utc>::from(modified))
}

124
search-engine/src/main.rs Normal file
View File

@@ -0,0 +1,124 @@
mod config;
mod indexer;
mod crawler;
mod scheduler;
mod server;
mod models;
use anyhow::Result;
use clap::{Parser, Subcommand};
use config::Config;
use tracing::info;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
// Top-level command-line interface.
// NOTE: plain `//` comments are used deliberately — `///` doc comments on
// clap-derive items would be emitted as --help text and change CLI output.
#[derive(Parser)]
#[command(name = "gurted-search-engine")]
#[command(about = "Crawl and index registered GURT domains")]
#[command(version = "0.1.0")]
struct Cli {
    // The action to run (see `Commands`).
    #[command(subcommand)]
    command: Commands,
    // Path to the TOML configuration file.
    #[arg(long, default_value = "config.toml")]
    config: String,
    // Repeatable verbosity flag: -v => debug, -vv (or more) => trace.
    #[arg(short, long, action = clap::ArgAction::Count)]
    verbose: u8,
}
// Available subcommands.
// NOTE: `//` comments on purpose — `///` would become clap help text.
#[derive(Subcommand)]
enum Commands {
    // Run the GURT server (starts the background crawl scheduler too).
    Server,
    // One-shot crawl of every registered domain.
    Crawl,
    // Delete the index and recreate it empty.
    RebuildIndex,
    // Run a query against the local index and print the hits.
    Search {
        query: String,
        // Maximum number of hits to print.
        #[arg(short, long, default_value = "10")]
        limit: usize,
    },
    // Print index statistics.
    Stats,
}
/// Entry point: parse CLI, set up logging, load configuration, dispatch.
#[tokio::main]
async fn main() -> Result<()> {
    let cli = Cli::parse();
    init_logging(&cli)?;

    // Propagate configuration errors through the Result chain instead of
    // panicking with unwrap(); a missing/invalid config file now produces a
    // readable error instead of a panic backtrace.
    let config = Config::load_from_file(&cli.config)?;

    info!("Starting Gurted Search Engine v{}", env!("CARGO_PKG_VERSION"));
    info!("Configuration loaded from: {}", cli.config);

    match cli.command {
        Commands::Server => {
            info!("Starting search engine server on {}", config.server_bind_address());
            server::run_server(config).await?;
        }
        Commands::Crawl => {
            info!("Starting one-time crawl of all registered domains");
            crawler::run_crawl_all(config).await?;
        }
        Commands::RebuildIndex => {
            info!("Rebuilding search index from scratch");
            indexer::rebuild_index(config).await?;
        }
        Commands::Search { query, limit } => {
            info!("Testing search with query: '{}'", query);
            test_search(config, query, limit).await?;
        }
        Commands::Stats => {
            info!("Displaying search index statistics");
            show_stats(config).await?;
        }
    }
    Ok(())
}
/// Initializes the global tracing subscriber.
///
/// Verbosity maps to a default filter (`-v` => debug, `-vv+` => trace) that
/// RUST_LOG, when set, overrides entirely.
fn init_logging(cli: &Cli) -> Result<()> {
    let level = match cli.verbose {
        0 => "info",
        1 => "debug",
        _ => "trace",
    };
    let filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| format!("gurted_search_engine={}", level).into());
    tracing_subscriber::registry()
        .with(filter)
        .with(tracing_subscriber::fmt::layer())
        .init();
    Ok(())
}
/// CLI helper: runs one query against the local index and prints the hits.
async fn test_search(config: Config, query: String, limit: usize) -> Result<()> {
    let engine = indexer::SearchEngine::new(config)?;
    let hits = engine.search(&query, limit).await?;

    println!("Search results for '{}' (showing {} results):",
        query, hits.len());
    for (position, hit) in hits.iter().enumerate() {
        println!("{}. {} - {}", position + 1, hit.title, hit.url);
        println!("   {}", hit.preview);
        println!();
    }
    Ok(())
}
/// CLI helper: prints index-level statistics to stdout.
async fn show_stats(config: Config) -> Result<()> {
    let engine = indexer::SearchEngine::new(config)?;
    let stats = engine.get_stats().await?;

    println!("Search Index Statistics:");
    println!("  Total documents: {}", stats.total_documents);
    println!("  Total domains: {}", stats.total_domains);
    println!("  Index size: {} MB", stats.index_size_mb);
    println!("  Last updated: {}", stats.last_updated);
    Ok(())
}

185
search-engine/src/models.rs Normal file
View File

@@ -0,0 +1,185 @@
use serde::{Deserialize, Serialize};
use sqlx::{FromRow, types::chrono::{DateTime, Utc}};
/// A registered domain row from the `domains` table.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct Domain {
    pub id: i32,
    // Name part of the domain ("name" in "name.tld").
    pub name: String,
    // Top-level-domain part ("tld" in "name.tld").
    pub tld: String,
    pub user_id: Option<i32>,
    // Registration status; the crawler only selects 'approved' domains.
    pub status: Option<String>,
    pub created_at: Option<DateTime<Utc>>,
}
impl Domain {
    /// Fully qualified domain name, e.g. "name.tld".
    pub fn full_domain(&self) -> String {
        format!("{}.{}", self.name, self.tld)
    }

    /// GURT protocol URL for this domain, e.g. "gurt://name.tld".
    pub fn gurt_url(&self) -> String {
        // Reuse full_domain() so the two representations cannot drift apart.
        format!("gurt://{}", self.full_domain())
    }
}
/// A DNS record row belonging to a `Domain`.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct DnsRecord {
    pub id: i32,
    // FK to `Domain::id`.
    pub domain_id: i32,
    // Record type stored as free-form text (e.g. "A", "TXT") — not
    // constrained by this type; NOTE(review): verify values at the source.
    pub record_type: String,
    pub name: String,
    pub value: String,
    pub ttl: Option<i32>,
    pub priority: Option<i32>,
    pub created_at: Option<DateTime<Utc>>,
}
/// One search hit, serialized to JSON for API responses.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResult {
    pub url: String,
    pub title: String,
    // Short plain-text excerpt of the page content.
    pub preview: String,
    pub domain: String,
    // Relevance score assigned by the full-text engine.
    pub score: f32,
    pub indexed_at: DateTime<Utc>,
    // Optional page icon; None when the page provided none.
    pub icon: Option<String>,
    pub description: Option<String>,
}
/// Paginated search API response envelope.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResponse {
    pub query: String,
    pub results: Vec<SearchResult>,
    // NOTE(review): currently populated with the total document count of
    // the whole index, not the number of documents matching `query`.
    pub total_results: usize,
    // 1-based page number.
    pub page: usize,
    pub per_page: usize,
}
/// Aggregate statistics about the on-disk search index.
#[derive(Clone, Debug, Serialize)]
pub struct IndexStats {
    pub total_documents: usize,
    // NOTE(review): currently always 0 — domain counting is a TODO in the
    // indexer's get_stats.
    pub total_domains: usize,
    pub index_size_mb: f64,
    // Approximated from the index metadata file's mtime.
    pub last_updated: DateTime<Utc>,
}
/// A page fetched by the crawler, ready to be indexed.
#[derive(Clone, Debug)]
pub struct CrawledPage {
    pub url: String,
    pub domain: String,
    // Title extracted from the page, if any.
    pub title: Option<String>,
    // Page content used for full-text indexing and preview generation.
    pub content: String,
    // Hash of the content; the indexer compares it against the stored hash
    // to skip re-indexing unchanged pages.
    pub content_hash: String,
    pub indexed_at: DateTime<Utc>,
    pub icon: Option<String>,
    pub description: Option<String>,
}
impl CrawledPage {
    /// Returns a plain-text preview of the page content: the trimmed content
    /// itself when it fits in `max_len` characters, otherwise the first
    /// `max_len` characters cut back to the last space and suffixed "...".
    pub fn generate_preview(&self, max_len: usize) -> String {
        let text = self.content.trim();
        // Compare in characters, not bytes: `str::len()` counts UTF-8 bytes,
        // so multibyte text that fits within `max_len` characters could
        // previously be truncated (and suffixed "...") by mistake.
        if text.chars().count() <= max_len {
            text.to_string()
        } else {
            let mut preview = text.chars().take(max_len).collect::<String>();
            // Prefer not to cut a word in half; ' ' is ASCII so the byte
            // index from rfind is always a valid char boundary.
            if let Some(last_space) = preview.rfind(' ') {
                preview.truncate(last_space);
            }
            preview.push_str("...");
            preview
        }
    }
}
/// Data-access layer for domain and crawl-status queries.
#[derive(Clone)]
pub struct DomainRepository {
    // Postgres connection pool handle.
    pool: sqlx::PgPool,
}
impl DomainRepository {
    /// Base query selecting approved domains that are due for crawling:
    /// never crawled, or in a terminal/pending state whose next_crawl_at has
    /// passed — least-recently-crawled first. Shared by both branches of
    /// `get_domains_for_crawling` so the two cannot drift apart.
    const DUE_DOMAINS_SQL: &'static str =
        "SELECT d.id, d.name, d.tld, d.user_id, d.status, d.created_at
         FROM domains d
         LEFT JOIN domain_crawl_status dcs ON d.id = dcs.domain_id
         WHERE d.status = 'approved'
           AND (dcs.crawl_status IS NULL
                OR (dcs.crawl_status = 'completed' AND dcs.next_crawl_at <= NOW())
                OR (dcs.crawl_status = 'failed' AND dcs.next_crawl_at <= NOW())
                OR (dcs.crawl_status = 'pending' AND dcs.next_crawl_at <= NOW()))
         ORDER BY COALESCE(dcs.last_crawled_at, '1970-01-01'::timestamptz) ASC";

    pub fn new(pool: sqlx::PgPool) -> Self {
        Self { pool }
    }

    /// Fetches domains eligible for crawling, optionally capped at `limit`.
    pub async fn get_domains_for_crawling(&self, limit: Option<i32>) -> Result<Vec<Domain>, sqlx::Error> {
        match limit {
            Some(limit) => {
                let sql = format!("{} LIMIT $1", Self::DUE_DOMAINS_SQL);
                sqlx::query_as::<_, Domain>(&sql)
                    .bind(limit)
                    .fetch_all(&self.pool)
                    .await
            }
            None => {
                sqlx::query_as::<_, Domain>(Self::DUE_DOMAINS_SQL)
                    .fetch_all(&self.pool)
                    .await
            }
        }
    }

    /// Upserts the crawl-status row for `domain_id`.
    ///
    /// Semantics encoded in the SQL: `last_crawled_at` is only stamped for
    /// terminal states ('completed'/'failed'); `pages_found` is only
    /// overwritten on success so a failed run keeps the last known count;
    /// `next_crawl_at` keeps its existing value when `next_crawl_hours` is
    /// None.
    pub async fn update_crawl_status(
        &self,
        domain_id: i32,
        status: &str,
        error_message: Option<&str>,
        pages_found: Option<i32>,
        next_crawl_hours: Option<i64>
    ) -> Result<(), sqlx::Error> {
        let next_crawl_at = next_crawl_hours
            .map(|hours| chrono::Utc::now() + chrono::Duration::hours(hours));
        sqlx::query(
            "INSERT INTO domain_crawl_status (domain_id, crawl_status, error_message, pages_found, last_crawled_at, next_crawl_at, updated_at)
             VALUES ($1, $2, $3, $4,
                     CASE WHEN $2 IN ('completed','failed') THEN NOW() ELSE NULL END,
                     $5, NOW())
             ON CONFLICT (domain_id)
             DO UPDATE SET
                 crawl_status = $2,
                 error_message = $3,
                 pages_found = CASE WHEN $2 = 'completed'
                                    THEN COALESCE($4, domain_crawl_status.pages_found)
                                    ELSE domain_crawl_status.pages_found END,
                 last_crawled_at = CASE WHEN $2 IN ('completed','failed')
                                        THEN NOW()
                                        ELSE domain_crawl_status.last_crawled_at END,
                 next_crawl_at = COALESCE($5, domain_crawl_status.next_crawl_at),
                 updated_at = NOW()"
        )
        .bind(domain_id)
        .bind(status)
        .bind(error_message)
        .bind(pages_found)
        .bind(next_crawl_at)
        .execute(&self.pool)
        .await?;
        Ok(())
    }
}
/// Row of the `domain_crawl_status` table tracking per-domain crawl state.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct CrawlStatus {
    pub domain_id: i32,
    pub last_crawled_at: Option<DateTime<Utc>>,
    // Earliest time this domain becomes eligible for the next crawl.
    pub next_crawl_at: Option<DateTime<Utc>>,
    // One of 'pending' | 'completed' | 'failed' (see repository queries).
    pub crawl_status: Option<String>,
    pub error_message: Option<String>,
    pub pages_found: Option<i32>,
    pub updated_at: Option<DateTime<Utc>>,
}

View File

@@ -0,0 +1,152 @@
use anyhow::{Result, Context};
use tokio::time::{interval, Instant};
use tracing::{info, error};
use crate::config::Config;
use crate::crawler::run_crawl_all;
use crate::indexer::SearchEngine;
/// Drives periodic crawls and index rebuilds at the configured intervals.
pub struct CrawlScheduler {
    config: Config,
}
impl CrawlScheduler {
pub fn new(config: Config) -> Self {
Self { config }
}
pub async fn start(&self) -> Result<()> {
info!("Starting crawl scheduler");
info!(
"Crawl interval: {} hours ({} seconds)",
self.config.search.crawl_interval_hours,
self.config.crawl_interval().as_secs()
);
info!(
"Index rebuild interval: {} hours ({} seconds)",
self.config.search.index_rebuild_interval_hours,
self.config.index_rebuild_interval().as_secs()
);
let mut crawl_interval = interval(self.config.crawl_interval());
let mut index_rebuild_interval = interval(self.config.index_rebuild_interval());
crawl_interval.tick().await;
index_rebuild_interval.tick().await;
info!("Running initial crawl...");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Initial crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
loop {
tokio::select! {
_ = crawl_interval.tick() => {
info!("Running scheduled crawl");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Scheduled crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
}
_ = index_rebuild_interval.tick() => {
info!("Running scheduled index rebuild");
if let Err(e) = self.run_scheduled_index_rebuild().await {
error!("Scheduled index rebuild failed: {}", e);
}
}
_ = tokio::signal::ctrl_c() => {
info!("Received shutdown signal, stopping scheduler");
break;
}
}
}
info!("Crawl scheduler stopped");
Ok(())
}
async fn run_scheduled_crawl(&self) -> Result<()> {
let start_time = Instant::now();
run_crawl_all(self.config.clone()).await
.context("Scheduled crawl failed")?;
let duration = start_time.elapsed();
info!("Scheduled crawl completed in {:.2} seconds", duration.as_secs_f64());
Ok(())
}
async fn run_scheduled_index_rebuild(&self) -> Result<()> {
let start_time = Instant::now();
let search_engine = SearchEngine::new(self.config.clone())?;
let stats_before = search_engine.get_stats().await?;
info!(
"Starting index rebuild - current index has {} documents from {} domains ({:.2} MB)",
stats_before.total_documents,
stats_before.total_domains,
stats_before.index_size_mb
);
crate::indexer::rebuild_index(self.config.clone()).await
.context("Index rebuild failed")?;
info!("Repopulating rebuilt index with fresh crawl");
run_crawl_all(self.config.clone()).await
.context("Post-rebuild crawl failed")?;
let duration = start_time.elapsed();
let new_search_engine = SearchEngine::new(self.config.clone())?;
let stats_after = new_search_engine.get_stats().await?;
info!(
"Index rebuild completed in {:.2} seconds - new index has {} documents from {} domains ({:.2} MB)",
duration.as_secs_f64(),
stats_after.total_documents,
stats_after.total_domains,
stats_after.index_size_mb
);
Ok(())
}
}
/// Runs a `CrawlScheduler` on a detached tokio task.
pub struct BackgroundScheduler {
    scheduler: CrawlScheduler,
}
impl BackgroundScheduler {
pub fn new(config: Config) -> Self {
Self {
scheduler: CrawlScheduler::new(config),
}
}
pub fn start(self) -> tokio::task::JoinHandle<Result<()>> {
tokio::spawn(async move {
self.scheduler.start().await
})
}
}

195
search-engine/src/server.rs Normal file
View File

@@ -0,0 +1,195 @@
use anyhow::{Result, Context};
use gurt::prelude::*;
use gurt::GurtError;
use serde_json::json;
use std::sync::Arc;
use tracing::{info, error};
use crate::config::Config;
use crate::indexer::SearchEngine;
use crate::scheduler::BackgroundScheduler;
/// GURT protocol server exposing the search frontend and JSON API.
pub struct SearchServer {
    config: Config,
    // Shared with every request-handler closure.
    search_engine: Arc<SearchEngine>,
}
impl SearchServer {
    /// Builds the server: verifies database connectivity and constructs the
    /// shared search engine.
    pub async fn new(config: Config) -> Result<Self> {
        // Connect to database
        // NOTE(review): the pool created here is dropped immediately — this
        // only checks connectivity at startup; no pool is retained for later
        // queries. Confirm this is intentional.
        sqlx::PgPool::connect(&config.database_url()).await
            .context("Failed to connect to database")?;
        let search_engine = Arc::new(SearchEngine::new(config.clone())?);
        Ok(Self {
            config,
            search_engine,
        })
    }

    /// Starts the background crawl scheduler, registers all routes, and
    /// serves GURT requests over TLS until the listener stops.
    pub async fn run(self) -> Result<()> {
        info!("Starting GURT search server on {}", self.config.server_bind_address());
        let scheduler_config = self.config.clone();
        // Handle is intentionally detached: the scheduler runs for the
        // lifetime of the process.
        let _scheduler_handle = BackgroundScheduler::new(scheduler_config).start();
        info!("Background crawler scheduler started");
        let server = GurtServer::with_tls_certificates(
            &self.config.server.cert_path.to_string_lossy(),
            &self.config.server.key_path.to_string_lossy()
        )?;
        // Each route closure clones its own Arc/config so the handlers are
        // self-contained and independent of `self`'s lifetime.
        let search_engine = self.search_engine.clone();
        let config = self.config.clone();
        let server = server
            // Simple (non-paginated) search endpoint.
            .get("/search", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();
                    let path = ctx.path().to_string();
                    async move {
                        handle_search(path, search_engine, config).await
                    }
                }
            })
            // Paginated JSON API endpoint.
            .get("/api/search*", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();
                    let path = ctx.path().to_string();
                    async move {
                        handle_api_search(path, search_engine, config).await
                    }
                }
            })
            // Frontend assets are embedded into the binary at compile time.
            .get("/", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.html")))
                }
            })
            .get("/search.lua", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.lua")))
                }
            })
            .get("/health", |_ctx| async {
                Ok(GurtResponse::ok().with_json_body(&json!({"status": "healthy"}))?)
            })
            // Debug route echoing the received path.
            .get("/test*", |ctx| {
                let path = ctx.path().to_string();
                async move {
                    println!("Test request path: '{}'", path);
                    Ok(GurtResponse::ok().with_string_body(format!("Path received: {}", path)))
                }
            });
        info!("GURT search server listening on {}", self.config.gurt_protocol_url());
        server.listen(&self.config.server_bind_address()).await.map_err(|e| anyhow::anyhow!("GURT server error: {}", e))
    }
}
/// Convenience entry point: construct a `SearchServer` and run it.
pub async fn run_server(config: Config) -> Result<()> {
    SearchServer::new(config).await?.run().await
}
fn parse_query_param(path: &str, param: &str) -> String {
let param_with_eq = format!("{}=", param);
if let Some(start) = path.find(&format!("?{}", param_with_eq)) {
let start_pos = start + 1 + param_with_eq.len(); // Skip the '?' and 'param='
let query_part = &path[start_pos..];
let end_pos = query_part.find('&').unwrap_or(query_part.len());
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
} else if let Some(start) = path.find(&format!("&{}", param_with_eq)) {
let start_pos = start + 1 + param_with_eq.len(); // Skip the '&' and 'param='
let query_part = &path[start_pos..];
let end_pos = query_part.find('&').unwrap_or(query_part.len());
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
} else {
String::new()
}
}
/// Like `parse_query_param`, but parses the value as a `usize`; returns
/// None when the parameter is absent or not a valid number.
fn parse_query_param_usize(path: &str, param: &str) -> Option<usize> {
    let raw = parse_query_param(path, param);
    if raw.is_empty() {
        return None;
    }
    raw.parse().ok()
}
async fn handle_search(
path: String,
search_engine: Arc<SearchEngine>,
config: Config
) -> Result<GurtResponse, GurtError> {
let query = parse_query_param(&path, "q");
if query.is_empty() {
return Ok(GurtResponse::bad_request()
.with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
}
println!("Search query: '{}'", query);
let limit = parse_query_param_usize(&path, "limit")
.unwrap_or(config.search.search_results_per_page)
.min(config.search.max_search_results);
match search_engine.search(&query, limit).await {
Ok(results) => {
let response = json!({
"query": query,
"results": results,
"count": results.len()
});
Ok(GurtResponse::ok()
.with_header("content-type", "application/json")
.with_json_body(&response)?)
}
Err(e) => {
error!("Search failed: {}", e);
Ok(GurtResponse::internal_server_error()
.with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
}
}
}
/// Handles GET /api/search?q=...&page=...&per_page=... — the paginated
/// JSON search API.
async fn handle_api_search(
    path: String,
    search_engine: Arc<SearchEngine>,
    config: Config
) -> Result<GurtResponse, GurtError> {
    let query = parse_query_param(&path, "q");
    if query.is_empty() {
        return Ok(GurtResponse::bad_request()
            .with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
    }

    // Page numbers are 1-based; per_page is capped by configuration.
    let page = parse_query_param_usize(&path, "page").unwrap_or(1).max(1);
    let per_page = parse_query_param_usize(&path, "per_page")
        .unwrap_or(config.search.search_results_per_page)
        .min(config.search.max_search_results);

    let outcome = search_engine.search_with_response(&query, page, per_page).await;
    match outcome {
        Ok(body) => Ok(GurtResponse::ok()
            .with_header("content-type", "application/json")
            .with_json_body(&body)?),
        Err(e) => {
            error!("API search failed: {}", e);
            Ok(GurtResponse::internal_server_error()
                .with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
        }
    }
}