use anyhow::{Result, Context}; use chrono::{DateTime, Utc}; use std::collections::HashSet; use std::path::Path; use std::time::Instant; use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING, TEXT}; use tantivy::{doc, Index, IndexReader, IndexWriter, ReloadPolicy, Term, TantivyDocument}; use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::tokenizer::*; use tantivy::schema::Value; use tracing::{info, debug}; use crate::config::Config; use crate::models::{SearchResult, SearchResponse, IndexStats, CrawledPage}; pub struct SearchEngine { config: Config, index: Index, reader: IndexReader, schema: Schema, } impl SearchEngine { pub fn new(config: Config) -> Result { let index_path = &config.search.index_path; std::fs::create_dir_all(index_path) .with_context(|| format!("Failed to create index directory: {:?}", index_path))?; let schema = build_schema(); let index = if index_path.join("meta.json").exists() { info!("Loading existing search index from {:?}", index_path); Index::open_in_dir(index_path) .with_context(|| format!("Failed to open existing index at {:?}", index_path))? } else { info!("Creating new search index at {:?}", index_path); Index::create_in_dir(index_path, schema.clone()) .with_context(|| format!("Failed to create new index at {:?}", index_path))? }; // Configure tokenizers let tokenizer_manager = index.tokenizers(); tokenizer_manager.register( "gurted_text", TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(StopWordFilter::new(Language::English).unwrap()) .build(), ); let reader = index .reader_builder() .reload_policy(ReloadPolicy::OnCommitWithDelay) .try_into() .context("Failed to create index reader")?; Ok(Self { config, index, reader, schema, }) } pub async fn index_pages(&self, pages: Vec) -> Result { if pages.is_empty() { return Ok(0); } let start_time = Instant::now(); let mut writer = self.get_writer()?; let mut indexed_count = 0; let mut duplicate_count = 0; let url_field = self.schema.get_field("url").unwrap(); let title_field = self.schema.get_field("title").unwrap(); let content_field = self.schema.get_field("content").unwrap(); let preview_field = self.schema.get_field("preview").unwrap(); let domain_field = self.schema.get_field("domain").unwrap(); let indexed_at_field = self.schema.get_field("indexed_at").unwrap(); let content_hash_field = self.schema.get_field("content_hash").unwrap(); let icon_field = self.schema.get_field("icon").unwrap(); let description_field = self.schema.get_field("description").unwrap(); info!("Indexing {} pages...", pages.len()); for page in pages { // Check for duplicates (always enabled) if let Ok(existing_hash) = self.get_document_hash(&page.url).await { if existing_hash == page.content_hash { duplicate_count += 1; continue; } } // Remove existing document for this URL let url_term = Term::from_field_text(url_field, &page.url); writer.delete_term(url_term); let preview = page.generate_preview(500); let title = page.title.unwrap_or_else(|| extract_title_from_content(&page.content)); // Add new document writer.add_document(doc!( url_field => page.url.clone(), title_field => title, content_field => page.content.clone(), preview_field => preview, domain_field => page.domain.clone(), indexed_at_field => page.indexed_at.timestamp(), content_hash_field => page.content_hash.clone(), icon_field => page.icon.unwrap_or_default(), description_field => page.description.unwrap_or_default() ))?; indexed_count += 1; // Commit in batches if indexed_count % 100 == 0 { writer.commit() .context("Failed to commit batch of documents")?; writer = self.get_writer()?; // Get new writer after commit let elapsed = start_time.elapsed().as_secs_f64(); let rate = indexed_count as f64 / elapsed; info!("Indexed {} pages ({:.1} pages/sec)", indexed_count, rate); } } // Final commit writer.commit().context("Failed to commit final batch")?; let total_time = start_time.elapsed(); info!( "Indexing completed: {} pages indexed, {} duplicates skipped in {:.2}s", indexed_count, duplicate_count, total_time.as_secs_f64() ); Ok(indexed_count) } pub async fn search(&self, query: &str, limit: usize) -> Result> { let start_time = Instant::now(); let searcher = self.reader.searcher(); let url_field = self.schema.get_field("url").unwrap(); let title_field = self.schema.get_field("title").unwrap(); let content_field = self.schema.get_field("content").unwrap(); let preview_field = self.schema.get_field("preview").unwrap(); let domain_field = self.schema.get_field("domain").unwrap(); let indexed_at_field = self.schema.get_field("indexed_at").unwrap(); let icon_field = self.schema.get_field("icon").unwrap(); let description_field = self.schema.get_field("description").unwrap(); // Create query parser for title and content fields let query_parser = QueryParser::for_index( &self.index, vec![title_field, content_field] ); let parsed_query = query_parser .parse_query(query) .with_context(|| format!("Failed to parse query: {}", query))?; let top_docs = searcher .search(&parsed_query, &TopDocs::with_limit(limit)) .context("Search query execution failed")?; let mut results = Vec::new(); for (score, doc_address) in top_docs { let doc: TantivyDocument = searcher.doc(doc_address)?; let url = doc.get_first(url_field) .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let title = doc.get_first(title_field) .and_then(|v| v.as_str()) .unwrap_or("Untitled") .to_string(); let preview = doc.get_first(preview_field) .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let domain = doc.get_first(domain_field) .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let indexed_at_timestamp = doc.get_first(indexed_at_field) .and_then(|v| v.as_i64()) .unwrap_or(0); let indexed_at = DateTime::from_timestamp(indexed_at_timestamp, 0) .unwrap_or_else(|| Utc::now()); let icon = doc.get_first(icon_field) .and_then(|v| v.as_str()) .filter(|s| !s.is_empty()) .map(|s| s.to_string()); let description = doc.get_first(description_field) .and_then(|v| v.as_str()) .filter(|s| !s.is_empty()) .map(|s| s.to_string()); results.push(SearchResult { url, title, preview, domain, score, indexed_at, icon, description, }); } let search_time = start_time.elapsed(); debug!( "Search completed: {} results for '{}' in {:.2}ms", results.len(), query, search_time.as_millis() ); Ok(results) } pub async fn search_with_response(&self, query: &str, page: usize, per_page: usize) -> Result { let offset = page.saturating_sub(1) * per_page; let limit = std::cmp::min(per_page, self.config.search.max_search_results); let all_results = self.search(query, offset + limit).await?; let results = all_results.into_iter().skip(offset).take(per_page).collect(); let total_results = self.get_total_document_count().await?; Ok(SearchResponse { query: query.to_string(), results, total_results, page, per_page, }) } pub async fn get_stats(&self) -> Result { let searcher = self.reader.searcher(); let total_documents = searcher.num_docs() as usize; // Count unique domains (simplified approach) let domains: HashSet = HashSet::new(); // TODO: Implement domain counting when needed let total_domains = domains.len(); // Calculate index size let index_size_mb = calculate_directory_size(&self.config.search.index_path)?; // Get last update time (approximate) let last_updated = get_index_last_modified(&self.config.search.index_path)?; Ok(IndexStats { total_documents, total_domains, index_size_mb, last_updated, }) } pub async fn get_total_document_count(&self) -> Result { let searcher = self.reader.searcher(); Ok(searcher.num_docs() as usize) } async fn get_document_hash(&self, url: &str) -> Result { let searcher = self.reader.searcher(); let url_field = self.schema.get_field("url").unwrap(); let content_hash_field = self.schema.get_field("content_hash").unwrap(); let query_parser = QueryParser::for_index(&self.index, vec![url_field]); let query = query_parser.parse_query(&format!("\"{}\"", url))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?; if let Some((_, doc_address)) = top_docs.first() { let doc: TantivyDocument = searcher.doc(*doc_address)?; if let Some(hash_value) = doc.get_first(content_hash_field) { if let Some(hash_str) = hash_value.as_str() { return Ok(hash_str.to_string()); } } } Err(anyhow::anyhow!("Document not found: {}", url)) } fn get_writer(&self) -> Result { self.index .writer_with_num_threads(4, 256 * 1024 * 1024) // 256MB buffer .context("Failed to create index writer") } } pub async fn rebuild_index(config: Config) -> Result<()> { info!("Starting index rebuild..."); // Remove existing index if config.search.index_path.exists() { std::fs::remove_dir_all(&config.search.index_path) .context("Failed to remove existing index")?; } // Create new search engine (which will create a new index) let _search_engine = SearchEngine::new(config)?; info!("Index rebuild completed - new empty index created"); info!("Run a crawl to populate the index with content"); Ok(()) } fn build_schema() -> Schema { let mut schema_builder = Schema::builder(); schema_builder.add_text_field("url", STRING | STORED | FAST); schema_builder.add_text_field("title", TEXT | STORED); schema_builder.add_text_field("content", TEXT); schema_builder.add_text_field("preview", STRING | STORED); schema_builder.add_text_field("domain", STRING | STORED | FAST); schema_builder.add_i64_field("indexed_at", INDEXED | STORED | FAST); schema_builder.add_text_field("content_hash", STRING | STORED); schema_builder.add_text_field("icon", STRING | STORED); schema_builder.add_text_field("description", STRING | STORED); schema_builder.build() } fn extract_title_from_content(content: &str) -> String { // Try to extract title from HTML content let document = scraper::Html::parse_document(content); let title_selector = scraper::Selector::parse("title").unwrap(); if let Some(title_element) = document.select(&title_selector).next() { let title = title_element.text().collect::>().join(" "); if !title.trim().is_empty() { return title.trim().to_string(); } } // Fallback to h1 let h1_selector = scraper::Selector::parse("h1").unwrap(); if let Some(h1_element) = document.select(&h1_selector).next() { let h1_text = h1_element.text().collect::>().join(" "); if !h1_text.trim().is_empty() { return h1_text.trim().to_string(); } } // Fallback to first line of content content.lines() .find(|line| !line.trim().is_empty()) .unwrap_or("Untitled") .trim() .to_string() } fn calculate_directory_size(path: &Path) -> Result { let mut total_size = 0u64; if path.is_dir() { for entry in std::fs::read_dir(path)? { let entry = entry?; let metadata = entry.metadata()?; if metadata.is_file() { total_size += metadata.len(); } else if metadata.is_dir() { total_size += (calculate_directory_size(&entry.path())? * 1024.0 * 1024.0) as u64; } } } Ok(total_size as f64 / 1024.0 / 1024.0) // Convert to MB } fn get_index_last_modified(path: &Path) -> Result> { let meta_path = path.join("meta.json"); if meta_path.exists() { let metadata = std::fs::metadata(meta_path)?; let modified = metadata.modified()?; let datetime = DateTime::::from(modified); Ok(datetime) } else { Ok(Utc::now()) } }