add search engine - ringle

This commit is contained in:
Face
2025-08-27 20:23:05 +03:00
parent 1cf81bbfee
commit 347b40ed71
47 changed files with 7214 additions and 493 deletions

4407
search-engine/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

31
search-engine/Cargo.toml Normal file
View File

@@ -0,0 +1,31 @@
[package]
name = "gurted-search-engine"
version = "0.1.0"
edition = "2021"
[dependencies]
tokio = { version = "1.38.0", features = ["full"] }
futures = "0.3.30"
tantivy = "0.22"
sha2 = "0.10"
gurt = { path = "../protocol/library" }
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "chrono", "uuid"] }
scraper = "0.20"
lol_html = "1.2"
url = "2.5"
toml = "0.8.13"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
anyhow = "1.0.86"
thiserror = "1.0"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
uuid = { version = "1.0", features = ["v4"] }
regex = "1.10.4"
mime = "0.3"
base64 = "0.22"
glob = "0.3"
clap = { version = "4.5.4", features = ["derive"] }
urlencoding = "2.1"
reqwest = "0.11"

5
search-engine/README.md Normal file
View File

@@ -0,0 +1,5 @@
The official Gurted search engine, Ringle.
Copy `config.template.toml` to `config.toml` and edit it as needed.
Run it with `cargo run`.

View File

@@ -0,0 +1,51 @@
[database]
url = "postgres://..."
max_connections = 5
[server]
address = "127.0.0.1"
port = 4879
cert_path = "certs/t.crt"
key_path = "certs/t.key"
[search]
index_path = "./search_indexes"
crawl_interval_hours = 2
max_pages_per_domain = 1000
crawler_timeout_seconds = 30
crawler_user_agent = "GurtedSearchBot/1.0"
max_concurrent_crawls = 5
content_size_limit_mb = 10
index_rebuild_interval_hours = 48
search_results_per_page = 20
max_search_results = 1000
allowed_extensions = [
"html", "htm", "txt", "md", "json", "xml", "rss", "atom"
]
blocked_extensions = [
"exe", "zip", "rar", "tar", "gz", "7z", "iso", "dmg",
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
"jpg", "jpeg", "png", "gif", "bmp", "svg", "webp",
"mp3", "mp4", "avi", "mov", "wmv", "flv", "webm",
"css", "js", "woff", "woff2", "ttf", "eot"
]
[crawler]
clanker_txt = true
crawl_delay_ms = 1000
max_redirects = 5
follow_external_links = false
max_depth = 10
request_headers = [
["Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"],
["Accept-Language", "en-US,en;q=0.5"],
["Accept-Encoding", "gzip, deflate"],
["DNT", "1"],
]
[logging]
level = "info"
format = "compact"

View File

@@ -0,0 +1,96 @@
<head>
<title>Gurted Search Engine</title>
<meta name="description" content="Search across all registered GURT domains">
<meta name="theme-color" content="#000000">
<icon src="https://cdn-icons-png.flaticon.com/512/295/295128.png">
<style>
body {
font-sans bg-[#000000] text-[#ffffff] min-h-screen flex flex-col items-center p-8
}
.search-container {
max-w-2xl w-full flex flex-col gap-8
}
.logo {
text-center text-5xl font-bold text-[#dc2626] mb-8
}
.search-box {
flex flex-col gap-4
}
.search-input {
w-[600px] p-3 text-base bg-[#1a1a1a] border-2 border-[#333333] rounded-lg text-[#ffffff] outline-none active:border-[#dc2626]
}
.search-btn {
bg-[#1a1a1a] text-[#ffffff] px-4 py-2 rounded text-sm border border-[#333333] cursor-pointer hover:border-[#dc2626] hover:shadow-sm
}
.results {
w-full flex flex-col gap-4 mt-8
}
.result-item {
p-4 cursor-pointer hover:bg-[#0a0a0a] rounded
}
.result-header {
inline-flex gap-2 mb-2
}
.result-icon {
w-6 h-6 min-w-6 min-h-6 rounded border border-[#333333] bg-[#1a1a1a] object-cover mt-1
}
.result-content {
flex-1 min-w-0
}
.result-title {
text-xl font-normal text-[#dc2626] mb-1 hover:underline
}
.result-url {
text-[#4ade80] text-sm mb-2
}
.result-preview {
text-[#cccccc] text-sm leading-relaxed
}
.stats {
text-left text-[#999999] text-sm mt-4 ml-1
}
.loading {
text-center text-[#999999] mt-8
}
</style>
<script src="search.lua" />
</head>
<body>
<div style="search-container">
<h1 style="logo">Ringle</h1>
<div style="w-full flex flex-col gap-4 items-center justify-center content-center">
<input type="text" id="searchQuery" style="search-input" placeholder="Search across the Gurted..." />
<div style="inline-flex gap-2 items-center justify-center">
<button id="searchButton" style="search-btn">Search</button>
<button id="luckyButton" style="search-btn">I'm feeling lucky</button>
</div>
</div>
<div id="loading" style="loading hidden">
Searching...
</div>
<div id="results" style="results"></div>
<div id="stats" style="stats"></div>
</div>
</body>

View File

@@ -0,0 +1,150 @@
-- Cache references to the page elements the search UI manipulates.
local searchBtn = gurt.select('#searchButton')
local luckyBtn = gurt.select('#luckyButton')
local searchQuery = gurt.select('#searchQuery')
local loading = gurt.select('#loading')   -- "Searching..." indicator
local results = gurt.select('#results')   -- container for result items
local stats = gurt.select('#stats')       -- "Found N results" line
-- Wipe any previous results/stats and reveal the loading indicator.
local function showLoading()
    results.text = ''
    stats.text = ''
    loading.classList:remove('hidden')
end
-- Render a search API response into the results pane.
-- `data.results` is an array of { url, title, icon, preview, description };
-- `data.total_results` (optional) is the server-reported hit count.
-- Shows a placeholder item and message when there are no hits.
local function displayResults(data)
    loading.classList:add('hidden')
    results.text = ''
    if not data.results or #data.results == 0 then
        local noResultsItem = gurt.create('div', {
            text = 'No results found for your query.',
            style = 'result-item'
        })
        results:append(noResultsItem)
        stats.text = 'No results found'
        return
    end
    for i, result in ipairs(data.results) do
        -- The whole row is clickable and navigates to the result URL.
        local resultItem = gurt.create('div', { style = 'result-item' })
        resultItem:on('click', function()
            gurt.location.goto(result.url)
        end)
        local headerDiv = gurt.create('div', { style = 'result-header' })
        -- Site icon is optional; only render it when the crawler found one.
        if result.icon and result.icon ~= '' then
            local iconImg = gurt.create('img', {
                src = result.icon,
                style = 'result-icon',
                alt = 'Site icon'
            })
            headerDiv:append(iconImg)
        end
        -- Fall back to the URL when a page has no title.
        local titleDiv = gurt.create('p', {
            text = result.title or result.url,
            style = 'result-title'
        })
        headerDiv:append(titleDiv)
        local urlDiv = gurt.create('p', {
            text = result.url,
            style = 'result-url'
        })
        -- Truncate long previews to 150 chars, ellipsis included.
        local previewText = result.preview or result.description or ''
        if #previewText > 150 then
            previewText = previewText:sub(1, 147) .. '...'
        end
        local previewDiv = gurt.create('p', {
            text = previewText,
            style = 'result-preview'
        })
        resultItem:append(headerDiv)
        resultItem:append(urlDiv)
        resultItem:append(previewDiv)
        results:append(resultItem)
    end
    local resultCount = #data.results
    -- Prefer the server-reported total when present.
    local totalResults = data.total_results or resultCount
    stats.text = 'Found ' .. totalResults .. ' result' .. (totalResults == 1 and '' or 's')
end
-- Query the search API for `query` and hand the decoded JSON to
-- displayResults. Blank or absent queries are ignored.
local function performSearch(query)
    if not query or query == '' then
        return
    end
    showLoading()
    local endpoint = '/api/search?q=' .. urlEncode(query) .. '&per_page=20'
    local response = fetch(endpoint, { method = 'GET' })
    if not response:ok() then
        -- Surface the HTTP failure in the stats line.
        loading.classList:add('hidden')
        results.text = ''
        stats.text = 'Search failed: ' .. response.status .. ' ' .. response.statusText
        return
    end
    displayResults(response:json())
end
-- Search for a random common term and jump straight to a random result.
-- Both failure paths (HTTP error, empty result set) report via the stats line.
local function performLuckySearch()
    local function showError(message)
        loading.classList:add('hidden')
        results.text = ''
        stats.text = message
    end
    showLoading()
    local luckyTerms = {'test', 'demo', 'api', 'web', 'site', 'page', 'home', 'index'}
    local randomTerm = luckyTerms[math.random(#luckyTerms)]
    local endpoint = '/api/search?q=' .. urlEncode(randomTerm) .. '&per_page=50'
    local response = fetch(endpoint, { method = 'GET' })
    if not response:ok() then
        showError('Lucky search failed')
        return
    end
    local data = response:json()
    if data.results and #data.results > 0 then
        local randomResult = data.results[math.random(#data.results)]
        gurt.location.goto(randomResult.url)
    else
        showError('No sites available for lucky search')
    end
end
-- Wire up the UI: search button, lucky button, Enter-to-search, and focus
-- the query input on load. Queries are trimmed before submission.
searchBtn:on('click', function()
    local query = searchQuery.value
    if query and query ~= '' then
        performSearch(query:trim())
    end
end)
luckyBtn:on('click', function()
    performLuckySearch()
end)
searchQuery:on('keydown', function(e)
    if e.key == 'Enter' then
        local query = searchQuery.value
        if query and query ~= '' then
            performSearch(query:trim())
        end
    end
end)
searchQuery:focus()

121
search-engine/src/config.rs Normal file
View File

@@ -0,0 +1,121 @@
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
/// Top-level application configuration, deserialized from `config.toml`
/// (see `config.template.toml` for the expected layout).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Config {
    pub database: DatabaseConfig,
    pub server: ServerConfig,
    pub search: SearchConfig,
    pub crawler: CrawlerConfig,
    pub logging: LoggingConfig,
}
/// Postgres connection settings (`[database]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DatabaseConfig {
    /// Full connection string, e.g. `postgres://user:pass@host/db`.
    pub url: String,
    pub max_connections: u32,
}
/// Listener settings for the GURT server (`[server]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServerConfig {
    pub address: String,
    pub port: u16,
    /// TLS certificate and key used by the GURT listener.
    pub cert_path: PathBuf,
    pub key_path: PathBuf,
}
/// Indexing and crawl-scheduling settings (`[search]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SearchConfig {
    /// Directory holding the tantivy index files.
    pub index_path: PathBuf,
    pub crawl_interval_hours: u64,
    /// Hard cap on pages visited per domain in a single crawl run.
    pub max_pages_per_domain: usize,
    pub crawler_timeout_seconds: u64,
    pub crawler_user_agent: String,
    pub max_concurrent_crawls: usize,
    /// Pages larger than this (in MB) are skipped, not indexed.
    pub content_size_limit_mb: usize,
    pub index_rebuild_interval_hours: u64,
    pub search_results_per_page: usize,
    pub max_search_results: usize,
    /// Extension allowlist; empty means "allow anything not blocked".
    pub allowed_extensions: Vec<String>,
    /// Extension blocklist; takes precedence over the allowlist.
    pub blocked_extensions: Vec<String>,
}
/// Crawler politeness and traversal settings (`[crawler]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CrawlerConfig {
    /// Whether to honor each domain's clanker.txt (robots.txt analogue).
    pub clanker_txt: bool,
    /// Delay inserted between successive requests to the same domain.
    pub crawl_delay_ms: u64,
    pub max_redirects: usize,
    pub follow_external_links: bool,
    /// Maximum link depth from the domain root.
    pub max_depth: usize,
    /// Extra headers sent with every crawl request, as (name, value) pairs.
    pub request_headers: Vec<(String, String)>,
}
/// Tracing/logging settings (`[logging]` section).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct LoggingConfig {
    /// Log level filter, e.g. "info".
    pub level: String,
    /// Output format name, e.g. "compact".
    pub format: String,
}
impl Config {
    /// Read and deserialize a TOML config file; errors name the offending path.
    pub fn load_from_file(path: &str) -> anyhow::Result<Self> {
        let content = std::fs::read_to_string(path)
            .map_err(|e| anyhow::anyhow!("Failed to read config file {}: {}", path, e))?;
        let config: Config = toml::from_str(&content)
            .map_err(|e| anyhow::anyhow!("Failed to parse config file {}: {}", path, e))?;
        Ok(config)
    }

    /// Postgres connection string.
    pub fn database_url(&self) -> &str {
        &self.database.url
    }

    /// `host:port` string for binding the server socket.
    pub fn server_bind_address(&self) -> String {
        format!("{}:{}", self.server.address, self.server.port)
    }

    /// Public `gurt://` URL of this server.
    pub fn gurt_protocol_url(&self) -> String {
        format!("gurt://{}:{}", self.server.address, self.server.port)
    }

    /// True when `extension` may be crawled: never when blocked, always when
    /// the allowlist is empty, otherwise only when allowlisted. Comparisons
    /// are ASCII case-insensitive.
    pub fn is_allowed_extension(&self, extension: &str) -> bool {
        if self.is_blocked_extension(extension) {
            return false;
        }
        if self.search.allowed_extensions.is_empty() {
            return true;
        }
        self.search.allowed_extensions.iter()
            .any(|ext| ext.eq_ignore_ascii_case(extension))
    }

    /// True when `extension` appears in the blocklist (case-insensitive).
    pub fn is_blocked_extension(&self, extension: &str) -> bool {
        self.search.blocked_extensions.iter()
            .any(|ext| ext.eq_ignore_ascii_case(extension))
    }

    /// Configured page-size limit converted from MB to bytes, saturating
    /// rather than overflowing on absurd configured values.
    pub fn content_size_limit_bytes(&self) -> usize {
        self.search
            .content_size_limit_mb
            .saturating_mul(1024)
            .saturating_mul(1024)
    }

    /// Per-request crawler timeout.
    pub fn crawler_timeout(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.crawler_timeout_seconds)
    }

    /// Delay inserted between successive crawl requests.
    pub fn crawl_delay(&self) -> std::time::Duration {
        std::time::Duration::from_millis(self.crawler.crawl_delay_ms)
    }

    /// How often a domain becomes due for a re-crawl. Saturating multiply
    /// keeps this consistent with `content_size_limit_bytes` and avoids a
    /// debug-mode overflow panic on extreme configured hour values.
    pub fn crawl_interval(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.crawl_interval_hours.saturating_mul(3600))
    }

    /// How often the search index is rebuilt from scratch.
    pub fn index_rebuild_interval(&self) -> std::time::Duration {
        std::time::Duration::from_secs(self.search.index_rebuild_interval_hours.saturating_mul(3600))
    }
}

View File

@@ -0,0 +1,706 @@
use anyhow::{Result, Context};
use chrono::Utc;
use gurt::{GurtClient, GurtClientConfig};
use scraper::{Html, Selector};
use std::collections::{HashSet, VecDeque};
use std::sync::Arc;
use tracing::{info, debug, warn, error};
use url::Url;
use crate::config::Config;
use crate::models::{Domain, DomainRepository, CrawledPage};
/// A crawled page paired with its raw HTML, so link extraction can run on
/// the original markup after the indexable text has been cleaned.
#[derive(Debug, Clone)]
struct CrawledPageWithHtml {
    crawled_page: CrawledPage,
    original_html: String,
}
use crate::indexer::SearchEngine;
/// Crawls registered GURT domains over the GURT protocol and feeds the
/// discovered pages into the search index.
#[derive(Clone)]
pub struct DomainCrawler {
    config: Config,
    gurt_client: GurtClient,
    domain_repo: DomainRepository,
    search_engine: Arc<SearchEngine>,
}
impl DomainCrawler {
    /// Build a crawler: fetch the Gurted root CA via the HTTP bootstrap
    /// endpoint, then configure a GURT client with the timeout, user agent,
    /// and redirect limit from `config`.
    pub async fn new(config: Config, domain_repo: DomainRepository, search_engine: Arc<SearchEngine>) -> Result<Self> {
        // Fetch the Gurted CA certificate from the DNS server
        let ca_cert = Self::fetch_ca_certificate().await
            .context("Failed to fetch Gurted CA certificate")?;
        let gurt_config = GurtClientConfig {
            request_timeout: config.crawler_timeout(),
            user_agent: config.search.crawler_user_agent.clone(),
            max_redirects: config.crawler.max_redirects,
            custom_ca_certificates: vec![ca_cert],
            ..Default::default()
        };
        let gurt_client = GurtClient::with_config(gurt_config);
        Ok(Self {
            config,
            gurt_client,
            domain_repo,
            search_engine,
        })
    }

    /// Fetch the Gurted CA certificate (PEM text) from the DNS server's
    /// plain-HTTP bootstrap endpoint.
    async fn fetch_ca_certificate() -> Result<String> {
        // Use GurtClient's DNS server configuration to build the HTTP URL
        let dns_ip = GurtClientConfig::default().dns_server_ip;
        // The HTTP bootstrap server runs on port 8876 (hardcoded in DNS server)
        let http_url = format!("http://{}:8876/ca/root", dns_ip);
        let response = reqwest::get(&http_url).await
            .context("Failed to fetch CA certificate from HTTP bootstrap server")?;
        if !response.status().is_success() {
            return Err(anyhow::anyhow!("Failed to fetch CA certificate: HTTP {}", response.status()));
        }
        let ca_cert = response.text().await
            .context("Failed to read CA certificate response")?;
        if ca_cert.trim().is_empty() {
            return Err(anyhow::anyhow!("Received empty CA certificate"));
        }
        Ok(ca_cert)
    }

    /// Crawl a single domain end-to-end, recording status transitions
    /// (`crawling` -> `completed`/`failed`) in the domain repository and
    /// returning per-run statistics.
    pub async fn crawl_domain(&self, domain: &Domain) -> Result<CrawlStats> {
        info!("Starting crawl for domain: {}", domain.full_domain());
        let start_time = std::time::Instant::now();
        let mut stats = CrawlStats::new();
        self.domain_repo
            .update_crawl_status(domain.id, "crawling", None, None, None)
            .await
            .context("Failed to update crawl status to crawling")?;
        let result = self.crawl_domain_internal(domain, &mut stats).await;
        let duration = start_time.elapsed();
        stats.duration_seconds = duration.as_secs();
        match result {
            Ok(()) => {
                info!(
                    "Successfully crawled domain {} - {} pages found, {} indexed in {:.2}s",
                    domain.full_domain(),
                    stats.pages_found,
                    stats.pages_indexed,
                    duration.as_secs_f64()
                );
                self.domain_repo
                    .update_crawl_status(
                        domain.id,
                        "completed",
                        None,
                        Some(stats.pages_found as i32),
                        Some(self.config.search.crawl_interval_hours as i64),
                    )
                    .await
                    .context("Failed to update crawl status to completed")?;
            }
            Err(e) => {
                error!(
                    "Failed to crawl domain {}: {}",
                    domain.full_domain(),
                    e
                );
                // Failed crawls are retried on a fixed 24h backoff.
                self.domain_repo
                    .update_crawl_status(
                        domain.id,
                        "failed",
                        Some(&e.to_string()),
                        Some(stats.pages_found as i32),
                        Some(24),
                    )
                    .await
                    .context("Failed to update crawl status to failed")?;
                return Err(e);
            }
        }
        Ok(stats)
    }

    /// Fetch and parse `<base_url>/clanker.txt`. Returns the explicitly
    /// allowed URLs (may be empty); errors if crawling is disallowed.
    /// A missing or unfetchable clanker.txt means crawling is permitted.
    async fn check_clanker_txt(&self, base_url: &str) -> Result<Vec<String>> {
        let clanker_url = format!("{}/clanker.txt", base_url);
        debug!("Checking clanker.txt at: {}", clanker_url);
        match self.gurt_client.get(&clanker_url).await {
            Ok(response) => {
                if response.status_code == 200 {
                    let content = String::from_utf8_lossy(&response.body);
                    let urls = self.parse_clanker_txt(&content, base_url)?;
                    debug!("Found {} allowed URLs in clanker.txt", urls.len());
                    return Ok(urls);
                }
                // If clanker.txt doesn't exist (404), crawling is allowed
                Ok(vec![])
            }
            Err(_) => {
                // If we can't fetch clanker.txt, assume crawling is allowed
                Ok(vec![])
            }
        }
    }

    /// Case-insensitively strip `directive` (e.g. "allow:") from the start of
    /// `line`, returning the remainder *from the original line* so the value
    /// keeps its case. Returns None when the line doesn't start with the
    /// directive; uses checked slicing so multi-byte input can't panic.
    fn strip_directive<'a>(line: &'a str, directive: &str) -> Option<&'a str> {
        let head = line.get(..directive.len())?;
        if head.eq_ignore_ascii_case(directive) {
            line.get(directive.len()..)
        } else {
            None
        }
    }

    /// Parse clanker.txt (robots.txt analogue). Directive names are matched
    /// case-insensitively, but path values are sliced from the original line:
    /// the previous implementation read them from `line.to_lowercase()`,
    /// which lowercased Allow paths (URL paths are case-sensitive) and could
    /// mis-slice non-ASCII lines whose lowercased form differs in length.
    /// `Disallow: /` for a matching user-agent aborts the whole crawl.
    fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
        let user_agent = &self.config.search.crawler_user_agent;
        let mut disallow_all = false;
        let mut user_agent_matches = false;
        let mut allowed_urls = Vec::new();
        for line in content.lines() {
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }
            if let Some(value) = Self::strip_directive(line, "user-agent:") {
                let current_user_agent = value.trim();
                user_agent_matches = current_user_agent == "*" || current_user_agent.eq_ignore_ascii_case(user_agent);
                continue;
            }
            if user_agent_matches {
                if let Some(value) = Self::strip_directive(line, "disallow:") {
                    if value.trim() == "/" {
                        disallow_all = true;
                        break;
                    }
                } else if let Some(value) = Self::strip_directive(line, "allow:") {
                    let path = value.trim();
                    if !path.is_empty() {
                        let full_url = format!("{}{}", base_url, path);
                        debug!("Added allowed URL from clanker.txt: {}", full_url);
                        allowed_urls.push(full_url);
                    }
                }
            }
        }
        if disallow_all {
            Err(anyhow::anyhow!("Crawling disallowed by clanker.txt"))
        } else {
            Ok(allowed_urls)
        }
    }

    /// Breadth-first crawl of one domain: seeds the queue with the root URL
    /// and any clanker.txt Allow URLs, respects depth/page limits and the
    /// configured crawl delay, and indexes pages in batches of 50.
    async fn crawl_domain_internal(&self, domain: &Domain, stats: &mut CrawlStats) -> Result<()> {
        let base_url = domain.gurt_url();
        let mut visited_urls = HashSet::new();
        let mut queue = VecDeque::new();
        let mut pages_to_index = Vec::new();
        // Check clanker.txt if enabled and get allowed URLs
        let mut clanker_urls = Vec::new();
        if self.config.crawler.clanker_txt {
            match self.check_clanker_txt(&base_url).await {
                Ok(urls) => {
                    clanker_urls = urls;
                    info!("Found {} URLs in clanker.txt for {}", clanker_urls.len(), domain.full_domain());
                },
                Err(e) => {
                    warn!("Clanker.txt check failed for {}: {}", domain.full_domain(), e);
                    return Err(anyhow::anyhow!("Crawling disabled by clanker.txt: {}", e));
                }
            }
        }
        // Start with the root URL
        queue.push_back(CrawlItem {
            url: base_url.clone(),
            depth: 0,
        });
        // Add all URLs from clanker.txt to the queue
        for url in clanker_urls {
            if !visited_urls.contains(&url) {
                queue.push_back(CrawlItem {
                    url: url.clone(),
                    depth: 0, // Treat clanker.txt URLs as root level
                });
                debug!("Added clanker.txt URL to queue: {}", url);
            }
        }
        while let Some(item) = queue.pop_front() {
            if visited_urls.contains(&item.url) {
                continue;
            }
            if item.depth > self.config.crawler.max_depth {
                debug!("Skipping URL due to depth limit: {}", item.url);
                continue;
            }
            if stats.pages_found >= self.config.search.max_pages_per_domain {
                info!("Reached page limit for domain: {}", domain.full_domain());
                break;
            }
            visited_urls.insert(item.url.clone());
            stats.pages_found += 1;
            // Add crawl delay between requests
            if stats.pages_found > 1 {
                tokio::time::sleep(self.config.crawl_delay()).await;
            }
            match self.crawl_page(&item.url, domain).await {
                Ok(Some(page_with_html)) => {
                    // Extract links if not at max depth
                    if item.depth < self.config.crawler.max_depth {
                        if let Ok(links) = self.extract_links(&page_with_html.original_html, &base_url).await {
                            debug!("Found {} links on {}", links.len(), item.url);
                            for link in links {
                                if self.should_crawl_url(&link, domain) {
                                    debug!("Adding link to crawl queue: {}", link);
                                    queue.push_back(CrawlItem {
                                        url: link,
                                        depth: item.depth + 1,
                                    });
                                }
                            }
                        }
                    }
                    pages_to_index.push(page_with_html.crawled_page);
                    stats.pages_indexed += 1;
                    // Index in batches
                    if pages_to_index.len() >= 50 {
                        let batch = std::mem::take(&mut pages_to_index);
                        self.search_engine.index_pages(batch).await?;
                    }
                }
                Ok(None) => {
                    debug!("Skipped page: {}", item.url);
                    stats.pages_skipped += 1;
                }
                Err(e) => {
                    warn!("Failed to crawl page {}: {}", item.url, e);
                    stats.errors += 1;
                }
            }
        }
        // Index remaining pages
        if !pages_to_index.is_empty() {
            self.search_engine.index_pages(pages_to_index).await?;
        }
        Ok(())
    }

    /// Fetch one page. Returns Ok(None) when the page is deliberately
    /// skipped (unsupported content type or over the size limit) and Err on
    /// network/HTTP failures; otherwise returns the cleaned page plus raw HTML.
    async fn crawl_page(&self, url: &str, domain: &Domain) -> Result<Option<CrawledPageWithHtml>> {
        debug!("Crawling page: {}", url);
        let response = match self.gurt_client.get(url).await {
            Ok(response) => response,
            Err(e) => {
                return Err(anyhow::anyhow!("Failed to fetch URL: {} - {}", url, e));
            }
        };
        let status_code = response.status_code;
        let content_type = response
            .headers
            .get("content-type")
            .map(|s| s.to_string());
        // Check if we should process this content type
        if let Some(ref ct) = content_type {
            if !self.is_allowed_content_type(ct) {
                debug!("Skipping URL with unsupported content type: {} ({})", url, ct);
                return Ok(None);
            }
        }
        if status_code != 200 {
            return Err(anyhow::anyhow!(
                "HTTP error {}: {}",
                status_code,
                response.status_message
            ));
        }
        let content_bytes = response.body;
        // Check content size limit
        if content_bytes.len() > self.config.content_size_limit_bytes() {
            warn!("Skipping URL due to size limit: {} ({} bytes)", url, content_bytes.len());
            return Ok(None);
        }
        // Convert bytes to string (lossy: invalid UTF-8 becomes U+FFFD)
        let content = String::from_utf8_lossy(&content_bytes);
        // Extract metadata from HTML
        let title = self.extract_title(&content);
        let icon = self.extract_icon(&content, url);
        let description = self.extract_meta_description(&content);
        let cleaned_content = self.clean_content(&content);
        let page = CrawledPageWithHtml {
            crawled_page: CrawledPage {
                url: url.to_string(),
                domain: domain.full_domain(),
                title,
                content: cleaned_content.clone(),
                content_hash: Self::calculate_content_hash(&cleaned_content),
                indexed_at: Utc::now(),
                icon,
                description,
            },
            original_html: content.to_string(),
        };
        Ok(Some(page))
    }

    /// Collect same-host `gurt://` links from the page's anchor tags,
    /// resolving relative hrefs against `base_url`. Fragments, mailto/tel/
    /// javascript links are skipped; the result is sorted and deduplicated.
    async fn extract_links(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
        debug!("Extracting links from content length: {} chars", content.len());
        let document = Html::parse_document(content);
        let link_selector = Selector::parse("a[href]").unwrap();
        let base = Url::parse(base_url)?;
        let mut links = Vec::new();
        let all_links = document.select(&link_selector).collect::<Vec<_>>();
        debug!("Found {} anchor tags in HTML", all_links.len());
        for element in all_links {
            if let Some(href) = element.value().attr("href") {
                // Skip empty links and fragments
                if href.is_empty() || href.starts_with('#') {
                    continue;
                }
                // Skip mailto, tel, javascript links
                if href.starts_with("mailto:") || href.starts_with("tel:") || href.starts_with("javascript:") {
                    continue;
                }
                // Resolve relative URLs
                match base.join(href) {
                    Ok(absolute_url) => {
                        let url_str = absolute_url.to_string();
                        // Only include GURT protocol URLs for the same domain
                        if url_str.starts_with("gurt://") {
                            if let Ok(parsed) = Url::parse(&url_str) {
                                if let Some(host) = parsed.host_str() {
                                    if let Ok(base_parsed) = Url::parse(base_url) {
                                        if let Some(base_host) = base_parsed.host_str() {
                                            if host == base_host {
                                                links.push(url_str);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Err(e) => {
                        debug!("Failed to resolve URL {}: {}", href, e);
                    }
                }
            }
        }
        // Remove duplicates
        links.sort();
        links.dedup();
        Ok(links)
    }

    /// Page title: the `<title>` tag text, falling back to the first `<h1>`.
    fn extract_title(&self, content: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Try <title> tag first
        if let Ok(title_selector) = Selector::parse("title") {
            if let Some(title_element) = document.select(&title_selector).next() {
                let title_text = title_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
                if !title_text.is_empty() {
                    return Some(title_text);
                }
            }
        }
        // Fallback to first <h1>
        if let Ok(h1_selector) = Selector::parse("h1") {
            if let Some(h1_element) = document.select(&h1_selector).next() {
                let h1_text = h1_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
                if !h1_text.is_empty() {
                    return Some(h1_text);
                }
            }
        }
        None
    }

    /// Site icon URL: GURT's custom `<icon src=...>` tag first, then standard
    /// `<link rel="icon">`; relative hrefs are resolved against `base_url`.
    fn extract_icon(&self, content: &str, base_url: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Try to find icon tag first (custom GURT standard)
        if let Ok(icon_selector) = Selector::parse("icon") {
            if let Some(icon_element) = document.select(&icon_selector).next() {
                if let Some(src) = icon_element.value().attr("src") {
                    return Some(src.to_string());
                }
            }
        }
        // Fallback to standard link rel="icon" or link rel="shortcut icon"
        if let Ok(link_selector) = Selector::parse("link[rel~=\"icon\"], link[rel=\"shortcut icon\"]") {
            if let Some(link_element) = document.select(&link_selector).next() {
                if let Some(href) = link_element.value().attr("href") {
                    // Convert relative URLs to absolute
                    if href.starts_with("http") || href.starts_with("gurt") {
                        return Some(href.to_string());
                    } else if let Ok(base) = Url::parse(base_url) {
                        if let Ok(absolute_url) = base.join(href) {
                            return Some(absolute_url.to_string());
                        }
                    }
                }
            }
        }
        None
    }

    /// `<meta name="description">` content, if present and non-empty.
    fn extract_meta_description(&self, content: &str) -> Option<String> {
        let document = Html::parse_document(content);
        // Look for meta name="description"
        if let Ok(meta_selector) = Selector::parse("meta[name=\"description\"]") {
            if let Some(meta_element) = document.select(&meta_selector).next() {
                if let Some(content_attr) = meta_element.value().attr("content") {
                    let description = content_attr.trim();
                    if !description.is_empty() {
                        return Some(description.to_string());
                    }
                }
            }
        }
        None
    }

    /// Reduce HTML to whitespace-normalized indexable text: strip script/
    /// style/noscript, then extract the remaining text nodes.
    fn clean_content(&self, content: &str) -> String {
        use lol_html::{element, rewrite_str, RewriteStrSettings};
        // First pass: remove script, style, noscript elements
        let settings = RewriteStrSettings {
            element_content_handlers: vec![
                element!("script", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("style", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("noscript", |el| {
                    el.remove();
                    Ok(())
                }),
            ],
            ..RewriteStrSettings::default()
        };
        let cleaned_html = match rewrite_str(content, settings) {
            Ok(html) => html,
            Err(_) => content.to_string(),
        };
        // Second pass: extract text using scraper (already imported)
        let document = Html::parse_document(&cleaned_html);
        let text_content = document.root_element()
            .text()
            .collect::<Vec<_>>()
            .join(" ");
        // Clean up whitespace
        text_content
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// True when `url` is a same-host `gurt://` URL whose file extension (if
    /// any) passes the configured block/allow lists.
    fn should_crawl_url(&self, url: &str, domain: &Domain) -> bool {
        // Parse the URL
        let parsed_url = match Url::parse(url) {
            Ok(u) => u,
            Err(_) => return false,
        };
        // Must be GURT protocol
        if parsed_url.scheme() != "gurt" {
            return false;
        }
        // Must be same domain
        if let Some(host) = parsed_url.host_str() {
            if host != domain.full_domain() {
                return false;
            }
        } else {
            return false;
        }
        // Only apply extension filters when the last path segment actually
        // contains a dot (i.e. looks like a file name).
        if let Some(path) = parsed_url.path().split('/').last() {
            if let Some(extension) = path.split('.').last() {
                if path.contains('.') && extension != path {
                    if self.config.is_blocked_extension(extension) {
                        return false;
                    }
                    if !self.config.search.allowed_extensions.is_empty()
                        && !self.config.is_allowed_extension(extension) {
                        return false;
                    }
                }
            }
        }
        true
    }

    /// Content types worth indexing: HTML/XHTML, plain text, markdown, JSON.
    fn is_allowed_content_type(&self, content_type: &str) -> bool {
        let ct_lower = content_type.to_lowercase();
        if ct_lower.contains("text/html") || ct_lower.contains("application/xhtml") {
            return true;
        }
        if ct_lower.contains("text/plain") {
            return true;
        }
        if ct_lower.contains("text/markdown") || ct_lower.contains("application/json") {
            return true;
        }
        false
    }

    /// Lowercase hex SHA-256 of `content`, used for change detection.
    fn calculate_content_hash(content: &str) -> String {
        use sha2::{Sha256, Digest};
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }
}
/// Crawl every registered domain that is due, limiting concurrency with a
/// semaphore sized by `max_concurrent_crawls`, and log aggregate statistics.
/// Individual crawl failures are counted, not propagated.
pub async fn run_crawl_all(config: Config) -> Result<()> {
    info!("Starting crawl of all registered domains");
    let pool = sqlx::PgPool::connect(config.database_url()).await
        .context("Failed to connect to database")?;
    let domain_repo = DomainRepository::new(pool);
    let search_engine = Arc::new(SearchEngine::new(config.clone())?);
    let crawler = DomainCrawler::new(config.clone(), domain_repo.clone(), search_engine).await?;
    let domains = domain_repo.get_domains_for_crawling(None).await
        .context("Failed to fetch domains for crawling")?;
    if domains.is_empty() {
        info!("No domains found that need crawling");
        return Ok(());
    }
    info!("Found {} domains to crawl", domains.len());
    let mut total_stats = CrawlStats::new();
    let max_concurrent = config.search.max_concurrent_crawls;
    let semaphore = Arc::new(tokio::sync::Semaphore::new(max_concurrent));
    let mut tasks = Vec::new();
    for domain in domains {
        // DomainCrawler is Clone; a plain clone per task suffices (the
        // previous Arc wrapper added an allocation for nothing).
        let crawler = crawler.clone();
        // Acquire before spawning so at most `max_concurrent` crawls run.
        let permit = semaphore.clone().acquire_owned().await
            .context("Failed to acquire semaphore permit")?;
        let task = tokio::spawn(async move {
            let _permit = permit; // Keep permit alive for the task's duration
            crawler.crawl_domain(&domain).await
        });
        tasks.push(task);
    }
    for task in tasks {
        match task.await {
            Ok(Ok(stats)) => {
                total_stats.pages_found += stats.pages_found;
                total_stats.pages_indexed += stats.pages_indexed;
                total_stats.pages_skipped += stats.pages_skipped;
                total_stats.errors += stats.errors;
            }
            Ok(Err(e)) => {
                error!("Crawl task failed: {}", e);
                total_stats.errors += 1;
            }
            Err(e) => {
                error!("Task join error: {}", e);
                total_stats.errors += 1;
            }
        }
    }
    info!(
        "Crawl completed - {} pages found, {} indexed, {} skipped, {} errors",
        total_stats.pages_found,
        total_stats.pages_indexed,
        total_stats.pages_skipped,
        total_stats.errors
    );
    Ok(())
}
/// A queued URL together with its link distance from the crawl root.
#[derive(Debug, Clone)]
struct CrawlItem {
    url: String,
    depth: usize,
}
/// Counters accumulated over one crawl run (single domain or aggregate).
#[derive(Debug, Clone)]
pub struct CrawlStats {
    /// URLs dequeued and fetched (bounded by max_pages_per_domain).
    pub pages_found: usize,
    /// Pages queued for the search index.
    pub pages_indexed: usize,
    /// Pages deliberately skipped (content type / size limit).
    pub pages_skipped: usize,
    /// Fetch or crawl failures.
    pub errors: usize,
    pub duration_seconds: u64,
}
impl CrawlStats {
fn new() -> Self {
Self {
pages_found: 0,
pages_indexed: 0,
pages_skipped: 0,
errors: 0,
duration_seconds: 0,
}
}
}

View File

@@ -0,0 +1,405 @@
use anyhow::{Result, Context};
use chrono::{DateTime, Utc};
use std::collections::HashSet;
use std::path::Path;
use std::time::Instant;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{doc, Index, IndexReader, IndexWriter, ReloadPolicy, Term, TantivyDocument};
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::tokenizer::*;
use tantivy::schema::Value;
use tracing::{info, debug};
use crate::config::Config;
use crate::models::{SearchResult, SearchResponse, IndexStats, CrawledPage};
/// Tantivy-backed full-text index over crawled GURT pages.
pub struct SearchEngine {
    config: Config,
    index: Index,
    /// Long-lived reader; reloads on commit (with delay) per its ReloadPolicy.
    reader: IndexReader,
    schema: Schema,
}
impl SearchEngine {
pub fn new(config: Config) -> Result<Self> {
let index_path = &config.search.index_path;
std::fs::create_dir_all(index_path)
.with_context(|| format!("Failed to create index directory: {:?}", index_path))?;
let schema = build_schema();
let index = if index_path.join("meta.json").exists() {
info!("Loading existing search index from {:?}", index_path);
Index::open_in_dir(index_path)
.with_context(|| format!("Failed to open existing index at {:?}", index_path))?
} else {
info!("Creating new search index at {:?}", index_path);
Index::create_in_dir(index_path, schema.clone())
.with_context(|| format!("Failed to create new index at {:?}", index_path))?
};
// Configure tokenizers
let tokenizer_manager = index.tokenizers();
tokenizer_manager.register(
"gurted_text",
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(StopWordFilter::new(Language::English).unwrap())
.build(),
);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()
.context("Failed to create index reader")?;
Ok(Self {
config,
index,
reader,
schema,
})
}
pub async fn index_pages(&self, pages: Vec<CrawledPage>) -> Result<usize> {
if pages.is_empty() {
return Ok(0);
}
let start_time = Instant::now();
let mut writer = self.get_writer()?;
let mut indexed_count = 0;
let mut duplicate_count = 0;
let url_field = self.schema.get_field("url").unwrap();
let title_field = self.schema.get_field("title").unwrap();
let content_field = self.schema.get_field("content").unwrap();
let preview_field = self.schema.get_field("preview").unwrap();
let domain_field = self.schema.get_field("domain").unwrap();
let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
let content_hash_field = self.schema.get_field("content_hash").unwrap();
let icon_field = self.schema.get_field("icon").unwrap();
let description_field = self.schema.get_field("description").unwrap();
info!("Indexing {} pages...", pages.len());
for page in pages {
// Check for duplicates (always enabled)
if let Ok(existing_hash) = self.get_document_hash(&page.url).await {
if existing_hash == page.content_hash {
duplicate_count += 1;
continue;
}
}
// Remove existing document for this URL
let url_term = Term::from_field_text(url_field, &page.url);
writer.delete_term(url_term);
let preview = page.generate_preview(500);
let title = page.title.unwrap_or_else(|| extract_title_from_content(&page.content));
// Add new document
writer.add_document(doc!(
url_field => page.url.clone(),
title_field => title,
content_field => page.content.clone(),
preview_field => preview,
domain_field => page.domain.clone(),
indexed_at_field => page.indexed_at.timestamp(),
content_hash_field => page.content_hash.clone(),
icon_field => page.icon.unwrap_or_default(),
description_field => page.description.unwrap_or_default()
))?;
indexed_count += 1;
// Commit in batches
if indexed_count % 100 == 0 {
writer.commit()
.context("Failed to commit batch of documents")?;
writer = self.get_writer()?; // Get new writer after commit
let elapsed = start_time.elapsed().as_secs_f64();
let rate = indexed_count as f64 / elapsed;
info!("Indexed {} pages ({:.1} pages/sec)", indexed_count, rate);
}
}
// Final commit
writer.commit().context("Failed to commit final batch")?;
let total_time = start_time.elapsed();
info!(
"Indexing completed: {} pages indexed, {} duplicates skipped in {:.2}s",
indexed_count,
duplicate_count,
total_time.as_secs_f64()
);
Ok(indexed_count)
}
/// Runs a full-text query over title + content and returns up to `limit`
/// scored hits, newest reader snapshot first.
pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
    let started = Instant::now();
    let searcher = self.reader.searcher();

    // Resolve stored-field handles once; the schema is fixed at index
    // creation, so these lookups cannot fail.
    let url_field = self.schema.get_field("url").unwrap();
    let title_field = self.schema.get_field("title").unwrap();
    let content_field = self.schema.get_field("content").unwrap();
    let preview_field = self.schema.get_field("preview").unwrap();
    let domain_field = self.schema.get_field("domain").unwrap();
    let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
    let icon_field = self.schema.get_field("icon").unwrap();
    let description_field = self.schema.get_field("description").unwrap();

    // User queries match against title and content only.
    let query_parser = QueryParser::for_index(&self.index, vec![title_field, content_field]);
    let parsed_query = query_parser
        .parse_query(query)
        .with_context(|| format!("Failed to parse query: {}", query))?;

    let top_docs = searcher
        .search(&parsed_query, &TopDocs::with_limit(limit))
        .context("Search query execution failed")?;

    let mut results = Vec::with_capacity(top_docs.len());
    for (score, doc_address) in top_docs {
        let doc: TantivyDocument = searcher.doc(doc_address)?;

        // Stored text value with a fallback when the field is absent.
        let text_or = |field, fallback: &str| {
            doc.get_first(field)
                .and_then(|v| v.as_str())
                .unwrap_or(fallback)
                .to_string()
        };
        // Optional stored text value; empty strings count as missing.
        let optional_text = |field| {
            doc.get_first(field)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(str::to_string)
        };

        // Missing/invalid timestamps fall back to 0 (epoch); an
        // out-of-range epoch falls back to "now".
        let timestamp = doc.get_first(indexed_at_field)
            .and_then(|v| v.as_i64())
            .unwrap_or(0);
        let indexed_at = DateTime::from_timestamp(timestamp, 0).unwrap_or_else(Utc::now);

        results.push(SearchResult {
            url: text_or(url_field, ""),
            title: text_or(title_field, "Untitled"),
            preview: text_or(preview_field, ""),
            domain: text_or(domain_field, ""),
            score,
            indexed_at,
            icon: optional_text(icon_field),
            description: optional_text(description_field),
        });
    }

    debug!(
        "Search completed: {} results for '{}' in {:.2}ms",
        results.len(),
        query,
        started.elapsed().as_millis()
    );
    Ok(results)
}
/// Paginated search wrapper: runs `search` with enough headroom to cover
/// all pages up to `page`, then slices out the requested window.
pub async fn search_with_response(&self, query: &str, page: usize, per_page: usize) -> Result<SearchResponse> {
    // Pages are 1-based; page 0 is treated like page 1.
    let offset = page.saturating_sub(1) * per_page;
    let capped = std::cmp::min(per_page, self.config.search.max_search_results);

    let hits = self.search(query, offset + capped).await?;
    let window: Vec<_> = hits.into_iter().skip(offset).take(per_page).collect();

    Ok(SearchResponse {
        query: query.to_string(),
        results: window,
        // NOTE: this is the total document count of the index, not the
        // number of documents matching `query`.
        total_results: self.get_total_document_count().await?,
        page,
        per_page,
    })
}
/// Collects aggregate statistics about the current index snapshot.
pub async fn get_stats(&self) -> Result<IndexStats> {
    let searcher = self.reader.searcher();
    let total_documents = searcher.num_docs() as usize;

    // Domain counting is not implemented yet, so this is always empty.
    // TODO: count distinct domains from the index when needed.
    let domains: HashSet<String> = HashSet::new();
    let total_domains = domains.len();

    // On-disk footprint and an approximate last-modified timestamp.
    let index_size_mb = calculate_directory_size(&self.config.search.index_path)?;
    let last_updated = get_index_last_modified(&self.config.search.index_path)?;

    Ok(IndexStats {
        total_documents,
        total_domains,
        index_size_mb,
        last_updated,
    })
}
/// Number of live documents in the current reader snapshot.
pub async fn get_total_document_count(&self) -> Result<usize> {
    Ok(self.reader.searcher().num_docs() as usize)
}
/// Looks up the stored content hash for the document indexed under `url`.
///
/// Returns an error when no document exists for the URL (or it carries no
/// hash) — callers use this to decide whether a page is a duplicate.
async fn get_document_hash(&self, url: &str) -> Result<String> {
    let searcher = self.reader.searcher();
    let url_field = self.schema.get_field("url").unwrap();
    let content_hash_field = self.schema.get_field("content_hash").unwrap();

    // Exact-match lookup: quote the URL so it parses as a phrase query.
    let parser = QueryParser::for_index(&self.index, vec![url_field]);
    let query = parser.parse_query(&format!("\"{}\"", url))?;
    let hits = searcher.search(&query, &TopDocs::with_limit(1))?;

    let Some((_, doc_address)) = hits.first() else {
        return Err(anyhow::anyhow!("Document not found: {}", url));
    };
    let doc: TantivyDocument = searcher.doc(*doc_address)?;
    doc.get_first(content_hash_field)
        .and_then(|v| v.as_str())
        .map(str::to_string)
        .ok_or_else(|| anyhow::anyhow!("Document not found: {}", url))
}
/// Creates a fresh index writer: 4 indexing threads over a 256 MB heap.
fn get_writer(&self) -> Result<IndexWriter> {
    let writer = self
        .index
        .writer_with_num_threads(4, 256 * 1024 * 1024)
        .context("Failed to create index writer")?;
    Ok(writer)
}
}
/// Deletes the on-disk index and recreates it empty.
///
/// A follow-up crawl is required to repopulate the new index.
pub async fn rebuild_index(config: Config) -> Result<()> {
    info!("Starting index rebuild...");

    // Drop the old index directory if one exists.
    let index_path = &config.search.index_path;
    if index_path.exists() {
        std::fs::remove_dir_all(index_path)
            .context("Failed to remove existing index")?;
    }

    // Constructing the engine recreates an empty index on disk.
    let _engine = SearchEngine::new(config)?;
    info!("Index rebuild completed - new empty index created");
    info!("Run a crawl to populate the index with content");
    Ok(())
}
/// Builds the tantivy schema for crawled pages.
///
/// NOTE: field registration order is part of the schema identity — do not
/// reorder these calls.
fn build_schema() -> Schema {
    let mut builder = Schema::builder();
    builder.add_text_field("url", STRING | STORED | FAST);
    builder.add_text_field("title", TEXT | STORED);
    // Full content is tokenized for search but not stored; the stored
    // "preview" field serves display instead.
    builder.add_text_field("content", TEXT);
    builder.add_text_field("preview", STRING | STORED);
    builder.add_text_field("domain", STRING | STORED | FAST);
    builder.add_i64_field("indexed_at", INDEXED | STORED | FAST);
    builder.add_text_field("content_hash", STRING | STORED);
    builder.add_text_field("icon", STRING | STORED);
    builder.add_text_field("description", STRING | STORED);
    builder.build()
}
/// Derives a title for a page: the HTML <title>, else the first <h1>,
/// else the first non-blank line of the raw content, else "Untitled".
fn extract_title_from_content(content: &str) -> String {
    let document = scraper::Html::parse_document(content);

    // Try <title>, then <h1>; skip elements whose text is all whitespace.
    for css in ["title", "h1"] {
        let selector = scraper::Selector::parse(css).unwrap();
        if let Some(element) = document.select(&selector).next() {
            let joined = element.text().collect::<Vec<_>>().join(" ");
            let trimmed = joined.trim();
            if !trimmed.is_empty() {
                return trimmed.to_string();
            }
        }
    }

    // Last resort: first non-blank line of the raw content.
    content
        .lines()
        .find(|line| !line.trim().is_empty())
        .unwrap_or("Untitled")
        .trim()
        .to_string()
}
/// Returns the total size of all files under `path` (recursively), in MB.
///
/// Returns 0.0 when `path` is not a directory. I/O errors while walking the
/// tree are propagated.
fn calculate_directory_size(path: &Path) -> Result<f64> {
    // Sum in raw bytes and convert to MB exactly once at the end. The
    // previous version converted each subdirectory's MB value back to bytes
    // with `as u64`, truncating up to ~1 MB per nesting level.
    fn dir_size_bytes(path: &Path) -> std::io::Result<u64> {
        let mut total = 0u64;
        if path.is_dir() {
            for entry in std::fs::read_dir(path)? {
                let entry = entry?;
                let metadata = entry.metadata()?;
                if metadata.is_file() {
                    total += metadata.len();
                } else if metadata.is_dir() {
                    total += dir_size_bytes(&entry.path())?;
                }
            }
        }
        Ok(total)
    }

    Ok(dir_size_bytes(path)? as f64 / 1024.0 / 1024.0) // Convert to MB
}
/// Approximates the index's last update time from the mtime of its
/// `meta.json`; falls back to "now" when the file does not exist yet.
fn get_index_last_modified(path: &Path) -> Result<DateTime<Utc>> {
    let meta_path = path.join("meta.json");
    if !meta_path.exists() {
        return Ok(Utc::now());
    }
    let modified = std::fs::metadata(meta_path)?.modified()?;
    Ok(DateTime::<Utc>::from(modified))
}

124
search-engine/src/main.rs Normal file
View File

@@ -0,0 +1,124 @@
mod config;
mod indexer;
mod crawler;
mod scheduler;
mod server;
mod models;
use anyhow::Result;
use clap::{Parser, Subcommand};
use config::Config;
use tracing::info;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
// Top-level command-line interface.
// NOTE: plain `//` comments are used deliberately — `///` doc comments on
// clap-derive items would be emitted as --help text and change CLI output.
#[derive(Parser)]
#[command(name = "gurted-search-engine")]
#[command(about = "Crawl and index registered GURT domains")]
#[command(version = "0.1.0")]
struct Cli {
    // The action to run (see `Commands`).
    #[command(subcommand)]
    command: Commands,
    // Path to the TOML configuration file.
    #[arg(long, default_value = "config.toml")]
    config: String,
    // Repeatable verbosity flag: -v => debug, -vv (or more) => trace.
    #[arg(short, long, action = clap::ArgAction::Count)]
    verbose: u8,
}
// Available subcommands.
// NOTE: `//` comments on purpose — `///` would become clap help text.
#[derive(Subcommand)]
enum Commands {
    // Run the GURT server (starts the background crawl scheduler too).
    Server,
    // One-shot crawl of every registered domain.
    Crawl,
    // Delete the index and recreate it empty.
    RebuildIndex,
    // Run a query against the local index and print the hits.
    Search {
        query: String,
        // Maximum number of hits to print.
        #[arg(short, long, default_value = "10")]
        limit: usize,
    },
    // Print index statistics.
    Stats,
}
/// Entry point: parse CLI, set up logging, load configuration, dispatch.
#[tokio::main]
async fn main() -> Result<()> {
    let cli = Cli::parse();
    init_logging(&cli)?;

    // Propagate configuration errors through the Result chain instead of
    // panicking with unwrap(); a missing/invalid config file now produces a
    // readable error instead of a panic backtrace.
    let config = Config::load_from_file(&cli.config)?;

    info!("Starting Gurted Search Engine v{}", env!("CARGO_PKG_VERSION"));
    info!("Configuration loaded from: {}", cli.config);

    match cli.command {
        Commands::Server => {
            info!("Starting search engine server on {}", config.server_bind_address());
            server::run_server(config).await?;
        }
        Commands::Crawl => {
            info!("Starting one-time crawl of all registered domains");
            crawler::run_crawl_all(config).await?;
        }
        Commands::RebuildIndex => {
            info!("Rebuilding search index from scratch");
            indexer::rebuild_index(config).await?;
        }
        Commands::Search { query, limit } => {
            info!("Testing search with query: '{}'", query);
            test_search(config, query, limit).await?;
        }
        Commands::Stats => {
            info!("Displaying search index statistics");
            show_stats(config).await?;
        }
    }
    Ok(())
}
/// Initializes the global tracing subscriber.
///
/// Verbosity maps to a default filter (`-v` => debug, `-vv+` => trace) that
/// RUST_LOG, when set, overrides entirely.
fn init_logging(cli: &Cli) -> Result<()> {
    let level = match cli.verbose {
        0 => "info",
        1 => "debug",
        _ => "trace",
    };
    let filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| format!("gurted_search_engine={}", level).into());
    tracing_subscriber::registry()
        .with(filter)
        .with(tracing_subscriber::fmt::layer())
        .init();
    Ok(())
}
/// CLI helper: runs one query against the local index and prints the hits.
async fn test_search(config: Config, query: String, limit: usize) -> Result<()> {
    let engine = indexer::SearchEngine::new(config)?;
    let hits = engine.search(&query, limit).await?;

    println!("Search results for '{}' (showing {} results):",
        query, hits.len());
    for (position, hit) in hits.iter().enumerate() {
        println!("{}. {} - {}", position + 1, hit.title, hit.url);
        println!("   {}", hit.preview);
        println!();
    }
    Ok(())
}
/// CLI helper: prints index-level statistics to stdout.
async fn show_stats(config: Config) -> Result<()> {
    let engine = indexer::SearchEngine::new(config)?;
    let stats = engine.get_stats().await?;

    println!("Search Index Statistics:");
    println!("  Total documents: {}", stats.total_documents);
    println!("  Total domains: {}", stats.total_domains);
    println!("  Index size: {} MB", stats.index_size_mb);
    println!("  Last updated: {}", stats.last_updated);
    Ok(())
}

185
search-engine/src/models.rs Normal file
View File

@@ -0,0 +1,185 @@
use serde::{Deserialize, Serialize};
use sqlx::{FromRow, types::chrono::{DateTime, Utc}};
/// A registered domain row from the `domains` table.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct Domain {
    pub id: i32,
    // Name part of the domain ("name" in "name.tld").
    pub name: String,
    // Top-level-domain part ("tld" in "name.tld").
    pub tld: String,
    pub user_id: Option<i32>,
    // Registration status; the crawler only selects 'approved' domains.
    pub status: Option<String>,
    pub created_at: Option<DateTime<Utc>>,
}
impl Domain {
    /// Fully qualified domain name, e.g. "name.tld".
    pub fn full_domain(&self) -> String {
        format!("{}.{}", self.name, self.tld)
    }

    /// GURT protocol URL for this domain, e.g. "gurt://name.tld".
    pub fn gurt_url(&self) -> String {
        // Reuse full_domain() so the two representations cannot drift apart.
        format!("gurt://{}", self.full_domain())
    }
}
/// A DNS record row belonging to a `Domain`.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct DnsRecord {
    pub id: i32,
    // FK to `Domain::id`.
    pub domain_id: i32,
    // Record type stored as free-form text (e.g. "A", "TXT") — not
    // constrained by this type; NOTE(review): verify values at the source.
    pub record_type: String,
    pub name: String,
    pub value: String,
    pub ttl: Option<i32>,
    pub priority: Option<i32>,
    pub created_at: Option<DateTime<Utc>>,
}
/// One search hit, serialized to JSON for API responses.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResult {
    pub url: String,
    pub title: String,
    // Short plain-text excerpt of the page content.
    pub preview: String,
    pub domain: String,
    // Relevance score assigned by the full-text engine.
    pub score: f32,
    pub indexed_at: DateTime<Utc>,
    // Optional page icon; None when the page provided none.
    pub icon: Option<String>,
    pub description: Option<String>,
}
/// Paginated search API response envelope.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResponse {
    pub query: String,
    pub results: Vec<SearchResult>,
    // NOTE(review): currently populated with the total document count of
    // the whole index, not the number of documents matching `query`.
    pub total_results: usize,
    // 1-based page number.
    pub page: usize,
    pub per_page: usize,
}
/// Aggregate statistics about the on-disk search index.
#[derive(Clone, Debug, Serialize)]
pub struct IndexStats {
    pub total_documents: usize,
    // NOTE(review): currently always 0 — domain counting is a TODO in the
    // indexer's get_stats.
    pub total_domains: usize,
    pub index_size_mb: f64,
    // Approximated from the index metadata file's mtime.
    pub last_updated: DateTime<Utc>,
}
/// A page fetched by the crawler, ready to be indexed.
#[derive(Clone, Debug)]
pub struct CrawledPage {
    pub url: String,
    pub domain: String,
    // Title extracted from the page, if any.
    pub title: Option<String>,
    // Page content used for full-text indexing and preview generation.
    pub content: String,
    // Hash of the content; the indexer compares it against the stored hash
    // to skip re-indexing unchanged pages.
    pub content_hash: String,
    pub indexed_at: DateTime<Utc>,
    pub icon: Option<String>,
    pub description: Option<String>,
}
impl CrawledPage {
    /// Returns a plain-text preview of the page content: the trimmed content
    /// itself when it fits in `max_len` characters, otherwise the first
    /// `max_len` characters cut back to the last space and suffixed "...".
    pub fn generate_preview(&self, max_len: usize) -> String {
        let text = self.content.trim();
        // Compare in characters, not bytes: `str::len()` counts UTF-8 bytes,
        // so multibyte text that fits within `max_len` characters could
        // previously be truncated (and suffixed "...") by mistake.
        if text.chars().count() <= max_len {
            text.to_string()
        } else {
            let mut preview = text.chars().take(max_len).collect::<String>();
            // Prefer not to cut a word in half; ' ' is ASCII so the byte
            // index from rfind is always a valid char boundary.
            if let Some(last_space) = preview.rfind(' ') {
                preview.truncate(last_space);
            }
            preview.push_str("...");
            preview
        }
    }
}
/// Data-access layer for domain and crawl-status queries.
#[derive(Clone)]
pub struct DomainRepository {
    // Postgres connection pool handle.
    pool: sqlx::PgPool,
}
impl DomainRepository {
    /// Base query selecting approved domains that are due for crawling:
    /// never crawled, or in a terminal/pending state whose next_crawl_at has
    /// passed — least-recently-crawled first. Shared by both branches of
    /// `get_domains_for_crawling` so the two cannot drift apart.
    const DUE_DOMAINS_SQL: &'static str =
        "SELECT d.id, d.name, d.tld, d.user_id, d.status, d.created_at
         FROM domains d
         LEFT JOIN domain_crawl_status dcs ON d.id = dcs.domain_id
         WHERE d.status = 'approved'
           AND (dcs.crawl_status IS NULL
                OR (dcs.crawl_status = 'completed' AND dcs.next_crawl_at <= NOW())
                OR (dcs.crawl_status = 'failed' AND dcs.next_crawl_at <= NOW())
                OR (dcs.crawl_status = 'pending' AND dcs.next_crawl_at <= NOW()))
         ORDER BY COALESCE(dcs.last_crawled_at, '1970-01-01'::timestamptz) ASC";

    pub fn new(pool: sqlx::PgPool) -> Self {
        Self { pool }
    }

    /// Fetches domains eligible for crawling, optionally capped at `limit`.
    pub async fn get_domains_for_crawling(&self, limit: Option<i32>) -> Result<Vec<Domain>, sqlx::Error> {
        match limit {
            Some(limit) => {
                let sql = format!("{} LIMIT $1", Self::DUE_DOMAINS_SQL);
                sqlx::query_as::<_, Domain>(&sql)
                    .bind(limit)
                    .fetch_all(&self.pool)
                    .await
            }
            None => {
                sqlx::query_as::<_, Domain>(Self::DUE_DOMAINS_SQL)
                    .fetch_all(&self.pool)
                    .await
            }
        }
    }

    /// Upserts the crawl-status row for `domain_id`.
    ///
    /// Semantics encoded in the SQL: `last_crawled_at` is only stamped for
    /// terminal states ('completed'/'failed'); `pages_found` is only
    /// overwritten on success so a failed run keeps the last known count;
    /// `next_crawl_at` keeps its existing value when `next_crawl_hours` is
    /// None.
    pub async fn update_crawl_status(
        &self,
        domain_id: i32,
        status: &str,
        error_message: Option<&str>,
        pages_found: Option<i32>,
        next_crawl_hours: Option<i64>
    ) -> Result<(), sqlx::Error> {
        let next_crawl_at = next_crawl_hours
            .map(|hours| chrono::Utc::now() + chrono::Duration::hours(hours));
        sqlx::query(
            "INSERT INTO domain_crawl_status (domain_id, crawl_status, error_message, pages_found, last_crawled_at, next_crawl_at, updated_at)
             VALUES ($1, $2, $3, $4,
                     CASE WHEN $2 IN ('completed','failed') THEN NOW() ELSE NULL END,
                     $5, NOW())
             ON CONFLICT (domain_id)
             DO UPDATE SET
                 crawl_status = $2,
                 error_message = $3,
                 pages_found = CASE WHEN $2 = 'completed'
                                    THEN COALESCE($4, domain_crawl_status.pages_found)
                                    ELSE domain_crawl_status.pages_found END,
                 last_crawled_at = CASE WHEN $2 IN ('completed','failed')
                                        THEN NOW()
                                        ELSE domain_crawl_status.last_crawled_at END,
                 next_crawl_at = COALESCE($5, domain_crawl_status.next_crawl_at),
                 updated_at = NOW()"
        )
        .bind(domain_id)
        .bind(status)
        .bind(error_message)
        .bind(pages_found)
        .bind(next_crawl_at)
        .execute(&self.pool)
        .await?;
        Ok(())
    }
}
/// Row of the `domain_crawl_status` table tracking per-domain crawl state.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct CrawlStatus {
    pub domain_id: i32,
    pub last_crawled_at: Option<DateTime<Utc>>,
    // Earliest time this domain becomes eligible for the next crawl.
    pub next_crawl_at: Option<DateTime<Utc>>,
    // One of 'pending' | 'completed' | 'failed' (see repository queries).
    pub crawl_status: Option<String>,
    pub error_message: Option<String>,
    pub pages_found: Option<i32>,
    pub updated_at: Option<DateTime<Utc>>,
}

View File

@@ -0,0 +1,152 @@
use anyhow::{Result, Context};
use tokio::time::{interval, Instant};
use tracing::{info, error};
use crate::config::Config;
use crate::crawler::run_crawl_all;
use crate::indexer::SearchEngine;
/// Drives periodic crawls and index rebuilds at the configured intervals.
pub struct CrawlScheduler {
    config: Config,
}
impl CrawlScheduler {
pub fn new(config: Config) -> Self {
Self { config }
}
pub async fn start(&self) -> Result<()> {
info!("Starting crawl scheduler");
info!(
"Crawl interval: {} hours ({} seconds)",
self.config.search.crawl_interval_hours,
self.config.crawl_interval().as_secs()
);
info!(
"Index rebuild interval: {} hours ({} seconds)",
self.config.search.index_rebuild_interval_hours,
self.config.index_rebuild_interval().as_secs()
);
let mut crawl_interval = interval(self.config.crawl_interval());
let mut index_rebuild_interval = interval(self.config.index_rebuild_interval());
crawl_interval.tick().await;
index_rebuild_interval.tick().await;
info!("Running initial crawl...");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Initial crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
loop {
tokio::select! {
_ = crawl_interval.tick() => {
info!("Running scheduled crawl");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Scheduled crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
}
_ = index_rebuild_interval.tick() => {
info!("Running scheduled index rebuild");
if let Err(e) = self.run_scheduled_index_rebuild().await {
error!("Scheduled index rebuild failed: {}", e);
}
}
_ = tokio::signal::ctrl_c() => {
info!("Received shutdown signal, stopping scheduler");
break;
}
}
}
info!("Crawl scheduler stopped");
Ok(())
}
async fn run_scheduled_crawl(&self) -> Result<()> {
let start_time = Instant::now();
run_crawl_all(self.config.clone()).await
.context("Scheduled crawl failed")?;
let duration = start_time.elapsed();
info!("Scheduled crawl completed in {:.2} seconds", duration.as_secs_f64());
Ok(())
}
async fn run_scheduled_index_rebuild(&self) -> Result<()> {
let start_time = Instant::now();
let search_engine = SearchEngine::new(self.config.clone())?;
let stats_before = search_engine.get_stats().await?;
info!(
"Starting index rebuild - current index has {} documents from {} domains ({:.2} MB)",
stats_before.total_documents,
stats_before.total_domains,
stats_before.index_size_mb
);
crate::indexer::rebuild_index(self.config.clone()).await
.context("Index rebuild failed")?;
info!("Repopulating rebuilt index with fresh crawl");
run_crawl_all(self.config.clone()).await
.context("Post-rebuild crawl failed")?;
let duration = start_time.elapsed();
let new_search_engine = SearchEngine::new(self.config.clone())?;
let stats_after = new_search_engine.get_stats().await?;
info!(
"Index rebuild completed in {:.2} seconds - new index has {} documents from {} domains ({:.2} MB)",
duration.as_secs_f64(),
stats_after.total_documents,
stats_after.total_domains,
stats_after.index_size_mb
);
Ok(())
}
}
/// Runs a `CrawlScheduler` on a detached tokio task.
pub struct BackgroundScheduler {
    scheduler: CrawlScheduler,
}
impl BackgroundScheduler {
pub fn new(config: Config) -> Self {
Self {
scheduler: CrawlScheduler::new(config),
}
}
pub fn start(self) -> tokio::task::JoinHandle<Result<()>> {
tokio::spawn(async move {
self.scheduler.start().await
})
}
}

195
search-engine/src/server.rs Normal file
View File

@@ -0,0 +1,195 @@
use anyhow::{Result, Context};
use gurt::prelude::*;
use gurt::GurtError;
use serde_json::json;
use std::sync::Arc;
use tracing::{info, error};
use crate::config::Config;
use crate::indexer::SearchEngine;
use crate::scheduler::BackgroundScheduler;
/// GURT protocol server exposing the search frontend and JSON API.
pub struct SearchServer {
    config: Config,
    // Shared with every request-handler closure.
    search_engine: Arc<SearchEngine>,
}
impl SearchServer {
    /// Builds the server: verifies database connectivity and constructs the
    /// shared search engine.
    pub async fn new(config: Config) -> Result<Self> {
        // Connect to database
        // NOTE(review): the pool created here is dropped immediately — this
        // only checks connectivity at startup; no pool is retained for later
        // queries. Confirm this is intentional.
        sqlx::PgPool::connect(&config.database_url()).await
            .context("Failed to connect to database")?;
        let search_engine = Arc::new(SearchEngine::new(config.clone())?);
        Ok(Self {
            config,
            search_engine,
        })
    }

    /// Starts the background crawl scheduler, registers all routes, and
    /// serves GURT requests over TLS until the listener stops.
    pub async fn run(self) -> Result<()> {
        info!("Starting GURT search server on {}", self.config.server_bind_address());
        let scheduler_config = self.config.clone();
        // Handle is intentionally detached: the scheduler runs for the
        // lifetime of the process.
        let _scheduler_handle = BackgroundScheduler::new(scheduler_config).start();
        info!("Background crawler scheduler started");
        let server = GurtServer::with_tls_certificates(
            &self.config.server.cert_path.to_string_lossy(),
            &self.config.server.key_path.to_string_lossy()
        )?;
        // Each route closure clones its own Arc/config so the handlers are
        // self-contained and independent of `self`'s lifetime.
        let search_engine = self.search_engine.clone();
        let config = self.config.clone();
        let server = server
            // Simple (non-paginated) search endpoint.
            .get("/search", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();
                    let path = ctx.path().to_string();
                    async move {
                        handle_search(path, search_engine, config).await
                    }
                }
            })
            // Paginated JSON API endpoint.
            .get("/api/search*", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();
                    let path = ctx.path().to_string();
                    async move {
                        handle_api_search(path, search_engine, config).await
                    }
                }
            })
            // Frontend assets are embedded into the binary at compile time.
            .get("/", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.html")))
                }
            })
            .get("/search.lua", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.lua")))
                }
            })
            .get("/health", |_ctx| async {
                Ok(GurtResponse::ok().with_json_body(&json!({"status": "healthy"}))?)
            })
            // Debug route echoing the received path.
            .get("/test*", |ctx| {
                let path = ctx.path().to_string();
                async move {
                    println!("Test request path: '{}'", path);
                    Ok(GurtResponse::ok().with_string_body(format!("Path received: {}", path)))
                }
            });
        info!("GURT search server listening on {}", self.config.gurt_protocol_url());
        server.listen(&self.config.server_bind_address()).await.map_err(|e| anyhow::anyhow!("GURT server error: {}", e))
    }
}
/// Convenience entry point: construct a `SearchServer` and run it.
pub async fn run_server(config: Config) -> Result<()> {
    SearchServer::new(config).await?.run().await
}
fn parse_query_param(path: &str, param: &str) -> String {
let param_with_eq = format!("{}=", param);
if let Some(start) = path.find(&format!("?{}", param_with_eq)) {
let start_pos = start + 1 + param_with_eq.len(); // Skip the '?' and 'param='
let query_part = &path[start_pos..];
let end_pos = query_part.find('&').unwrap_or(query_part.len());
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
} else if let Some(start) = path.find(&format!("&{}", param_with_eq)) {
let start_pos = start + 1 + param_with_eq.len(); // Skip the '&' and 'param='
let query_part = &path[start_pos..];
let end_pos = query_part.find('&').unwrap_or(query_part.len());
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
} else {
String::new()
}
}
/// Like `parse_query_param`, but parses the value as a `usize`; returns
/// None when the parameter is absent or not a valid number.
fn parse_query_param_usize(path: &str, param: &str) -> Option<usize> {
    let raw = parse_query_param(path, param);
    if raw.is_empty() {
        return None;
    }
    raw.parse().ok()
}
async fn handle_search(
path: String,
search_engine: Arc<SearchEngine>,
config: Config
) -> Result<GurtResponse, GurtError> {
let query = parse_query_param(&path, "q");
if query.is_empty() {
return Ok(GurtResponse::bad_request()
.with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
}
println!("Search query: '{}'", query);
let limit = parse_query_param_usize(&path, "limit")
.unwrap_or(config.search.search_results_per_page)
.min(config.search.max_search_results);
match search_engine.search(&query, limit).await {
Ok(results) => {
let response = json!({
"query": query,
"results": results,
"count": results.len()
});
Ok(GurtResponse::ok()
.with_header("content-type", "application/json")
.with_json_body(&response)?)
}
Err(e) => {
error!("Search failed: {}", e);
Ok(GurtResponse::internal_server_error()
.with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
}
}
}
/// Handles GET /api/search?q=...&page=...&per_page=... — the paginated
/// JSON search API.
async fn handle_api_search(
    path: String,
    search_engine: Arc<SearchEngine>,
    config: Config
) -> Result<GurtResponse, GurtError> {
    let query = parse_query_param(&path, "q");
    if query.is_empty() {
        return Ok(GurtResponse::bad_request()
            .with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
    }

    // Page numbers are 1-based; per_page is capped by configuration.
    let page = parse_query_param_usize(&path, "page").unwrap_or(1).max(1);
    let per_page = parse_query_param_usize(&path, "per_page")
        .unwrap_or(config.search.search_results_per_page)
        .min(config.search.max_search_results);

    let outcome = search_engine.search_with_response(&query, page, per_page).await;
    match outcome {
        Ok(body) => Ok(GurtResponse::ok()
            .with_header("content-type", "application/json")
            .with_json_body(&body)?),
        Err(e) => {
            error!("API search failed: {}", e);
            Ok(GurtResponse::internal_server_error()
                .with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
        }
    }
}