Files
leonwww/search-engine/src/scheduler.rs
2025-08-27 20:23:05 +03:00

153 lines
4.8 KiB
Rust

use anyhow::{Result, Context};
use tokio::time::{interval, Instant};
use tracing::{info, error};
use crate::config::Config;
use crate::crawler::run_crawl_all;
use crate::indexer::SearchEngine;
/// Scheduler that periodically runs crawls and index rebuilds.
///
/// The periods are taken from the stored [`Config`] via `crawl_interval()` and
/// `index_rebuild_interval()`.
pub struct CrawlScheduler {
/// Configuration read for the intervals and cloned into each crawl / rebuild run.
config: Config,
}
impl CrawlScheduler {
pub fn new(config: Config) -> Self {
Self { config }
}
pub async fn start(&self) -> Result<()> {
info!("Starting crawl scheduler");
info!(
"Crawl interval: {} hours ({} seconds)",
self.config.search.crawl_interval_hours,
self.config.crawl_interval().as_secs()
);
info!(
"Index rebuild interval: {} hours ({} seconds)",
self.config.search.index_rebuild_interval_hours,
self.config.index_rebuild_interval().as_secs()
);
let mut crawl_interval = interval(self.config.crawl_interval());
let mut index_rebuild_interval = interval(self.config.index_rebuild_interval());
crawl_interval.tick().await;
index_rebuild_interval.tick().await;
info!("Running initial crawl...");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Initial crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
loop {
tokio::select! {
_ = crawl_interval.tick() => {
info!("Running scheduled crawl");
if let Err(e) = self.run_scheduled_crawl().await {
error!("Scheduled crawl failed: {}", e);
error!("Error details: {:?}", e);
// Log the error chain
let mut source = e.source();
let mut depth = 1;
while let Some(err) = source {
error!(" Caused by ({}): {}", depth, err);
source = err.source();
depth += 1;
}
}
}
_ = index_rebuild_interval.tick() => {
info!("Running scheduled index rebuild");
if let Err(e) = self.run_scheduled_index_rebuild().await {
error!("Scheduled index rebuild failed: {}", e);
}
}
_ = tokio::signal::ctrl_c() => {
info!("Received shutdown signal, stopping scheduler");
break;
}
}
}
info!("Crawl scheduler stopped");
Ok(())
}
async fn run_scheduled_crawl(&self) -> Result<()> {
let start_time = Instant::now();
run_crawl_all(self.config.clone()).await
.context("Scheduled crawl failed")?;
let duration = start_time.elapsed();
info!("Scheduled crawl completed in {:.2} seconds", duration.as_secs_f64());
Ok(())
}
async fn run_scheduled_index_rebuild(&self) -> Result<()> {
let start_time = Instant::now();
let search_engine = SearchEngine::new(self.config.clone())?;
let stats_before = search_engine.get_stats().await?;
info!(
"Starting index rebuild - current index has {} documents from {} domains ({:.2} MB)",
stats_before.total_documents,
stats_before.total_domains,
stats_before.index_size_mb
);
crate::indexer::rebuild_index(self.config.clone()).await
.context("Index rebuild failed")?;
info!("Repopulating rebuilt index with fresh crawl");
run_crawl_all(self.config.clone()).await
.context("Post-rebuild crawl failed")?;
let duration = start_time.elapsed();
let new_search_engine = SearchEngine::new(self.config.clone())?;
let stats_after = new_search_engine.get_stats().await?;
info!(
"Index rebuild completed in {:.2} seconds - new index has {} documents from {} domains ({:.2} MB)",
duration.as_secs_f64(),
stats_after.total_documents,
stats_after.total_domains,
stats_after.index_size_mb
);
Ok(())
}
}
/// Wrapper that runs a [`CrawlScheduler`] on a spawned tokio task.
pub struct BackgroundScheduler {
/// The scheduler whose `start` loop is driven by the spawned task.
scheduler: CrawlScheduler,
}
impl BackgroundScheduler {
    /// Builds a background scheduler around a new [`CrawlScheduler`] for `config`.
    pub fn new(config: Config) -> Self {
        let scheduler = CrawlScheduler::new(config);
        Self { scheduler }
    }

    /// Consumes the wrapper and spawns the scheduler loop on the tokio runtime.
    ///
    /// The returned handle resolves to whatever `CrawlScheduler::start` returns.
    pub fn start(self) -> tokio::task::JoinHandle<Result<()>> {
        let Self { scheduler } = self;
        tokio::spawn(async move { scheduler.start().await })
    }
}