153 lines
4.8 KiB
Rust
153 lines
4.8 KiB
Rust
use anyhow::{Result, Context};
|
|
use tokio::time::{interval, Instant};
|
|
use tracing::{info, error};
|
|
|
|
use crate::config::Config;
|
|
use crate::crawler::run_crawl_all;
|
|
use crate::indexer::SearchEngine;
|
|
|
|
pub struct CrawlScheduler {
|
|
config: Config,
|
|
}
|
|
|
|
impl CrawlScheduler {
|
|
pub fn new(config: Config) -> Self {
|
|
Self { config }
|
|
}
|
|
|
|
pub async fn start(&self) -> Result<()> {
|
|
info!("Starting crawl scheduler");
|
|
info!(
|
|
"Crawl interval: {} hours ({} seconds)",
|
|
self.config.search.crawl_interval_hours,
|
|
self.config.crawl_interval().as_secs()
|
|
);
|
|
info!(
|
|
"Index rebuild interval: {} hours ({} seconds)",
|
|
self.config.search.index_rebuild_interval_hours,
|
|
self.config.index_rebuild_interval().as_secs()
|
|
);
|
|
|
|
let mut crawl_interval = interval(self.config.crawl_interval());
|
|
let mut index_rebuild_interval = interval(self.config.index_rebuild_interval());
|
|
|
|
crawl_interval.tick().await;
|
|
index_rebuild_interval.tick().await;
|
|
|
|
info!("Running initial crawl...");
|
|
if let Err(e) = self.run_scheduled_crawl().await {
|
|
error!("Initial crawl failed: {}", e);
|
|
error!("Error details: {:?}", e);
|
|
|
|
// Log the error chain
|
|
let mut source = e.source();
|
|
let mut depth = 1;
|
|
while let Some(err) = source {
|
|
error!(" Caused by ({}): {}", depth, err);
|
|
source = err.source();
|
|
depth += 1;
|
|
}
|
|
}
|
|
|
|
loop {
|
|
tokio::select! {
|
|
_ = crawl_interval.tick() => {
|
|
info!("Running scheduled crawl");
|
|
if let Err(e) = self.run_scheduled_crawl().await {
|
|
error!("Scheduled crawl failed: {}", e);
|
|
error!("Error details: {:?}", e);
|
|
|
|
// Log the error chain
|
|
let mut source = e.source();
|
|
let mut depth = 1;
|
|
while let Some(err) = source {
|
|
error!(" Caused by ({}): {}", depth, err);
|
|
source = err.source();
|
|
depth += 1;
|
|
}
|
|
}
|
|
}
|
|
_ = index_rebuild_interval.tick() => {
|
|
info!("Running scheduled index rebuild");
|
|
if let Err(e) = self.run_scheduled_index_rebuild().await {
|
|
error!("Scheduled index rebuild failed: {}", e);
|
|
}
|
|
}
|
|
_ = tokio::signal::ctrl_c() => {
|
|
info!("Received shutdown signal, stopping scheduler");
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
info!("Crawl scheduler stopped");
|
|
Ok(())
|
|
}
|
|
|
|
async fn run_scheduled_crawl(&self) -> Result<()> {
|
|
let start_time = Instant::now();
|
|
|
|
run_crawl_all(self.config.clone()).await
|
|
.context("Scheduled crawl failed")?;
|
|
|
|
let duration = start_time.elapsed();
|
|
info!("Scheduled crawl completed in {:.2} seconds", duration.as_secs_f64());
|
|
|
|
Ok(())
|
|
}
|
|
|
|
async fn run_scheduled_index_rebuild(&self) -> Result<()> {
|
|
let start_time = Instant::now();
|
|
|
|
let search_engine = SearchEngine::new(self.config.clone())?;
|
|
let stats_before = search_engine.get_stats().await?;
|
|
|
|
info!(
|
|
"Starting index rebuild - current index has {} documents from {} domains ({:.2} MB)",
|
|
stats_before.total_documents,
|
|
stats_before.total_domains,
|
|
stats_before.index_size_mb
|
|
);
|
|
|
|
crate::indexer::rebuild_index(self.config.clone()).await
|
|
.context("Index rebuild failed")?;
|
|
|
|
info!("Repopulating rebuilt index with fresh crawl");
|
|
run_crawl_all(self.config.clone()).await
|
|
.context("Post-rebuild crawl failed")?;
|
|
|
|
let duration = start_time.elapsed();
|
|
|
|
let new_search_engine = SearchEngine::new(self.config.clone())?;
|
|
let stats_after = new_search_engine.get_stats().await?;
|
|
|
|
info!(
|
|
"Index rebuild completed in {:.2} seconds - new index has {} documents from {} domains ({:.2} MB)",
|
|
duration.as_secs_f64(),
|
|
stats_after.total_documents,
|
|
stats_after.total_domains,
|
|
stats_after.index_size_mb
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
pub struct BackgroundScheduler {
|
|
scheduler: CrawlScheduler,
|
|
}
|
|
|
|
impl BackgroundScheduler {
|
|
pub fn new(config: Config) -> Self {
|
|
Self {
|
|
scheduler: CrawlScheduler::new(config),
|
|
}
|
|
}
|
|
|
|
pub fn start(self) -> tokio::task::JoinHandle<Result<()>> {
|
|
tokio::spawn(async move {
|
|
self.scheduler.start().await
|
|
})
|
|
}
|
|
}
|