fix text-related tag events, remove warnings, make the search engine fetch / instead of fetching /index.html as a separate page

Author: Face
Date: 2025-08-27 20:56:29 +03:00
parent 347b40ed71
commit 5769982bbc
15 changed files with 123 additions and 71 deletions
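In effect, gurt://example.dev/ and gurt://example.dev/index.html were previously distinct keys in visited_urls, so the crawler could fetch and index the same root page twice. Normalizing every URL before it reaches the queue or the visited set collapses the two spellings into one. A minimal, self-contained sketch of that idea (the normalize body mirrors the new normalize_url from the last hunk below; the domain and the set here are illustrative, not the crawler's actual state):

use std::collections::HashSet;

// Mirror of the commit's normalize_url: fold a trailing "/index.html"
// into a bare trailing "/" so both spellings compare equal.
fn normalize(url: String) -> String {
    if url.ends_with("/index.html") {
        let without_index = &url[..url.len() - "/index.html".len()];
        if without_index.ends_with('/') {
            without_index.to_string()
        } else {
            format!("{}/", without_index)
        }
    } else {
        url
    }
}

fn main() {
    let mut visited: HashSet<String> = HashSet::new();
    // First spelling of the root page is new...
    assert!(visited.insert(normalize("gurt://example.dev/".to_string())));
    // ...and the /index.html spelling now dedupes against it, so no second fetch.
    assert!(!visited.insert(normalize("gurt://example.dev/index.html".to_string())));
    println!("root page queued once, not twice");
}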


@@ -185,7 +185,7 @@ impl DomainCrawler {
             } else if let Some(path_value) = line.to_lowercase().strip_prefix("allow:") {
                 let path = path_value.trim();
                 if !path.is_empty() {
-                    let full_url = format!("{}{}", base_url, path);
+                    let full_url = Self::normalize_url(format!("{}{}", base_url, path));
                     debug!("Added allowed URL from clanker.txt: {}", full_url);
                     allowed_urls.push(full_url);
                 }
@@ -222,19 +222,21 @@ impl DomainCrawler {
         }

         // Start with the root URL
+        let normalized_base_url = Self::normalize_url(base_url.clone());
         queue.push_back(CrawlItem {
-            url: base_url.clone(),
+            url: normalized_base_url,
             depth: 0,
         });

         // Add all URLs from clanker.txt to the queue
         for url in clanker_urls {
-            if !visited_urls.contains(&url) {
+            let normalized_url = Self::normalize_url(url);
+            if !visited_urls.contains(&normalized_url) {
                 queue.push_back(CrawlItem {
-                    url: url.clone(),
+                    url: normalized_url.clone(),
                     depth: 0, // Treat clanker.txt URLs as root level
                 });
-                debug!("Added clanker.txt URL to queue: {}", url);
+                debug!("Added clanker.txt URL to queue: {}", normalized_url);
             }
         }

@@ -268,10 +270,11 @@ impl DomainCrawler {
             if let Ok(links) = self.extract_links(&page_with_html.original_html, &base_url).await {
                 debug!("Found {} links on {}", links.len(), item.url);
                 for link in links {
-                    if self.should_crawl_url(&link, domain) {
-                        debug!("Adding link to crawl queue: {}", link);
+                    let normalized_link = Self::normalize_url(link);
+                    if self.should_crawl_url(&normalized_link, domain) && !visited_urls.contains(&normalized_link) {
+                        debug!("Adding link to crawl queue: {}", normalized_link);
                         queue.push_back(CrawlItem {
-                            url: link,
+                            url: normalized_link,
                             depth: item.depth + 1,
                         });
                     }
@@ -358,7 +361,7 @@ impl DomainCrawler {

         let page = CrawledPageWithHtml {
             crawled_page: CrawledPage {
-                url: url.to_string(),
+                url: Self::normalize_url(url.to_string()),
                 domain: domain.full_domain(),
                 title,
                 content: cleaned_content.clone(),
@@ -398,7 +401,7 @@ impl DomainCrawler {
                 // Resolve relative URLs
                 match base.join(href) {
                     Ok(absolute_url) => {
-                        let url_str = absolute_url.to_string();
+                        let url_str = Self::normalize_url(absolute_url.to_string());

                         // Only include GURT protocol URLs for the same domain
                         if url_str.starts_with("gurt://") {
@@ -601,6 +604,19 @@ impl DomainCrawler {
         false
     }

+    fn normalize_url(url: String) -> String {
+        if url.ends_with("/index.html") {
+            let without_index = &url[..url.len() - 11]; // Remove "/index.html" (11 chars)
+            if without_index.ends_with('/') {
+                without_index.to_string()
+            } else {
+                format!("{}/", without_index)
+            }
+        } else {
+            url
+        }
+    }
+
     fn calculate_content_hash(content: &str) -> String {
         use sha2::{Sha256, Digest};
         let mut hasher = Sha256::new();
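A quick check of the new helper's edge cases, since the trailing slash is what makes the two spellings compare equal in visited_urls. A test sketch, assuming it sits in the same module as DomainCrawler so the private normalize_url is in scope (the sample URLs are illustrative):

#[cfg(test)]
mod normalize_url_tests {
    use super::*;

    #[test]
    fn index_html_folds_into_directory_root() {
        // Root and nested index pages both collapse to their directory URL.
        assert_eq!(
            DomainCrawler::normalize_url("gurt://example.dev/index.html".to_string()),
            "gurt://example.dev/"
        );
        assert_eq!(
            DomainCrawler::normalize_url("gurt://example.dev/docs/index.html".to_string()),
            "gurt://example.dev/docs/"
        );
    }

    #[test]
    fn other_urls_pass_through_unchanged() {
        // Only the exact "/index.html" suffix is rewritten; lookalikes survive.
        assert_eq!(
            DomainCrawler::normalize_url("gurt://example.dev/my-index.html".to_string()),
            "gurt://example.dev/my-index.html"
        );
    }
}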