This commit is contained in:
Face
2025-09-25 21:56:56 +03:00
4 changed files with 75 additions and 22 deletions

View File

@@ -14,15 +14,15 @@ crate-type = ["cdylib"]
[dependencies] [dependencies]
gurtlib = { path = "../library" } gurtlib = { path = "../library" }
godot = "0.1" godot = { version = "0.1", features = ["experimental-threads"] }
tokio = { version = "1.0", features = ["rt"] } tokio = { version = "1.0", features = ["rt"] }
url = "2.5" url = "2.5"
serde_json = "1.0" serde_json = "1.0"
[profile.release] [profile.release]
opt-level = "z" opt-level = 3
lto = true lto = true
codegen-units = 1 codegen-units = 1
strip = true
panic = "abort" panic = "abort"
strip = true

View File

@@ -33,7 +33,7 @@ while [[ $# -gt 0 ]]; do
echo "" echo ""
echo "Options:" echo "Options:"
echo " -t, --target TARGET Build target (debug|release) [default: release]" echo " -t, --target TARGET Build target (debug|release) [default: release]"
echo " -p, --platform PLATFORM Target platform (windows|linux|macos|current)" echo " -p, --platform PLATFORM Target platform (windows|linux|macos|macos-intel|current)"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
exit 0 exit 0
@@ -82,6 +82,10 @@ case $PLATFORM in
LIB_NAME="libgurt_godot.so" LIB_NAME="libgurt_godot.so"
;; ;;
macos) macos)
RUST_TARGET="aarch64-apple-darwin"
LIB_NAME="libgurt_godot.dylib"
;;
macos-intel)
RUST_TARGET="x86_64-apple-darwin" RUST_TARGET="x86_64-apple-darwin"
LIB_NAME="libgurt_godot.dylib" LIB_NAME="libgurt_godot.dylib"
;; ;;

View File

@@ -158,7 +158,14 @@ impl DomainCrawler {
} }
fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> { fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
let user_agent = &self.config.search.crawler_user_agent; Self::parse_clanker_rules(
content,
base_url,
&self.config.search.crawler_user_agent,
)
}
fn parse_clanker_rules(content: &str, base_url: &str, user_agent: &str) -> Result<Vec<String>> {
let mut disallow_all = false; let mut disallow_all = false;
let mut user_agent_matches = false; let mut user_agent_matches = false;
let mut allowed_urls = Vec::new(); let mut allowed_urls = Vec::new();
@@ -169,26 +176,31 @@ impl DomainCrawler {
continue; continue;
} }
if let Some(user_agent_value) = line.to_lowercase().strip_prefix("user-agent:") { let (directive, value) = match line.split_once(':') {
let current_user_agent = user_agent_value.trim().to_string(); Some((directive, value)) => (directive.trim().to_lowercase(), value.trim()),
user_agent_matches = current_user_agent == "*" || current_user_agent.eq_ignore_ascii_case(user_agent); None => continue,
};
if directive == "user-agent" {
user_agent_matches =
value == "*" || value.eq_ignore_ascii_case(user_agent);
continue; continue;
} }
if user_agent_matches { if !user_agent_matches {
if let Some(path_value) = line.to_lowercase().strip_prefix("disallow:") { continue;
let path = path_value.trim(); }
if path == "/" {
disallow_all = true; if directive == "disallow" {
break; if value == "/" {
} disallow_all = true;
} else if let Some(path_value) = line.to_lowercase().strip_prefix("allow:") { break;
let path = path_value.trim(); }
if !path.is_empty() { } else if directive == "allow" {
let full_url = Self::normalize_url(format!("{}{}", base_url, path)); if !value.is_empty() {
debug!("Added allowed URL from clanker.txt: {}", full_url); let full_url = Self::normalize_url(format!("{}{}", base_url, value));
allowed_urls.push(full_url); debug!("Added allowed URL from clanker.txt: {}", full_url);
} allowed_urls.push(full_url);
} }
} }
} }
@@ -719,4 +731,41 @@ impl CrawlStats {
duration_seconds: 0, duration_seconds: 0,
} }
} }
}
#[cfg(test)]
mod tests {
use super::DomainCrawler;
/// An `Allow` path with mixed-case characters is expected to come back
/// appended to the base URL with its original casing intact.
#[test]
fn parse_clanker_rules_preserves_case_in_allowed_urls() {
    let clanker_txt = "User-agent: TestBot\nAllow: /getpage?l=Fri,12Sep2025000605_ZzesV.txt\n";
    let expected = vec!["gurt://wi.ki/getpage?l=Fri,12Sep2025000605_ZzesV.txt".to_string()];

    let allowed = DomainCrawler::parse_clanker_rules(clanker_txt, "gurt://wi.ki", "TestBot")
        .expect("expected allow list");

    assert_eq!(allowed, expected);
}
/// Directive names written in arbitrary casing (`user-Agent`, `AlLoW`) are
/// expected to be recognized, while the path value keeps its own casing.
#[test]
fn parse_clanker_rules_handles_case_insensitive_directives() {
    let clanker_txt = "user-Agent: AnotherBot\nAlLoW: /MiXeD/Path.HTML\n";
    let expected = vec!["gurt://example/MiXeD/Path.HTML".to_string()];

    let allowed = DomainCrawler::parse_clanker_rules(clanker_txt, "gurt://example", "AnotherBot")
        .expect("expected allow list");

    assert_eq!(allowed, expected);
}
/// A `Disallow: /` rule under the matching user agent is expected to make
/// parsing return an `Err` rather than an allow list.
#[test]
fn parse_clanker_rules_respects_disallow_all() {
    let clanker_txt = "User-agent: Bot\nDisallow: /\n";

    let outcome = DomainCrawler::parse_clanker_rules(clanker_txt, "gurt://example", "Bot");

    assert!(outcome.is_err());
}
} }