Merge branch 'main' of https://github.com/outpoot/gurted
This commit is contained in:
Binary file not shown.
@@ -14,15 +14,15 @@ crate-type = ["cdylib"]
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
gurtlib = { path = "../library" }
|
gurtlib = { path = "../library" }
|
||||||
|
|
||||||
godot = "0.1"
|
godot = { version = "0.1", features = ["experimental-threads"] }
|
||||||
|
|
||||||
tokio = { version = "1.0", features = ["rt"] }
|
tokio = { version = "1.0", features = ["rt"] }
|
||||||
url = "2.5"
|
url = "2.5"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
opt-level = "z"
|
opt-level = 3
|
||||||
lto = true
|
lto = true
|
||||||
codegen-units = 1
|
codegen-units = 1
|
||||||
|
strip = true
|
||||||
panic = "abort"
|
panic = "abort"
|
||||||
strip = true
|
|
||||||
@@ -33,7 +33,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo ""
|
echo ""
|
||||||
echo "Options:"
|
echo "Options:"
|
||||||
echo " -t, --target TARGET Build target (debug|release) [default: release]"
|
echo " -t, --target TARGET Build target (debug|release) [default: release]"
|
||||||
echo " -p, --platform PLATFORM Target platform (windows|linux|macos|current)"
|
echo " -p, --platform PLATFORM Target platform (windows|linux|macos|macos-intel|current)"
|
||||||
echo " -h, --help Show this help message"
|
echo " -h, --help Show this help message"
|
||||||
echo ""
|
echo ""
|
||||||
exit 0
|
exit 0
|
||||||
@@ -82,6 +82,10 @@ case $PLATFORM in
|
|||||||
LIB_NAME="libgurt_godot.so"
|
LIB_NAME="libgurt_godot.so"
|
||||||
;;
|
;;
|
||||||
macos)
|
macos)
|
||||||
|
RUST_TARGET="aarch64-apple-darwin"
|
||||||
|
LIB_NAME="libgurt_godot.dylib"
|
||||||
|
;;
|
||||||
|
macos-intel)
|
||||||
RUST_TARGET="x86_64-apple-darwin"
|
RUST_TARGET="x86_64-apple-darwin"
|
||||||
LIB_NAME="libgurt_godot.dylib"
|
LIB_NAME="libgurt_godot.dylib"
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -158,7 +158,14 @@ impl DomainCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
|
fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
|
||||||
let user_agent = &self.config.search.crawler_user_agent;
|
Self::parse_clanker_rules(
|
||||||
|
content,
|
||||||
|
base_url,
|
||||||
|
&self.config.search.crawler_user_agent,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_clanker_rules(content: &str, base_url: &str, user_agent: &str) -> Result<Vec<String>> {
|
||||||
let mut disallow_all = false;
|
let mut disallow_all = false;
|
||||||
let mut user_agent_matches = false;
|
let mut user_agent_matches = false;
|
||||||
let mut allowed_urls = Vec::new();
|
let mut allowed_urls = Vec::new();
|
||||||
@@ -169,26 +176,31 @@ impl DomainCrawler {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(user_agent_value) = line.to_lowercase().strip_prefix("user-agent:") {
|
let (directive, value) = match line.split_once(':') {
|
||||||
let current_user_agent = user_agent_value.trim().to_string();
|
Some((directive, value)) => (directive.trim().to_lowercase(), value.trim()),
|
||||||
user_agent_matches = current_user_agent == "*" || current_user_agent.eq_ignore_ascii_case(user_agent);
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
if directive == "user-agent" {
|
||||||
|
user_agent_matches =
|
||||||
|
value == "*" || value.eq_ignore_ascii_case(user_agent);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if user_agent_matches {
|
if !user_agent_matches {
|
||||||
if let Some(path_value) = line.to_lowercase().strip_prefix("disallow:") {
|
continue;
|
||||||
let path = path_value.trim();
|
}
|
||||||
if path == "/" {
|
|
||||||
disallow_all = true;
|
if directive == "disallow" {
|
||||||
break;
|
if value == "/" {
|
||||||
}
|
disallow_all = true;
|
||||||
} else if let Some(path_value) = line.to_lowercase().strip_prefix("allow:") {
|
break;
|
||||||
let path = path_value.trim();
|
}
|
||||||
if !path.is_empty() {
|
} else if directive == "allow" {
|
||||||
let full_url = Self::normalize_url(format!("{}{}", base_url, path));
|
if !value.is_empty() {
|
||||||
debug!("Added allowed URL from clanker.txt: {}", full_url);
|
let full_url = Self::normalize_url(format!("{}{}", base_url, value));
|
||||||
allowed_urls.push(full_url);
|
debug!("Added allowed URL from clanker.txt: {}", full_url);
|
||||||
}
|
allowed_urls.push(full_url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -719,4 +731,41 @@ impl CrawlStats {
|
|||||||
duration_seconds: 0,
|
duration_seconds: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::DomainCrawler;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_clanker_rules_preserves_case_in_allowed_urls() {
|
||||||
|
let content = "User-agent: TestBot\nAllow: /getpage?l=Fri,12Sep2025000605_ZzesV.txt\n";
|
||||||
|
let result = DomainCrawler::parse_clanker_rules(content, "gurt://wi.ki", "TestBot")
|
||||||
|
.expect("expected allow list");
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
result,
|
||||||
|
vec!["gurt://wi.ki/getpage?l=Fri,12Sep2025000605_ZzesV.txt".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_clanker_rules_handles_case_insensitive_directives() {
|
||||||
|
let content = "user-Agent: AnotherBot\nAlLoW: /MiXeD/Path.HTML\n";
|
||||||
|
let result = DomainCrawler::parse_clanker_rules(content, "gurt://example", "AnotherBot")
|
||||||
|
.expect("expected allow list");
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
result,
|
||||||
|
vec!["gurt://example/MiXeD/Path.HTML".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_clanker_rules_respects_disallow_all() {
|
||||||
|
let content = "User-agent: Bot\nDisallow: /\n";
|
||||||
|
let result = DomainCrawler::parse_clanker_rules(content, "gurt://example", "Bot");
|
||||||
|
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user